Skip to content
Snippets Groups Projects
Commit e702bcf9 authored by blunck's avatar blunck
Browse files

Feature are now represented by classes

parent df865491
No related branches found
No related tags found
No related merge requests found
import corpus
from feature import Feature
import nltk
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
def get_phrase(i, n, tokens_only, tags_only):
#fourgram: n=4
try:
pos_sent_phrase = tokens_only[i]
neg_situation_phrase = " ".join(tokens_only[(i+1):(i+n)])
try:
if tags_only[i-1] == 'R':
pos_sent_phrase = tokens_only[i-1] +" "+ pos_sent_phrase
except IndexError:
return (pos_sent_phrase, neg_situation_phrase)
class ContrastFeature(Feature):
"""
Class representing feature f6, based on Riloff et al. (2013)
extract-method returns a feature-vector of length 1 containing the number of
contrasts found in a review
"""
def get_feature_names(self):
    """Return the description of the single value this feature produces."""
    feature_names = ['riloff-contrast']
    return feature_names
def extract(self, corpus_instance):
    """
    Count verb/situation sentiment contrasts in a single corpus instance.

    The tokens in corpus_instance['TOKENS'] are POS-tagged with nltk and
    every tag is reduced to its first letter. Each window of up to four
    tags that matches one of the patterns below (a verb followed by a
    "situation" tag sequence) yields a candidate phrase. A candidate counts
    as a contrast if TextBlob assigns polarities of opposite sign to its
    verb part and its situation part.

    Returns a numpy array of length 1 holding the number of contrasts.
    """
    tokens = corpus_instance['TOKENS']
    tagged = nltk.pos_tag(tokens)
    # keep only the first letter of each tag
    tags_only = [tag[0] for (_, tag) in tagged]
    tokens_only = [token for (token, _) in tagged]

    # pos sentiment phrases
    verb_phrase_list = ["V"]

    # only situation pos-tag combos like the following should be matched
    uni_pos_list = ["V"]
    bi_pos_list = ["VV", "VR", "RV", "TV", "VN", "VP", "VJ"]
    tri_pos_list = ["VVV", "VVR", "VRV", "VRR", "RVV", "VNR", "VIN", "VTV", "VIP"]
    # matched only if the tag following the window is not N
    excl_N_tri_pos_list = ["VVN", "VNN", "VJN", "VDN", "RVN"]
    # matched only if the tag following the window is neither J nor N
    excl_JN_tri_pos_list = ["VRJ", "VVJ", "RVJ"]

    # generate possible pos-tag combinations (sets: membership tests only)
    phrase_patterns = {
        verb + situation
        for verb in verb_phrase_list
        for situation in uni_pos_list + bi_pos_list + tri_pos_list
    }
    excl_N_phrase_patterns = {
        verb + situation
        for verb in verb_phrase_list
        for situation in excl_N_tri_pos_list
    }
    excl_JN_phrase_patterns = {
        verb + situation
        for verb in verb_phrase_list
        for situation in excl_JN_tri_pos_list
    }

    # get all phrases matching the patterns
    # TODO: eliminate doubles
    candidates = []
    for i in range(len(tags_only)):
        fourgram = "".join(tags_only[i:i + 4])
        trigram = "".join(tags_only[i:i + 3])
        bigram = "".join(tags_only[i:i + 2])

        # Tag directly after the four-tag window; None when the window
        # reaches the end of the review (the phrase is accepted then).
        # This must be read from tags_only: comparing a token with a tag
        # letter would make the exclusion rules below ineffective.
        next_tag = tags_only[i + 4] if i + 4 < len(tags_only) else None

        if fourgram in phrase_patterns:
            phrase_length = 4
        elif fourgram in excl_N_phrase_patterns:
            phrase_length = 4 if next_tag != 'N' else None
        elif fourgram in excl_JN_phrase_patterns:
            phrase_length = 4 if next_tag not in ('N', 'J') else None
        elif trigram in phrase_patterns:
            phrase_length = 3
        elif bigram in phrase_patterns:
            phrase_length = 2
        else:
            phrase_length = None

        if phrase_length is not None:
            candidates.append(
                self.__get_phrase(i, phrase_length, tokens_only, tags_only))

    # determine sentiment of extracted phrases
    contrasts = 0
    for phrase in candidates:
        if phrase is None:
            # defensive: the phrase helper appears to return None when it
            # runs into an IndexError
            continue
        verb, situation = phrase
        sent_verb = TextBlob(verb).sentiment.polarity
        sent_situation = TextBlob(situation).sentiment.polarity
        # if verb and situation are in contrast to one another:
        # increase feature value by one
        if (sent_verb > 0.0 and sent_situation < 0.0) or \
                (sent_verb < 0.0 and sent_situation > 0.0):
            contrasts += 1

    return np.array([contrasts])
return (pos_sent_phrase, neg_situation_phrase)
except IndexError:
pass
def extract(corpus_instance):
tokens = corpus_instance['TOKENS']
tagged = nltk.pos_tag(tokens)
tags_only = [y[0] for (x,y) in tagged]
tokens_only = [x for (x,y) in tagged]
# pos sentiment phrases
verb_phrase_list = ["V"]
# only situation pos-tag combos like the following should be matched
uni_pos_list = ["V"]
bi_pos_list = ["VV", "VR", "RV", "TV", "VN", "VN", "VN", "VP", "VJ"]
tri_pos_list = ["VVV", "VVR", "VRV", "VVR", "VRR", "RVV", "VNR", "VIN", "VTV", "VIP"]
excl_N_tri_pos_list = ["VVN", "VNN", "VJN", "VDN", "RVN"] # -JN = next tag is not J/N
excl_JN_tri_pos_list = ["VRJ", "VVJ", "VRJ", "RVJ"]
# generate possible pos-tag comintations
phrase_patterns = []
excl_N_phrase_patterns = []
excl_JN_phrase_patterns = []
for a in verb_phrase_list:
for b in uni_pos_list:
phrase_patterns.append(a+b)
for c in bi_pos_list:
phrase_patterns.append(a+c)
for d in tri_pos_list:
phrase_patterns.append(a+d)
for e in excl_N_tri_pos_list:
excl_N_phrase_patterns.append(a+e)
for f in excl_JN_tri_pos_list:
excl_JN_phrase_patterns.append(a+f)
contrasts = 0
candidates = []
# get all phrases matching the patterns
#TODO: elim doubles
for i in range(len(tags_only)):
fourgram = "".join(tags_only[i:(i+4)])
trigram = "".join(tags_only[i:(i+3)])
bigram = "".join(tags_only[i:(i+2)])
if fourgram in phrase_patterns:
candidates.append(get_phrase(i, 4, tokens_only, tags_only))
elif fourgram in excl_N_phrase_patterns:
try:
if tokens_only[i+4] != 'N':
candidates.append(get_phrase(i, 4, tokens_only, tags_only))
except IndexError:
candidates.append(get_phrase(i, 4, tokens_only, tags_only))
def __get_phrase(self, i, n, tokens_only, tags_only):
# builds phrase corresponding to the matched POS-tag-combo
try:
pos_sent_phrase = tokens_only[i]
neg_situation_phrase = " ".join(tokens_only[(i+1):(i+n)])
elif fourgram in excl_JN_phrase_patterns:
try:
if tokens_only[i+4] != 'N' and tokens_only[i+4] != 'J':
candidates.append(get_phrase(i, 4, tokens_only, tags_only))
if tags_only[i-1] == 'R':
pos_sent_phrase = tokens_only[i-1] +" "+ pos_sent_phrase
except IndexError:
candidates.append(get_phrase(i, 4, tokens_only, tags_only))
elif trigram in phrase_patterns:
candidates.append(get_phrase(i, 3, tokens_only, tags_only))
elif bigram in phrase_patterns:
candidates.append(get_phrase(i, 2, tokens_only, tags_only))
return (pos_sent_phrase, neg_situation_phrase)
# determine sentiment of extracted phrased
if candidates != []:
for phrase in candidates:
verb = phrase[0]
situation = phrase[1]
analyser = SentimentIntensityAnalyzer()
sent_verb = analyser.polarity_scores(verb)['compound']
sent_situation = analyser.polarity_scores(situation)['compound']
if (sent_verb > 0.0 and sent_situation < 0.0) or (sent_verb < 0.0 and sent_situation > 0.0):
#print("phrase: {} {} sent verb: {} sent situation: {}".format(verb, situation, sent_verb, sent_situation))
contrasts += 1
return np.array([contrasts])
# if __name__ == '__main__':
# corpus = corpus.read_corpus("corpus_shuffled.csv")
# for instance in corpus:
# extract(instance)
return (pos_sent_phrase, neg_situation_phrase)
except IndexError:
pass
class Feature:
    """Class representing an abstract feature

    extract():
        - needs to be overwritten by subclasses
        - should take a corpus instance (dict) as an arg
        - should return np.array containing feature values

    get_feature_names():
        - needs to be overwritten by subclasses
        - should return a list of feature descriptions
          corresponding to feature vector
    """

    def extract(self, corpus_instance):
        # The signature mirrors the one subclasses implement, so calling a
        # non-overridden method on an instance raises NotImplementedError
        # rather than a TypeError about unexpected arguments.
        raise NotImplementedError

    def get_feature_names(self):
        raise NotImplementedError
import sent_rating_feature
import ngram_feature
import pos_feature
import punctuation_feature
import contrast_feature
import surface_patterns
import stars_feature
import numpy as np
import config
def extract_features(train_set, test_set):
    """
    Extracts feature vectors of given train/test set.
    Extraction based on selected features in config file.

    Returns lists of feature vectors and a list of feature objects
    for further use.
    """
    f_selection_map = {'f1': ngram_feature.NgramFeature(),
                       'f2': pos_feature.PosFeature(),
                       'f3': surface_patterns.SurfacePatternFeature(),
                       'f4': sent_rating_feature.SentRatingFeature(),
                       'f5': punctuation_feature.PunctuationFeature(),
                       'f6': contrast_feature.ContrastFeature(),
                       'f7': stars_feature.StarsFeature()
                       }

    # get all feature objects of features selected in config
    features = [f_selection_map[feat] for feat in config.feature_selection]

    # load vocabulary if needed for feature
    for feature in features:
        # Look the method up explicitly instead of catching AttributeError
        # around the call: an AttributeError raised *inside*
        # load_vocabulary must not be mistaken for "no vocabulary needed".
        load_vocabulary = getattr(feature, 'load_vocabulary', None)
        if load_vocabulary is not None:
            load_vocabulary(train_set)

    train_inputs = [create_input_vector(features, instance) for instance in train_set]
    test_inputs = [create_input_vector(features, instance) for instance in test_set]

    # print stats (an empty train set has no first sample to measure)
    features_per_sample = len(train_inputs[0]) if train_inputs else 0
    print("\nTotal features per train sample:\t{}".format(features_per_sample))
    print("Number of train samples:\t\t{}".format(len(train_inputs)))

    return train_inputs, test_inputs, features
def create_input_vector(features, corpus_instance):
    """
    Build the input vector of one corpus instance by running every
    feature's extract-method and joining the results in feature order.
    """
    partial_vectors = [feature.extract(corpus_instance) for feature in features]
    combined = partial_vectors[0]
    for partial in partial_vectors[1:]:
        combined = np.append(combined, partial)
    return combined
\ No newline at end of file
import sent_rating_feature
import ngram_feature
import pos_feature
import punctuation_feature
import contrast_feature
import numpy as np
import config
def create_vector(corpus_instance, uni_gram_vocab=None, pos_vocabulary=None, surface_vocabulary=None, lemma_vocab=None):
    """
    Runs the feature extraction functions selected in the config and
    joins their arrays into a single input vector (for a single corpus
    instance).

    Example for corpus instance: OrderedDict([('LABEL', '0'), ('STARS', '5.0'), etc.
    """
    # Each feature key maps to (function, [arguments]); a function is only
    # called once its key turns up in the configured selection.
    f_map = {'f1' : (ngram_feature.extract, [corpus_instance, 'REVIEW', uni_gram_vocab]),
             'f2' : (pos_feature.extract, [corpus_instance, pos_vocabulary]),
             'f3' : (ngram_feature.extract, [corpus_instance, 'SURFACE_PATTERNS', surface_vocabulary]),
             'f4' : (sent_rating_feature.extract, [corpus_instance]),
             'f5' : (punctuation_feature.extract, [corpus_instance]),
             'f6' : (contrast_feature.extract, [corpus_instance]),
             'f7' : (extract_star_rating, [corpus_instance]),
             'f8' : (ngram_feature.extract, [corpus_instance, 'LEMMAS', lemma_vocab])
             }

    selection = config.feature_selection

    # the first selected feature seeds the vector ...
    first_fn, first_args = f_map[selection[0]]
    vector = first_fn(*first_args)

    # ... and every further selected feature is appended in order
    for feature_key in selection[1:]:
        next_fn, next_args = f_map[feature_key]
        vector = np.append(vector, next_fn(*next_args))

    return vector
def extract_features(train_set, test_set):
    """
    Builds the vocabularies required by the selected features from the
    train set and turns both sets into lists of input vectors.

    Returns (train_inputs, test_inputs).
    """
    selection = config.feature_selection

    print("--------Feature Extraction-------")

    # vocabularies (None for features that are not selected)
    n_gram_vocab = (ngram_feature.get_vocabulary(train_set, 'REVIEW', config.n_range_words)
                    if 'f1' in selection else None)
    pos_bigram_vocab = (pos_feature.get_pos_vocabulary(train_set)
                        if 'f2' in selection else None)
    sp_n_gram_vocab = (ngram_feature.get_vocabulary(train_set, 'SURFACE_PATTERNS', config.n_range_surface_patterns)
                       if 'f3' in selection else None)
    lemma_n_gram_vocab = (ngram_feature.get_vocabulary(train_set, 'LEMMAS', config.n_range_lemmas)
                          if 'f8' in selection else None)

    def vectorize(instances):
        # one input vector per corpus instance, sharing the vocabularies above
        return [create_vector(el, n_gram_vocab, pos_bigram_vocab, sp_n_gram_vocab, lemma_n_gram_vocab)
                for el in instances]

    # inputs:
    train_inputs = vectorize(train_set)
    test_inputs = vectorize(test_set)

    # print stats
    print("Total features per train sample: {}".format(len(train_inputs[0])))
    print("Number of train samples: {}".format(len(train_inputs)))

    return train_inputs, test_inputs
def extract_star_rating(corpus_instance):
    """Return the instance's 'STARS' entry as a one-element float array."""
    stars = float(corpus_instance['STARS'])
    return np.array([stars])
from feature import Feature
from sklearn.feature_extraction.text import CountVectorizer
import config
def extract(corpus_instance, corpus_dict_key, vocabulary):
"""
Extracts n-gram features from a single corpus instance.
n depends on vocabulary, which needs to be extracted using get_vocabulary.
Returns numpy array of size of vocabulary
class NgramFeature(Feature):
"""
n = len(list(vocabulary.keys())[0].split())
vectorizer = CountVectorizer(vocabulary=vocabulary, ngram_range=(n, n))
Class representing feature f1
vector = None
extract-method returns a feature-vector of length of its vocabulary
containing n-gram counts
"""
if corpus_dict_key == 'LEMMAS':
lemma_str = " ".join(corpus_instance['LEMMAS'])
vector = vectorizer.transform([lemma_str])
else:
vector = vectorizer.transform([corpus_instance[corpus_dict_key]]) # takes a list
name = "Bag-of-ngram"
corpus_key = 'REVIEW'
n_range = config.n_range_words
vocabulary = None
vectorizer = None
return vector.toarray()[0]
# def __init__(self, lemmatize=False):
# #TODO if lemmatize == True
def get_vocabulary(corpus, corpus_dict_key, n_range):
"""
Creates vocabulary based on given corpus.
"""
all_reviews = []
for line in corpus:
def extract(self, corpus_instance):
    """
    Extracts n-gram features from a single corpus instance.

    Returns the first (and only) row of the vectorizer's output as an
    array of the size of the vocabulary.
    """
    text = corpus_instance[self.corpus_key]
    # transform takes a list of documents, hence the one-element list
    counts = self.vectorizer.transform([text])
    return counts.toarray()[0]
def load_vocabulary(self, corpus):
"""
Creates vocabulary based on given corpus (Only train-data!).
"""
all_reviews = []
for line in corpus:
all_reviews.append(line[self.corpus_key])
if corpus_dict_key == 'LEMMAS':
lemma_str = " ".join(line['LEMMAS'])
all_reviews.append(lemma_str)
vectorizer = CountVectorizer(ngram_range=self.n_range)
vectorizer.fit(all_reviews)
else:
all_reviews.append(line[corpus_dict_key])
self.vectorizer = vectorizer
self.vocabulary = vectorizer.vocabulary_
vectorizer = CountVectorizer(ngram_range=n_range)
vectorizer.fit(all_reviews)
if config.print_stats == True:
print("{} Vocab size (n={},{}):\t{}".format(self.name ,self.n_range[0], self.n_range[1], len(self.vocabulary)))
# print stats
if corpus_dict_key == 'SURFACE_PATTERNS':
print("SP {}-gram vocab size: {}".format(n_range[0],len(vectorizer.vocabulary_)))
elif corpus_dict_key == 'REVIEW':
print("BOW {}-gram vocab size: {}".format(n_range[0],len(vectorizer.vocabulary_)))
elif corpus_dict_key == 'LEMMAS':
print("Lemma {}-gram vocab size: {}".format(n_range[0],len(vectorizer.vocabulary_)))
return vectorizer.vocabulary_
def get_feature_names(self):
    '''
    Turn the vocabulary dict into a list of its keys, ordered by the
    index stored as each key's value
    '''
    ordered_entries = sorted(self.vocabulary.items(), key=lambda entry: entry[1])
    return [term for term, _ in ordered_entries]
from feature import Feature
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
def extract(corpus_instance):
"""
Extracts single "contrast" feature from a single corpus instance.
Returns numpy array of size 1.
class SentRatingFeature(Feature):
"""
review = corpus_instance["REVIEW"]
stars = float(corpus_instance["STARS"])
#sent = get_sent_vader(review)
sent = get_sent_textblob(review)
if (sent <= 0.0 and stars > 3.0) or (sent > 0.0 and stars < 3.0):
return np.array([1])
else:
return np.array([0])
Class representing feature f4
extract-method returns a feature-vector with one value indicating
if there is a contrast between the star rating and the sentiment
of the review, or not
def get_sent_vader(string):
    """Return the 'compound' entry of VADER's polarity scores for string."""
    scores = SentimentIntensityAnalyzer().polarity_scores(string)
    return scores['compound']
"""
def extract(self, corpus_instance):
    """
    Extracts single "contrast" feature from a single corpus instance.

    The value is 1 if the review's sentiment and its star rating
    disagree (non-positive sentiment with more than 3 stars, or positive
    sentiment with fewer than 3 stars), otherwise 0.
    Returns numpy array of size 1.
    """
    review = corpus_instance["REVIEW"]
    stars = float(corpus_instance["STARS"])

    # sentiment comes from TextBlob (a VADER variant is kept commented out
    # elsewhere in this file)
    polarity = self.__get_sent_textblob(review)

    non_positive_but_high_rating = polarity <= 0.0 and stars > 3.0
    positive_but_low_rating = polarity > 0.0 and stars < 3.0

    if non_positive_but_high_rating or positive_but_low_rating:
        return np.array([1])
    return np.array([0])
def get_sent_textblob(string):
    """Return the polarity TextBlob reports for the sentiment of string."""
    return TextBlob(string).sentiment.polarity
# def __get_sent_vader(self, string):
# analyser = SentimentIntensityAnalyzer()
# sent = analyser.polarity_scores(string)
# return sent['compound']
def confusion_matrix(true_labels, predicted_labels):
matrix = np.zeros(shape=(2, 2))
def __get_sent_textblob(self, string):
    """Return the polarity TextBlob reports for the sentiment of string."""
    return TextBlob(string).sentiment.polarity
for true, pred in zip(true_labels, predicted_labels):
matrix[true][pred] += 1
return matrix
def get_feature_names(self):
    """Return the description of the single value this feature produces."""
    feature_names = ['sent/rating-contrast']
    return feature_names
\ No newline at end of file
from feature import Feature
import numpy as np
class StarsFeature(Feature):
    """
    Class representing feature f7

    extract-method returns a feature-vector with one value
    holding the number of stars of a review
    """

    def extract(self, corpus_instance):
        """Return the instance's 'STARS' entry as a one-element float array."""
        stars = float(corpus_instance['STARS'])
        return np.array([stars])

    def get_feature_names(self):
        """Return the description of the single value in the feature vector."""
        feature_names = ['number_of_stars']
        return feature_names
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment