diff --git a/contrast_feature.py b/contrast_feature.py
index 64d8369ee8fa96fe0a244c1d840faa32f690d353..86c9947895529c54a7a748b1c30438419bd99d2c 100644
--- a/contrast_feature.py
+++ b/contrast_feature.py
@@ -1,114 +1,121 @@
-import corpus
+from feature import Feature
 import nltk
 import numpy as np
-from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
+from textblob import TextBlob

-def get_phrase(i, n, tokens_only, tags_only):
-    #fourgram: n=4
-    try:
-        pos_sent_phrase = tokens_only[i]
-        neg_situation_phrase = " ".join(tokens_only[(i+1):(i+n)])
-        try:
-            if tags_only[i-1] == 'R':
-                pos_sent_phrase = tokens_only[i-1] +" "+ pos_sent_phrase
-        except IndexError:
-            return (pos_sent_phrase, neg_situation_phrase)
-
-        return (pos_sent_phrase, neg_situation_phrase)
-
-    except IndexError:
-        pass
-
-def extract(corpus_instance):
-    tokens = corpus_instance['TOKENS']
-    tagged = nltk.pos_tag(tokens)
-
-    tags_only = [y[0] for (x,y) in tagged]
-    tokens_only = [x for (x,y) in tagged]
-
-    # pos sentiment phrases
-    verb_phrase_list = ["V"]
-
-    # only situation pos-tag combos like the following should be matched
-    uni_pos_list = ["V"]
-    bi_pos_list = ["VV", "VR", "RV", "TV", "VN", "VN", "VN", "VP", "VJ"]
-    tri_pos_list = ["VVV", "VVR", "VRV", "VVR", "VRR", "RVV", "VNR", "VIN", "VTV", "VIP"]
-    excl_N_tri_pos_list = ["VVN", "VNN", "VJN", "VDN", "RVN"] # -JN = next tag is not J/N
-    excl_JN_tri_pos_list = ["VRJ", "VVJ", "VRJ", "RVJ"]
-
-    # generate possible pos-tag comintations
-    phrase_patterns = []
-    excl_N_phrase_patterns = []
-    excl_JN_phrase_patterns = []
-
-    for a in verb_phrase_list:
-        for b in uni_pos_list:
-            phrase_patterns.append(a+b)
-        for c in bi_pos_list:
-            phrase_patterns.append(a+c)
-        for d in tri_pos_list:
-            phrase_patterns.append(a+d)
-        for e in excl_N_tri_pos_list:
-            excl_N_phrase_patterns.append(a+e)
-        for f in excl_JN_tri_pos_list:
-            excl_JN_phrase_patterns.append(a+f)
-
-    contrasts = 0
-    candidates = []
-
-    # get all phrases matching the patterns
-    #TODO: elim doubles
-    for i in range(len(tags_only)):
-
-        fourgram = "".join(tags_only[i:(i+4)])
-        trigram = "".join(tags_only[i:(i+3)])
-        bigram = "".join(tags_only[i:(i+2)])
-
-        if fourgram in phrase_patterns:
-            candidates.append(get_phrase(i, 4, tokens_only, tags_only))
-
-        elif fourgram in excl_N_phrase_patterns:
-            try:
-                if tokens_only[i+4] != 'N':
-                    candidates.append(get_phrase(i, 4, tokens_only, tags_only))
-            except IndexError:
-                candidates.append(get_phrase(i, 4, tokens_only, tags_only))
-
-        elif fourgram in excl_JN_phrase_patterns:
-            try:
-                if tokens_only[i+4] != 'N' and tokens_only[i+4] != 'J':
-                    candidates.append(get_phrase(i, 4, tokens_only, tags_only))
-            except IndexError:
-                candidates.append(get_phrase(i, 4, tokens_only, tags_only))
-
-        elif trigram in phrase_patterns:
-            candidates.append(get_phrase(i, 3, tokens_only, tags_only))
-
-        elif bigram in phrase_patterns:
-            candidates.append(get_phrase(i, 2, tokens_only, tags_only))
-
-    # determine sentiment of extracted phrased
-    if candidates != []:
-        for phrase in candidates:
-            verb = phrase[0]
-            situation = phrase[1]
-
-            analyser = SentimentIntensityAnalyzer()
-            sent_verb = analyser.polarity_scores(verb)['compound']
-            sent_situation = analyser.polarity_scores(situation)['compound']
-
-            if (sent_verb > 0.0 and sent_situation < 0.0) or (sent_verb < 0.0 and sent_situation > 0.0):
-                #print("phrase: {} {} sent verb: {} sent situation: {}".format(verb, situation, sent_verb, sent_situation))
-                contrasts += 1
-
-    return np.array([contrasts])
-
-
-# if __name__ == '__main__':
-#     corpus = corpus.read_corpus("corpus_shuffled.csv")
-
-#     for instance in corpus:
-#         extract(instance)
+class ContrastFeature(Feature):
+    """
+    Class representing feature f6, based on Riloff et al. (2013)
+
+    extract-method returns a feature vector of length 1 containing the number
+    of contrasts found in a review
+    """
+
+    def get_feature_names(self):
+        return ['riloff-contrast']
+
+
+    def extract(self, corpus_instance):
+        tokens = corpus_instance['TOKENS']
+        tagged = nltk.pos_tag(tokens)
+
+        # reduce each tag to its first letter, e.g. 'VBD' -> 'V'
+        tags_only = [tag[0] for (token, tag) in tagged]
+        tokens_only = [token for (token, tag) in tagged]
+
+        # pos sentiment phrases
+        verb_phrase_list = ["V"]
+
+        # only situation pos-tag combos like the following should be matched
+        uni_pos_list = ["V"]
+        bi_pos_list = ["VV", "VR", "RV", "TV", "VN", "VP", "VJ"]
+        tri_pos_list = ["VVV", "VVR", "VRV", "VRR", "RVV", "VNR", "VIN", "VTV", "VIP"]
+        excl_N_tri_pos_list = ["VVN", "VNN", "VJN", "VDN", "RVN"]  # -N = next tag is not N
+        excl_JN_tri_pos_list = ["VRJ", "VVJ", "RVJ"]               # -JN = next tag is not J/N
+
+        # generate possible pos-tag combinations
+        phrase_patterns = []
+        excl_N_phrase_patterns = []
+        excl_JN_phrase_patterns = []
+
+        for a in verb_phrase_list:
+            for b in uni_pos_list:
+                phrase_patterns.append(a+b)
+            for c in bi_pos_list:
+                phrase_patterns.append(a+c)
+            for d in tri_pos_list:
+                phrase_patterns.append(a+d)
+            for e in excl_N_tri_pos_list:
+                excl_N_phrase_patterns.append(a+e)
+            for f in excl_JN_tri_pos_list:
+                excl_JN_phrase_patterns.append(a+f)
+
+        contrasts = 0
+        candidates = []
+
+        # get all phrases matching the patterns
+        # TODO: eliminate duplicate candidates
+        for i in range(len(tags_only)):
+
+            fourgram = "".join(tags_only[i:(i+4)])
+            trigram = "".join(tags_only[i:(i+3)])
+            bigram = "".join(tags_only[i:(i+2)])
+
+            if fourgram in phrase_patterns:
+                candidates.append(self.__get_phrase(i, 4, tokens_only, tags_only))
+
+            elif fourgram in excl_N_phrase_patterns:
+                try:
+                    if tags_only[i+4] != 'N':
+                        candidates.append(self.__get_phrase(i, 4, tokens_only, tags_only))
+                except IndexError:
+                    candidates.append(self.__get_phrase(i, 4, tokens_only, tags_only))
+
+            elif fourgram in excl_JN_phrase_patterns:
+                try:
+                    if tags_only[i+4] != 'N' and tags_only[i+4] != 'J':
+                        candidates.append(self.__get_phrase(i, 4, tokens_only, tags_only))
+                except IndexError:
+                    candidates.append(self.__get_phrase(i, 4, tokens_only, tags_only))
+
+            elif trigram in phrase_patterns:
+                candidates.append(self.__get_phrase(i, 3, tokens_only, tags_only))
+
+            elif bigram in phrase_patterns:
+                candidates.append(self.__get_phrase(i, 2, tokens_only, tags_only))
+
+        # determine sentiment of extracted phrases
+        for phrase in candidates:
+            verb = phrase[0]
+            situation = phrase[1]
+
+            sent_verb = TextBlob(verb).sentiment.polarity
+            sent_situation = TextBlob(situation).sentiment.polarity
+
+            # if verb and situation contrast with one another: increase feature value by one
+            if (sent_verb > 0.0 and sent_situation < 0.0) or (sent_verb < 0.0 and sent_situation > 0.0):
+                contrasts += 1
+
+        return np.array([contrasts])
+
+
+    def __get_phrase(self, i, n, tokens_only, tags_only):
+        # builds the phrase pair corresponding to a matched POS-tag combo
+        pos_sent_phrase = tokens_only[i]
+        neg_situation_phrase = " ".join(tokens_only[(i+1):(i+n)])
+
+        # prepend a preceding adverb to the sentiment phrase, if present
+        # (guard on i > 0 so a negative index cannot silently read the last tag)
+        if i > 0 and tags_only[i-1] == 'R':
+            pos_sent_phrase = tokens_only[i-1] + " " + pos_sent_phrase
+
+        return (pos_sent_phrase, neg_situation_phrase)
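Reviewer note: a minimal, hypothetical sketch of how the refactored class is meant to be driven — not part of the patch. It assumes NLTK's tokenizer and tagger models are installed and that corpus instances carry their tokens under 'TOKENS', as the code above expects:

```python
import nltk
from contrast_feature import ContrastFeature

# hypothetical instance; real ones come from the project's corpus reader
instance = {'TOKENS': nltk.word_tokenize("I just love being ignored by the waiter")}

feature = ContrastFeature()
print(feature.get_feature_names())  # ['riloff-contrast']
print(feature.extract(instance))    # e.g. array([1]) if a pos/neg contrast is found
```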
diff --git a/feature.py b/feature.py
new file mode 100644
index 0000000000000000000000000000000000000000..6acd2d246ddbdb217aa5eb72668e8a9d3ac89d6a
--- /dev/null
+++ b/feature.py
@@ -0,0 +1,19 @@
+class Feature:
+    """Class representing an abstract feature
+
+    extract():
+    - needs to be overwritten by subclasses
+    - should take a corpus instance (dict) as an arg
+    - should return np.array containing feature values
+
+    get_feature_names():
+    - needs to be overwritten by subclasses
+    - should return a list of feature descriptions
+      corresponding to the feature vector
+    """
+
+    def extract(self, corpus_instance):
+        raise NotImplementedError
+
+    def get_feature_names(self):
+        raise NotImplementedError
diff --git a/feature_extraction.py b/feature_extraction.py
new file mode 100644
index 0000000000000000000000000000000000000000..c495dff1a4541e2d7ecbe54246d663b4b0853065
--- /dev/null
+++ b/feature_extraction.py
@@ -0,0 +1,61 @@
+import sent_rating_feature
+import ngram_feature
+import pos_feature
+import punctuation_feature
+import contrast_feature
+import surface_patterns
+import stars_feature
+import numpy as np
+import config
+
+
+def extract_features(train_set, test_set):
+    """
+    Extracts the feature vectors for a given train/test set.
+    Extraction is based on the features selected in the config file.
+    Returns lists of feature vectors and a list of feature objects
+    for further use.
+    """
+    f_selection_map = {'f1' : ngram_feature.NgramFeature(),
+                       'f2' : pos_feature.PosFeature(),
+                       'f3' : surface_patterns.SurfacePatternFeature(),
+                       'f4' : sent_rating_feature.SentRatingFeature(),
+                       'f5' : punctuation_feature.PunctuationFeature(),
+                       'f6' : contrast_feature.ContrastFeature(),
+                       'f7' : stars_feature.StarsFeature()
+                       }
+
+    # get all feature objects of the features selected in the config
+    features = [f_selection_map[feat] for feat in config.feature_selection]
+
+    # load the vocabulary for features that need one
+    for feature in features:
+        try:
+            feature.load_vocabulary(train_set)
+        except AttributeError:
+            continue
+
+    train_inputs = [create_input_vector(features, instance) for instance in train_set]
+    test_inputs = [create_input_vector(features, instance) for instance in test_set]
+
+    # print stats
+    print("\nTotal features per train sample:\t{}".format(len(train_inputs[0])))
+    print("Number of train samples:\t\t{}".format(len(train_inputs)))
+
+    return train_inputs, test_inputs, features
+
+
+def create_input_vector(features, corpus_instance):
+    """
+    Creates the feature vector for a single corpus instance by
+    concatenating the vectors of all selected features.
+    """
+    vector = features[0].extract(corpus_instance)
+
+    for i in range(1, len(features)):
+        current_vec = features[i].extract(corpus_instance)
+        vector = np.append(vector, current_vec)
+
+    return vector
\ No newline at end of file
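Reviewer note: a sketch of the intended call site, with hypothetical data. It assumes config.feature_selection lists only features that need no vocabulary (e.g. ['f6', 'f7']); runnability depends on the repo's config and feature modules being importable:

```python
import feature_extraction

# hypothetical mini-corpus; real instances come from the project's corpus reader
train_set = [{'TOKENS': ['Great', 'food'], 'REVIEW': "Great food", 'STARS': '5.0'}]
test_set = [{'TOKENS': ['Awful'], 'REVIEW': "Awful", 'STARS': '1.0'}]

train_inputs, test_inputs, features = feature_extraction.extract_features(train_set, test_set)

# feature names line up index-by-index with each input vector
names = [n for f in features for n in f.get_feature_names()]
assert len(names) == len(train_inputs[0])
```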
- """ - - # functions and their seperate arguments are stored in dict and only called when needed - # key : (func, [args]) - f_map = {'f1' : (ngram_feature.extract, [corpus_instance, 'REVIEW', uni_gram_vocab]), - 'f2' : (pos_feature.extract, [corpus_instance, pos_vocabulary]), - 'f3' : (ngram_feature.extract, [corpus_instance, 'SURFACE_PATTERNS', surface_vocabulary]), - 'f4' : (sent_rating_feature.extract, [corpus_instance]), - 'f5' : (punctuation_feature.extract, [corpus_instance]), - 'f6' : (contrast_feature.extract, [corpus_instance]), - 'f7' : (extract_star_rating, [corpus_instance]), - 'f8' : (ngram_feature.extract, [corpus_instance, 'LEMMAS', lemma_vocab]) - } - - fn, args = f_map[config.feature_selection[0]] - vector = fn(*args) - - if len(config.feature_selection) > 1: - - for i in range(1, len(config.feature_selection)): - fn, args = f_map[config.feature_selection[i]] - vector = np.append(vector, fn(*args)) - - return vector - - -def extract_features(train_set, test_set): - - # vocabularies - n_gram_vocab = None - pos_bigram_vocab = None - sp_n_gram_vocab = None - lemma_n_gram_vocab = None - - print("--------Feature Extraction-------") - - if 'f1' in config.feature_selection: - n_gram_vocab = ngram_feature.get_vocabulary(train_set, 'REVIEW', config.n_range_words) - if 'f2' in config.feature_selection: - pos_bigram_vocab = pos_feature.get_pos_vocabulary(train_set) - if 'f3' in config.feature_selection: - sp_n_gram_vocab = ngram_feature.get_vocabulary(train_set, 'SURFACE_PATTERNS', config.n_range_surface_patterns) - if 'f8' in config.feature_selection: - lemma_n_gram_vocab = ngram_feature.get_vocabulary(train_set, 'LEMMAS', config.n_range_lemmas) - - # inputs: - train_inputs = [create_vector(el, n_gram_vocab, pos_bigram_vocab, sp_n_gram_vocab, lemma_n_gram_vocab) #, bi_gram_vocab, tri_gram_vocab - for el in train_set] # 1000 vectors - test_inputs = [create_vector(el, n_gram_vocab, pos_bigram_vocab, sp_n_gram_vocab, lemma_n_gram_vocab) #, bi_gram_vocab, tri_gram_vocab - for el in test_set] # 254 vectors - - # print stats - print("Total features per train sample: {}".format(len(train_inputs[0]))) - print("Number of train samples: {}".format(len(train_inputs))) - - return train_inputs, test_inputs - -def extract_star_rating(corpus_instance): - return np.array([float(corpus_instance['STARS'])]) - - - - diff --git a/ngram_feature.py b/ngram_feature.py index 3808aa3800392e55d810ded99da74d92b52a7dee..799cd88a9dd6dde4111458f235b9204987e86f5c 100644 --- a/ngram_feature.py +++ b/ngram_feature.py @@ -1,50 +1,57 @@ +from feature import Feature from sklearn.feature_extraction.text import CountVectorizer +import config - -def extract(corpus_instance, corpus_dict_key, vocabulary): - """ - Extracts n-gram features from a single corpus instance. - n depends on vocabulary, which needs to be extracted using get_vocabulary. 
diff --git a/ngram_feature.py b/ngram_feature.py
index 3808aa3800392e55d810ded99da74d92b52a7dee..799cd88a9dd6dde4111458f235b9204987e86f5c 100644
--- a/ngram_feature.py
+++ b/ngram_feature.py
@@ -1,50 +1,57 @@
+from feature import Feature
 from sklearn.feature_extraction.text import CountVectorizer
+import config

-def extract(corpus_instance, corpus_dict_key, vocabulary):
-    """
-    Extracts n-gram features from a single corpus instance.
-    n depends on vocabulary, which needs to be extracted using get_vocabulary.
-    Returns numpy array of size of vocabulary
-    """
-    n = len(list(vocabulary.keys())[0].split())
-    vectorizer = CountVectorizer(vocabulary=vocabulary, ngram_range=(n, n))
-
-    vector = None
-
-    if corpus_dict_key == 'LEMMAS':
-        lemma_str = " ".join(corpus_instance['LEMMAS'])
-        vector = vectorizer.transform([lemma_str])
-    else:
-        vector = vectorizer.transform([corpus_instance[corpus_dict_key]]) # takes a list
-
-    return vector.toarray()[0]
-
-def get_vocabulary(corpus, corpus_dict_key, n_range):
-    """
-    Creates vocabulary based on given corpus.
-    """
-    all_reviews = []
-    for line in corpus:
-        if corpus_dict_key == 'LEMMAS':
-            lemma_str = " ".join(line['LEMMAS'])
-            all_reviews.append(lemma_str)
-        else:
-            all_reviews.append(line[corpus_dict_key])
-
-    vectorizer = CountVectorizer(ngram_range=n_range)
-    vectorizer.fit(all_reviews)
-
-    # print stats
-    if corpus_dict_key == 'SURFACE_PATTERNS':
-        print("SP {}-gram vocab size: {}".format(n_range[0],len(vectorizer.vocabulary_)))
-    elif corpus_dict_key == 'REVIEW':
-        print("BOW {}-gram vocab size: {}".format(n_range[0],len(vectorizer.vocabulary_)))
-    elif corpus_dict_key == 'LEMMAS':
-        print("Lemma {}-gram vocab size: {}".format(n_range[0],len(vectorizer.vocabulary_)))
-
-    return vectorizer.vocabulary_
+class NgramFeature(Feature):
+    """
+    Class representing feature f1
+
+    extract-method returns a feature vector of the length of its vocabulary,
+    containing n-gram counts
+    """
+
+    name = "Bag-of-ngram"
+    corpus_key = 'REVIEW'
+    n_range = config.n_range_words
+    vocabulary = None
+    vectorizer = None
+
+    # def __init__(self, lemmatize=False):
+    #     # TODO: if lemmatize == True ...
+
+    def extract(self, corpus_instance):
+        """
+        Extracts n-gram features from a single corpus instance.
+        Returns numpy array of the size of the vocabulary.
+        """
+        vector = self.vectorizer.transform([corpus_instance[self.corpus_key]])  # takes a list
+        return vector.toarray()[0]
+
+
+    def load_vocabulary(self, corpus):
+        """
+        Creates the vocabulary based on the given corpus (train data only!).
+        """
+        all_reviews = []
+
+        for line in corpus:
+            all_reviews.append(line[self.corpus_key])
+
+        vectorizer = CountVectorizer(ngram_range=self.n_range)
+        vectorizer.fit(all_reviews)
+
+        self.vectorizer = vectorizer
+        self.vocabulary = vectorizer.vocabulary_
+
+        if config.print_stats:
+            print("{} vocab size (n={},{}):\t{}".format(self.name, self.n_range[0], self.n_range[1], len(self.vocabulary)))
+
+
+    def get_feature_names(self):
+        """
+        Turns the vocabulary dict into a list whose positions match the
+        index values stored in the dict.
+        """
+        return sorted(self.vocabulary, key=self.vocabulary.get)
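Reviewer note: a small hypothetical check of the fit/transform split introduced here, assuming config.n_range_words is a (min_n, max_n) tuple such as (1, 1) and instances carry raw text under 'REVIEW':

```python
from ngram_feature import NgramFeature

# hypothetical train corpus; the vocabulary is fitted on train data only
train_set = [{'REVIEW': "great food great service"},
             {'REVIEW': "terrible food"}]

feature = NgramFeature()
feature.load_vocabulary(train_set)
counts = feature.extract(train_set[0])
print(dict(zip(feature.get_feature_names(), counts)))
# e.g. {'food': 1, 'great': 2, 'service': 1, 'terrible': 0}
```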
diff --git a/sent_rating_feature.py b/sent_rating_feature.py
index 16c68cfa0dd7fb0a2b1f0daf31b616b9b94eec0a..53ee347cbf28b139082e0546ee1137efc92742b7 100644
--- a/sent_rating_feature.py
+++ b/sent_rating_feature.py
@@ -1,40 +1,44 @@
+from feature import Feature
 import numpy as np
-from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
 from textblob import TextBlob

-def extract(corpus_instance):
-    """
-    Extracts single "contrast" feature from a single corpus instance.
-    Returns numpy array of size 1.
-    """
-    review = corpus_instance["REVIEW"]
-    stars = float(corpus_instance["STARS"])
-
-    #sent = get_sent_vader(review)
-    sent = get_sent_textblob(review)
-
-    if (sent <= 0.0 and stars > 3.0) or (sent > 0.0 and stars < 3.0):
-        return np.array([1])
-    else:
-        return np.array([0])
-
-def get_sent_vader(string):
-    analyser = SentimentIntensityAnalyzer()
-    sent = analyser.polarity_scores(string)
-    return sent['compound']
-
-def get_sent_textblob(string):
-    blob = TextBlob(string)
-    return blob.sentiment.polarity
-
-def confusion_matrix(true_labels, predicted_labels):
-    matrix = np.zeros(shape=(2, 2))
-
-    for true, pred in zip(true_labels, predicted_labels):
-        matrix[true][pred] += 1
-    return matrix
+class SentRatingFeature(Feature):
+    """
+    Class representing feature f4
+
+    extract-method returns a feature vector with one value indicating
+    whether there is a contrast between the star rating and the sentiment
+    of the review
+    """
+
+    def extract(self, corpus_instance):
+        """
+        Extracts a single "contrast" feature from a single corpus instance.
+        Returns numpy array of size 1.
+        """
+        review = corpus_instance["REVIEW"]
+        stars = float(corpus_instance["STARS"])
+
+        # sent = self.__get_sent_vader(review)
+        sent = self.__get_sent_textblob(review)
+
+        if (sent <= 0.0 and stars > 3.0) or (sent > 0.0 and stars < 3.0):
+            return np.array([1])
+        else:
+            return np.array([0])
+
+
+    # def __get_sent_vader(self, string):
+    #     analyser = SentimentIntensityAnalyzer()
+    #     sent = analyser.polarity_scores(string)
+    #     return sent['compound']
+
+
+    def __get_sent_textblob(self, string):
+        blob = TextBlob(string)
+        return blob.sentiment.polarity
+
+
+    def get_feature_names(self):
+        return ['sent/rating-contrast']
\ No newline at end of file
diff --git a/stars_feature.py b/stars_feature.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b3afe2eaccbdb62151059247ec904b990fc1b7b
--- /dev/null
+++ b/stars_feature.py
@@ -0,0 +1,18 @@
+from feature import Feature
+import numpy as np
+
+class StarsFeature(Feature):
+    """
+    Class representing feature f7
+
+    extract-method returns a feature vector with one value
+    holding the number of stars of a review
+    """
+
+    def extract(self, corpus_instance):
+        return np.array([float(corpus_instance['STARS'])])
+
+
+    def get_feature_names(self):
+        return ['number_of_stars']
\ No newline at end of file
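Reviewer note: a hypothetical sketch of the two single-value features side by side, combined the same way create_input_vector concatenates them; the instance dict and its values are made up for illustration:

```python
import numpy as np
from sent_rating_feature import SentRatingFeature
from stars_feature import StarsFeature

# hypothetical instance: glowing text paired with a 1-star rating
instance = {'REVIEW': "Wonderful stay, truly great service!", 'STARS': '1.0'}

contrast = SentRatingFeature().extract(instance)  # should be array([1]): text and rating disagree
stars = StarsFeature().extract(instance)          # array([1.])

vector = np.append(contrast, stars)               # how create_input_vector stacks features
```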