From e702bcf9e1ea76bf69d5d3344df36acc9f977b08 Mon Sep 17 00:00:00 2001
From: Maximilian Blunck <blunck@cl.uni-heidelberg.de>
Date: Thu, 8 Feb 2018 00:29:12 +0100
Subject: [PATCH] Features are now represented by classes

---
 contrast_feature.py    | 215 +++++++++++++++++++++--------------------
 feature.py             |  19 ++++
 feature_extraction.py  |  61 ++++++++++++
 features.py            |  78 ---------------
 ngram_feature.py       |  79 ++++++++-------
 sent_rating_feature.py |  60 ++++++------
 stars_feature.py       |  18 ++++
 7 files changed, 284 insertions(+), 246 deletions(-)
 create mode 100644 feature.py
 create mode 100644 feature_extraction.py
 delete mode 100644 features.py
 create mode 100644 stars_feature.py

diff --git a/contrast_feature.py b/contrast_feature.py
index 64d8369..86c9947 100644
--- a/contrast_feature.py
+++ b/contrast_feature.py
@@ -1,114 +1,121 @@
-import corpus
+from feature import Feature
 import nltk
 import numpy as np
-from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
+from textblob import TextBlob
 
-def get_phrase(i, n, tokens_only, tags_only):
-	#fourgram: n=4 
-	try:
-		pos_sent_phrase = tokens_only[i]
-		neg_situation_phrase = " ".join(tokens_only[(i+1):(i+n)])
 
-		try:
-			if tags_only[i-1] == 'R':
-				pos_sent_phrase = tokens_only[i-1] +" "+ pos_sent_phrase
-		except IndexError:
-			return (pos_sent_phrase, neg_situation_phrase)
+class ContrastFeature(Feature):
+	"""
+	Class representing feature f6, based on Riloff et al. (2013)
+
+	The extract method returns a feature vector of length 1, containing
+	the number of sentiment contrasts found in a review.
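+	(a contrast pairs a positive sentiment verb phrase with a negative
+	situation phrase, e.g. "love" followed by "being ignored")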
+
+	"""
+
+	def get_feature_names(self):
+		return ['riloff-contrast']
+
+
+	def extract(self, corpus_instance):
+		tokens = corpus_instance['TOKENS']
+		tagged = nltk.pos_tag(tokens)
+
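+		# keep only the first letter of each POS tag (e.g. VBD -> V, RB -> R)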
+		tags_only = [y[0] for (x,y) in tagged]
+		tokens_only = [x for (x,y) in tagged]
+
+		# pos sentiment phrases
+		verb_phrase_list = ["V"]
+
+		# only situation pos-tag combos like the following should be matched
+		uni_pos_list = ["V"]
+		bi_pos_list = ["VV", "VR", "RV", "TV", "VN", "VP", "VJ"]
+		tri_pos_list = ["VVV", "VVR", "VRV", "VRR", "RVV", "VNR", "VIN", "VTV", "VIP"]
+		excl_N_tri_pos_list = ["VVN", "VNN", "VJN", "VDN", "RVN"] # excl_N: tag after the match must not be N
+		excl_JN_tri_pos_list = ["VRJ", "VVJ", "RVJ"] # excl_JN: tag after the match must not be J or N
+
+		# generate possible pos-tag combinations
+		phrase_patterns = []
+		excl_N_phrase_patterns = []
+		excl_JN_phrase_patterns = []
+
+		for a in verb_phrase_list:
+			for b in uni_pos_list:
+				phrase_patterns.append(a+b)
+			for c in bi_pos_list:
+				phrase_patterns.append(a+c)
+			for d in tri_pos_list:
+				phrase_patterns.append(a+d)
+			for e in excl_N_tri_pos_list:
+				excl_N_phrase_patterns.append(a+e)
+			for f in excl_JN_tri_pos_list:
+				excl_JN_phrase_patterns.append(a+f)
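+
+		# e.g. "V" + "RV" gives the trigram pattern "VRV": a sentiment verb
+		# followed by an adverb-verb situation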
+
+		contrasts = 0
+		candidates = []
+
+		# get all phrases matching the patterns
+		#TODO: elim doubles
+		for i in range(len(tags_only)):
+
+			fourgram = "".join(tags_only[i:(i+4)])
+			trigram = "".join(tags_only[i:(i+3)])
+			bigram = "".join(tags_only[i:(i+2)])
+
+			if fourgram in phrase_patterns:
+				candidates.append(self.__get_phrase(i, 4, tokens_only, tags_only))
+
+			elif fourgram in excl_N_phrase_patterns:
+				try:
+					if tags_only[i+4] != 'N':
+						candidates.append(self.__get_phrase(i, 4, tokens_only, tags_only))
+				except IndexError:
+					candidates.append(self.__get_phrase(i, 4, tokens_only, tags_only))
+
+			elif fourgram in excl_JN_phrase_patterns:
+				try:
+					if tags_only[i+4] != 'N' and tags_only[i+4] != 'J':
+						candidates.append(self.__get_phrase(i, 4, tokens_only, tags_only))
+				except IndexError:
+					candidates.append(self.__get_phrase(i, 4, tokens_only, tags_only))
+					
+			elif trigram in phrase_patterns:
+				candidates.append(self.__get_phrase(i, 3, tokens_only, tags_only))
+
+			elif bigram in phrase_patterns:
+				candidates.append(self.__get_phrase(i, 2, tokens_only, tags_only))
+
+
+		# determine sentiment of extracted phrases
+		if candidates:
+			for phrase in candidates:
+				verb = phrase[0]
+				situation = phrase[1]
+
+				sent_verb = TextBlob(verb).sentiment.polarity
+				sent_situation = TextBlob(situation).sentiment.polarity
+
+				# if verb and situation sentiments contrast with one another: increase feature value by one
+				if (sent_verb > 0.0 and sent_situation < 0.0) or (sent_verb < 0.0 and sent_situation > 0.0):
+					#print("phrase: {} {} sent verb: {}  sent situation: {}".format(verb, situation, sent_verb, sent_situation))
+					contrasts += 1
+
+		return np.array([contrasts])
+
 
-		return (pos_sent_phrase, neg_situation_phrase)
-
-	except IndexError:
-		pass
-
-def extract(corpus_instance):
-	tokens = corpus_instance['TOKENS']
-	tagged = nltk.pos_tag(tokens)
-
-	tags_only = [y[0] for (x,y) in tagged]
-	tokens_only = [x for (x,y) in tagged]
-
-	# pos sentiment phrases
-	verb_phrase_list = ["V"]
-
-	# only situation pos-tag combos like the following should be matched
-	uni_pos_list = ["V"]
-	bi_pos_list = ["VV", "VR", "RV", "TV", "VN", "VN", "VN", "VP", "VJ"]
-	tri_pos_list = ["VVV", "VVR", "VRV", "VVR", "VRR", "RVV", "VNR", "VIN", "VTV", "VIP"] 
-	excl_N_tri_pos_list = ["VVN", "VNN", "VJN", "VDN", "RVN"] # -JN = next tag is not J/N
-	excl_JN_tri_pos_list = ["VRJ", "VVJ", "VRJ", "RVJ"]
-
-	# generate possible pos-tag comintations
-	phrase_patterns = []
-	excl_N_phrase_patterns = []
-	excl_JN_phrase_patterns = []
-
-	for a in verb_phrase_list:
-		for b in uni_pos_list:
-			phrase_patterns.append(a+b)
-		for c in bi_pos_list:
-			phrase_patterns.append(a+c)
-		for d in tri_pos_list:
-			phrase_patterns.append(a+d)
-		for e in excl_N_tri_pos_list:
-			excl_N_phrase_patterns.append(a+e)
-		for f in excl_JN_tri_pos_list:
-			excl_JN_phrase_patterns.append(a+f)
-  
-	contrasts = 0
-	candidates = []
-
-	# get all phrases matching the patterns
-	#TODO: elim doubles
-	for i in range(len(tags_only)):
-
-		fourgram = "".join(tags_only[i:(i+4)])
-		trigram = "".join(tags_only[i:(i+3)])
-		bigram = "".join(tags_only[i:(i+2)])
-
-		if fourgram in phrase_patterns:
-			candidates.append(get_phrase(i, 4, tokens_only, tags_only))
-
-		elif fourgram in excl_N_phrase_patterns:
-			try:
-				if tokens_only[i+4] != 'N':
-					candidates.append(get_phrase(i, 4, tokens_only, tags_only))
-			except IndexError:
-				candidates.append(get_phrase(i, 4, tokens_only, tags_only))
+	def __get_phrase(self, i, n, tokens_only, tags_only):
+		# builds phrase corresponding to the matched POS-tag-combo
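+		# e.g. tokens ["really", "love", "being", "ignored"] with i=1, n=3
+		# -> ("really love", "being ignored"), since "really" is tagged R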
+		try:
+			pos_sent_phrase = tokens_only[i]
+			neg_situation_phrase = " ".join(tokens_only[(i+1):(i+n)])
 
-		elif fourgram in excl_JN_phrase_patterns:
 			try:
-				if tokens_only[i+4] != 'N' and tokens_only[i+4] != 'J':
-					candidates.append(get_phrase(i, 4, tokens_only, tags_only))
+				if i > 0 and tags_only[i-1] == 'R': # i-1 would otherwise wrap to the last tag
+					pos_sent_phrase = tokens_only[i-1] +" "+ pos_sent_phrase
 			except IndexError:
-				candidates.append(get_phrase(i, 4, tokens_only, tags_only))
-				
-		elif trigram in phrase_patterns:
-			candidates.append(get_phrase(i, 3, tokens_only, tags_only))
-
-		elif bigram in phrase_patterns:
-			candidates.append(get_phrase(i, 2, tokens_only, tags_only))
-
+				return (pos_sent_phrase, neg_situation_phrase)
 
-	# determine sentiment of extracted phrased
-	if candidates != []:
-		for phrase in candidates:
-			verb = phrase[0]
-			situation = phrase[1]
-
-			analyser = SentimentIntensityAnalyzer()
-			sent_verb = analyser.polarity_scores(verb)['compound']
-			sent_situation = analyser.polarity_scores(situation)['compound']
-
-			if (sent_verb > 0.0 and sent_situation < 0.0) or (sent_verb < 0.0 and sent_situation > 0.0):
-				#print("phrase: {} {} sent verb: {}  sent situation: {}".format(verb, situation, sent_verb, sent_situation))
-				contrasts += 1
-
-	return np.array([contrasts])
-
-
-# if __name__ == '__main__':
-# 	corpus = corpus.read_corpus("corpus_shuffled.csv")
-
-# 	for instance in corpus:
-# 		extract(instance)
+			return (pos_sent_phrase, neg_situation_phrase)
 
+		except IndexError:
+			pass
diff --git a/feature.py b/feature.py
new file mode 100644
index 0000000..6acd2d2
--- /dev/null
+++ b/feature.py
@@ -0,0 +1,19 @@
+class Feature:
+	"""Class representing an abstract feature
+	
+	extract():
+		- needs to be overwritten by subclasses
+		- should take a corpus instance (dict) as an arg
+		- should return np.array containing feature values
+
+	get_feature_names():
+		- needs to be overwritten by subclasses
+		- should return a list of feature descriptions
+		  corresponding to feature vector
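+
+	A minimal illustrative subclass (hypothetical; assumes numpy as np):
+
+		class TokenCountFeature(Feature):
+			def extract(self, corpus_instance):
+				return np.array([len(corpus_instance['TOKENS'])])
+
+			def get_feature_names(self):
+				return ['token_count']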
+	"""
+
+	def extract(self, corpus_instance):
+		raise NotImplementedError
+
+	def get_feature_names(self):
+		raise NotImplementedError
diff --git a/feature_extraction.py b/feature_extraction.py
new file mode 100644
index 0000000..c495dff
--- /dev/null
+++ b/feature_extraction.py
@@ -0,0 +1,61 @@
+import sent_rating_feature
+import ngram_feature
+import pos_feature
+import punctuation_feature
+import contrast_feature
+import surface_patterns
+import stars_feature
+import numpy as np
+import config
+
+
+def extract_features(train_set, test_set):
+    """
+    Extracts feature vectors for the given train and test sets,
+    based on the features selected in the config file.
+    Returns lists of feature vectors, plus the list of feature objects
+    for further use.
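+
+    Illustrative usage (assuming, e.g., config.feature_selection = ['f1', 'f6']):
+
+        train_X, test_X, feats = extract_features(train_set, test_set)
+        names = [n for f in feats for n in f.get_feature_names()]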
+
+    """
+    f_selection_map = {'f1' : ngram_feature.NgramFeature(),
+                       'f2' : pos_feature.PosFeature(), 
+                       'f3' : surface_patterns.SurfacePatternFeature(),
+                       'f4' : sent_rating_feature.SentRatingFeature(), 
+                       'f5' : punctuation_feature.PunctuationFeature(), 
+                       'f6' : contrast_feature.ContrastFeature(),
+                       'f7' : stars_feature.StarsFeature()
+                       }
+
+    # get all feature objects of features selected in config
+    features = [f_selection_map[feat] for feat in config.feature_selection]
+
+    # load vocabulary where a feature defines one (built from train data only)
+    for feature in features:
+        if hasattr(feature, 'load_vocabulary'):
+            feature.load_vocabulary(train_set)
+
+    train_inputs = [create_input_vector(features, instance) for instance in train_set]
+    test_inputs = [create_input_vector(features, instance) for instance in test_set]
+
+    # print stats
+    print("\nTotal features per train sample:\t{}".format(len(train_inputs[0])))
+    print("Number of train samples:\t\t{}".format(len(train_inputs)))
+
+    return train_inputs, test_inputs, features
+
+
+def create_input_vector(features, corpus_instance):
+    """
+    Create a feature vector for a single corpus instance
+    """
+    vectors = [feature.extract(corpus_instance) for feature in features]
+    return np.concatenate(vectors)
+    
\ No newline at end of file
diff --git a/features.py b/features.py
deleted file mode 100644
index 1e85b02..0000000
--- a/features.py
+++ /dev/null
@@ -1,78 +0,0 @@
-import sent_rating_feature
-import ngram_feature
-import pos_feature
-import punctuation_feature
-import contrast_feature
-import numpy as np
-import config
-
-
-def create_vector(corpus_instance, uni_gram_vocab=None, pos_vocabulary=None, surface_vocabulary=None, lemma_vocab=None):
-    """
-    Calls all feature extraction programms and combines
-    resulting arrays to a single input vector (for a
-    single corpus instance)
-    Example for corpus instance: OrderedDict([('LABEL', '0'), ('STARS', '5.0'), etc.
-    """
-
-    # functions and their seperate arguments are stored in dict and only called when needed
-    # key : (func, [args]) 
-    f_map = {'f1' : (ngram_feature.extract, [corpus_instance, 'REVIEW', uni_gram_vocab]),
-             'f2' : (pos_feature.extract, [corpus_instance, pos_vocabulary]), 
-             'f3' : (ngram_feature.extract, [corpus_instance, 'SURFACE_PATTERNS', surface_vocabulary]), 
-             'f4' : (sent_rating_feature.extract, [corpus_instance]), 
-             'f5' : (punctuation_feature.extract, [corpus_instance]), 
-             'f6' : (contrast_feature.extract, [corpus_instance]),
-             'f7' : (extract_star_rating, [corpus_instance]),
-             'f8' : (ngram_feature.extract, [corpus_instance, 'LEMMAS', lemma_vocab])
-             }
-
-    fn, args = f_map[config.feature_selection[0]]
-    vector = fn(*args)
-    
-    if len(config.feature_selection) > 1:
-
-        for i in range(1, len(config.feature_selection)):
-            fn, args = f_map[config.feature_selection[i]]
-            vector = np.append(vector, fn(*args))
-
-    return vector
-
-
-def extract_features(train_set, test_set):
-    
-    # vocabularies
-    n_gram_vocab = None
-    pos_bigram_vocab = None
-    sp_n_gram_vocab = None
-    lemma_n_gram_vocab = None
-
-    print("--------Feature Extraction-------")
-
-    if 'f1' in config.feature_selection:
-        n_gram_vocab = ngram_feature.get_vocabulary(train_set, 'REVIEW', config.n_range_words) 
-    if 'f2' in config.feature_selection:
-        pos_bigram_vocab = pos_feature.get_pos_vocabulary(train_set)
-    if 'f3' in config.feature_selection:
-        sp_n_gram_vocab = ngram_feature.get_vocabulary(train_set, 'SURFACE_PATTERNS', config.n_range_surface_patterns)
-    if 'f8' in config.feature_selection:
-        lemma_n_gram_vocab = ngram_feature.get_vocabulary(train_set, 'LEMMAS', config.n_range_lemmas)
-
-    # inputs:
-    train_inputs = [create_vector(el, n_gram_vocab, pos_bigram_vocab, sp_n_gram_vocab, lemma_n_gram_vocab) #, bi_gram_vocab, tri_gram_vocab
-                    for el in train_set]  # 1000 vectors
-    test_inputs = [create_vector(el, n_gram_vocab, pos_bigram_vocab, sp_n_gram_vocab, lemma_n_gram_vocab) #, bi_gram_vocab, tri_gram_vocab
-                   for el in test_set]  # 254 vectors
-
-    # print stats
-    print("Total features per train sample:  {}".format(len(train_inputs[0])))
-    print("Number of train samples:          {}".format(len(train_inputs)))
-
-    return train_inputs, test_inputs
-
-def extract_star_rating(corpus_instance):
-    return np.array([float(corpus_instance['STARS'])])
-
-
-
-
diff --git a/ngram_feature.py b/ngram_feature.py
index 3808aa3..799cd88 100644
--- a/ngram_feature.py
+++ b/ngram_feature.py
@@ -1,50 +1,57 @@
+from feature import Feature
 from sklearn.feature_extraction.text import CountVectorizer
+import config
 
-
-def extract(corpus_instance, corpus_dict_key, vocabulary):
-	"""
-	Extracts n-gram features from a single corpus instance.
-	n depends on vocabulary, which needs to be extracted using get_vocabulary.
-	Returns numpy array of size of vocabulary
+class NgramFeature(Feature):
 	"""
-	n = len(list(vocabulary.keys())[0].split())
-	vectorizer = CountVectorizer(vocabulary=vocabulary, ngram_range=(n, n))
+	Class representing feature f1
 
-	vector = None
+	The extract method returns a feature vector whose length equals the
+	size of its vocabulary, containing n-gram counts.
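+	(illustrative: with vocabulary ["bad", "good"], the review
+	"good good movie" yields the count vector [0, 2])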
+	
+	"""
 
-	if corpus_dict_key == 'LEMMAS':
-		lemma_str = " ".join(corpus_instance['LEMMAS'])
-		vector = vectorizer.transform([lemma_str])
-	else:
-		vector = vectorizer.transform([corpus_instance[corpus_dict_key]]) # takes a list
+	name = "Bag-of-ngram"
+	corpus_key = 'REVIEW'
+	n_range = config.n_range_words
+	vocabulary = None
+	vectorizer = None
 	
-	return vector.toarray()[0]
 
+	# def __init__(self, lemmatize=False):
+	# 	#TODO if lemmatize == True
 
-def get_vocabulary(corpus, corpus_dict_key, n_range):
-	"""
-	Creates vocabulary based on given corpus.
-	"""
 
-	all_reviews = []
-	for line in corpus:
+	def extract(self, corpus_instance):
+		"""
+		Extracts n-gram features from a single corpus instance.
+		Returns numpy array of size of vocabulary
+		"""
+		vector = self.vectorizer.transform([corpus_instance[self.corpus_key]]) # takes a list
+		return vector.toarray()[0]
+
+		
+	def load_vocabulary(self, corpus):
+		"""
+		Creates vocabulary based on given corpus (Only train-data!).
+		"""
+		all_reviews = []
+
+		for line in corpus:
+			all_reviews.append(line[self.corpus_key])
 
-		if corpus_dict_key == 'LEMMAS':
-			lemma_str = " ".join(line['LEMMAS'])
-			all_reviews.append(lemma_str)
+		vectorizer = CountVectorizer(ngram_range=self.n_range)
+		vectorizer.fit(all_reviews)
 
-		else:
-			all_reviews.append(line[corpus_dict_key])
+		self.vectorizer = vectorizer
+		self.vocabulary = vectorizer.vocabulary_
 
-	vectorizer = CountVectorizer(ngram_range=n_range)
-	vectorizer.fit(all_reviews)
+		if config.print_stats:
+			print("{} vocab size (n={},{}):\t{}".format(self.name, self.n_range[0], self.n_range[1], len(self.vocabulary)))
 
-	# print stats
-	if corpus_dict_key == 'SURFACE_PATTERNS':
-		print("SP {}-gram vocab size:             {}".format(n_range[0],len(vectorizer.vocabulary_)))	
-	elif corpus_dict_key == 'REVIEW':
-		print("BOW {}-gram vocab size:            {}".format(n_range[0],len(vectorizer.vocabulary_)))	
-	elif corpus_dict_key == 'LEMMAS':
-		print("Lemma {}-gram vocab size:          {}".format(n_range[0],len(vectorizer.vocabulary_)))
 
-	return vectorizer.vocabulary_
+	def get_feature_names(self):
+		'''
+		Turn the vocabulary dict into a list of n-grams, ordered by their column indices in the feature vector.
+		'''
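+		# e.g. {'bad plot': 0, 'good movie': 1} -> ['bad plot', 'good movie']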
+		return sorted(self.vocabulary, key=self.vocabulary.get)
diff --git a/sent_rating_feature.py b/sent_rating_feature.py
index 16c68cf..53ee347 100644
--- a/sent_rating_feature.py
+++ b/sent_rating_feature.py
@@ -1,40 +1,44 @@
+from feature import Feature
 import numpy as np
-from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
 from textblob import TextBlob
 
-
-def extract(corpus_instance):
-	"""
-	Extracts single "contrast" feature from a single corpus instance.
-	Returns numpy array of size 1.
+class SentRatingFeature(Feature):
 	"""
-	review = corpus_instance["REVIEW"]
-	stars = float(corpus_instance["STARS"])
-
-    #sent = get_sent_vader(review)
-	sent = get_sent_textblob(review)
-	
-	if (sent <= 0.0 and stars > 3.0) or (sent > 0.0 and stars < 3.0):
-		return np.array([1])
-	else:
-		return np.array([0])
+	Class representing feature f4
 
+	The extract method returns a feature vector with one value indicating
+	whether there is a contrast between the star rating and the sentiment
+	of the review.
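+	(e.g. a 5.0-star review whose text has negative polarity yields [1])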
 
-def get_sent_vader(string):
-    analyser = SentimentIntensityAnalyzer()
-    sent = analyser.polarity_scores(string)
-    return sent['compound']
+	"""
 
+	def extract(self, corpus_instance):
+		"""
+		Extracts single "contrast" feature from a single corpus instance.
+		Returns numpy array of size 1.
+		"""
+		review = corpus_instance["REVIEW"]
+		stars = float(corpus_instance["STARS"])
+
+		#sent = self.__get_sent_vader(review)
+		sent = self.__get_sent_textblob(review)
+		
+		if (sent <= 0.0 and stars > 3.0) or (sent > 0.0 and stars < 3.0):
+			return np.array([1])
+		else:
+			return np.array([0])
 
-def get_sent_textblob(string):
-    blob = TextBlob(string)
-    return blob.sentiment.polarity
+	
+	# def __get_sent_vader(self, string):
+	#     analyser = SentimentIntensityAnalyzer()
+	#     sent = analyser.polarity_scores(string)
+	#     return sent['compound']
 
 
-def confusion_matrix(true_labels, predicted_labels):
-    matrix = np.zeros(shape=(2, 2))
+	def __get_sent_textblob(self, string):
+		blob = TextBlob(string)
+		return blob.sentiment.polarity
 
-    for true, pred in zip(true_labels, predicted_labels):
-        matrix[true][pred] += 1
 
-    return matrix
+	def get_feature_names(self):
+		return ['sent/rating-contrast']
\ No newline at end of file
diff --git a/stars_feature.py b/stars_feature.py
new file mode 100644
index 0000000..6b3afe2
--- /dev/null
+++ b/stars_feature.py
@@ -0,0 +1,18 @@
+from feature import Feature
+import numpy as np
+
+class StarsFeature(Feature):
+	"""
+	Class representing feature f7
+
+	The extract method returns a feature vector with one value
+	holding the star rating of a review.
+
+	"""
+
+	def extract(self, corpus_instance):
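+		# e.g. corpus_instance['STARS'] == '5.0' -> array([5.0])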
+		return np.array([float(corpus_instance['STARS'])])
+
+
+	def get_feature_names(self):
+		return ['number_of_stars']
\ No newline at end of file
-- 
GitLab