Completed Riloff contrast feature

9cfb91e3 · blunck · 5156a102 · 9cfb91e3
Commit 9cfb91e3 authored 7 years ago by blunck
--- a/contrast_feature.py
+++ b/contrast_feature.py
 import corpus
 import nltk
+import numpy as np
 from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

+def get_phrase(i, n, tokens_only, tags_only):
+	"""Split the n-gram starting at index i into a (sentiment phrase, situation phrase) pair.
+
+	i: index of the first token of the n-gram (the candidate sentiment word).
+	n: length of the n-gram (fourgram: n=4); tokens i+1 .. i+n-1 form the
+	   situation phrase.
+	tokens_only: the tokens of the instance.
+	tags_only: one tag character per token, parallel to tokens_only.
+
+	Returns the tuple (pos_sent_phrase, neg_situation_phrase). If the token
+	directly before i is tagged 'R', it is prepended to the sentiment phrase.
+	Returns None when there is no token at index i.
+	"""
+	try:
+		pos_sent_phrase = tokens_only[i]
+	except IndexError:
+		# no token at i, so there is no phrase to build
+		return None
+
+	# slicing never raises: an n-gram cut off by the end of the instance
+	# simply yields a shorter situation phrase
+	neg_situation_phrase = " ".join(tokens_only[(i+1):(i+n)])
+
+	# Only look at a real predecessor: for i == 0, tags_only[i-1] would wrap
+	# around to the LAST tag instead of raising IndexError.
+	if 0 < i <= len(tags_only) and tags_only[i-1] == 'R':
+		pos_sent_phrase = tokens_only[i-1] +" "+ pos_sent_phrase
+
+	return (pos_sent_phrase, neg_situation_phrase)

 def extract(corpus_instance):
+	"""Count phrases whose verb and following situation carry opposite sentiment.
+
+	corpus_instance: mapping whose 'TOKENS' entry holds the tokens of one instance.
+
+	The tokens are POS-tagged with nltk and every tag is reduced to its first
+	character. Tag n-grams matching the patterns below yield candidate
+	(verb phrase, situation phrase) pairs; a pair counts as a contrast when the
+	VADER compound scores of its two parts have opposite signs.
+
+	Returns a one-element numpy array holding the number of contrasts found.
+	"""
 	tokens = corpus_instance['TOKENS']
 	tagged = nltk.pos_tag(tokens)

-	# only pos-tag combos like the following should be matched
-	uni_pos_list = ["VB"]
-	bi_pos_list = ["VBVB", "VBRB", "RBVB", "TOVB", "VBNN", "VBNNP", "VBNNS", "VBPRP", "VBPRP$", "VBJJ", "VBJJS"]
-	#tri_pos_list = []
-
+	# keep only the first character of every tag, parallel to the tokens
+	tags_only = [y[0] for (x,y) in tagged]
+	tokens_only = [x for (x,y) in tagged]
+
+	# pos sentiment phrases
+	verb_phrase_list = ["V"]
+
+	# only situation pos-tag combos like the following should be matched
+	uni_pos_list = ["V"]
+	bi_pos_list = ["VV", "VR", "RV", "TV", "VN", "VN", "VN", "VP", "VJ"]
+	tri_pos_list = ["VVV", "VVR", "VRV", "VVR", "VRR", "RVV", "VNR", "VIN", "VTV", "VIP"]
+	# matched only if the tag following the phrase is not N
+	excl_N_tri_pos_list = ["VVN", "VNN", "VJN", "VDN", "RVN"]
+	# matched only if the tag following the phrase is neither J nor N
+	excl_JN_tri_pos_list = ["VRJ", "VVJ", "VRJ", "RVJ"]
+
+	# generate possible pos-tag combinations: verb tag + situation tags
+	phrase_patterns = []
+	excl_N_phrase_patterns = []
+	excl_JN_phrase_patterns = []
+
+	for a in verb_phrase_list:
+		for b in uni_pos_list:
+			phrase_patterns.append(a+b)
+		for c in bi_pos_list:
+			phrase_patterns.append(a+c)
+		for d in tri_pos_list:
+			phrase_patterns.append(a+d)
+		for e in excl_N_tri_pos_list:
+			excl_N_phrase_patterns.append(a+e)
+		for f in excl_JN_tri_pos_list:
+			excl_JN_phrase_patterns.append(a+f)
+
+	contrasts = 0
 	candidates = []

-	# go through all tags and find phrases: VB + tag-combo of list above
-	for i in range(len(tagged)):
-		if i+4 <= len(tagged):
-
-			# phrase should begin with verb
-			if tagged[i][1] == 'VB':
-
-				uni_pos = tagged[i+1][1]
-				bi_pos = tagged[i+1][1] + tagged[i+2][1]
-				#tri_pos = tagged[i+1][1] + tagged[i+2][1] + tagged[i+3][1]
-
-				#if tri_pos in tri_pos_list:
-
-				if bi_pos in bi_pos_list:
-
-					phrase = tagged[i:(i+3)]
-					candidates.append(phrase)
+	# get all phrases matching the patterns
+	#TODO: elim doubles
+	for i in range(len(tags_only)):
+
+		fourgram = "".join(tags_only[i:(i+4)])
+		trigram = "".join(tags_only[i:(i+3)])
+		bigram = "".join(tags_only[i:(i+2)])
+
+		# tag following the fourgram; None when the fourgram ends the instance,
+		# in which case nothing can exclude the phrase
+		next_tag = tags_only[i+4] if i+4 < len(tags_only) else None
+
+		if fourgram in phrase_patterns:
+			candidates.append(get_phrase(i, 4, tokens_only, tags_only))
+
+		elif fourgram in excl_N_phrase_patterns:
+			# the exclusion is about the following TAG, not the following token
+			if next_tag != 'N':
+				candidates.append(get_phrase(i, 4, tokens_only, tags_only))
+
+		elif fourgram in excl_JN_phrase_patterns:
+			if next_tag != 'N' and next_tag != 'J':
+				candidates.append(get_phrase(i, 4, tokens_only, tags_only))
+
+		elif trigram in phrase_patterns:
+			candidates.append(get_phrase(i, 3, tokens_only, tags_only))

-				elif uni_pos in uni_pos_list:
+		elif bigram in phrase_patterns:
+			candidates.append(get_phrase(i, 2, tokens_only, tags_only))

-					phrase = tagged[i:(i+2)]
-					candidates.append(phrase)

 	# determine sentiment of extracted phrased
 	if candidates != []:
 		for phrase in candidates:
-			verb = phrase[0][0]
-			situation = ""
-			for word in phrase[1:len(phrase)]:
-				situation += word[0] + " "
+			verb = phrase[0]
+			situation = phrase[1]

 			analyser = SentimentIntensityAnalyzer()
 			sent_verb = analyser.polarity_scores(verb)['compound']
 			sent_situation = analyser.polarity_scores(situation)['compound']

-			#if (sent_verb > 0.0 and sent_situation < 0.0) or (sent_verb < 0.0 and sent_situation > 0.0):
-			print("phrase: {} {} sent verb: {}  sent situation: {}".format(verb, situation, sent_verb, sent_situation))
-				
+			# a contrast: the two parts have compound scores of opposite sign
+			if (sent_verb > 0.0 and sent_situation < 0.0) or (sent_verb < 0.0 and sent_situation > 0.0):
+				#print("phrase: {} {} sent verb: {}  sent situation: {}".format(verb, situation, sent_verb, sent_situation))
+				contrasts += 1
+
+	return np.array([contrasts])
+

 if __name__ == '__main__':
-	corpus = corpus.read_corpus("corpus_shuffled.csv")[:1000]
+	# bind the data to its own name so the imported corpus module is not shadowed
+	instances = corpus.read_corpus("corpus_shuffled.csv")

-	for instance in corpus:
-		extract(instance)
\ No newline at end of file
+	for instance in instances:
+		extract(instance)
+