Added POS tags and lemmas to each corpus instance dict; access them via the keys "POS" and "LEMMAS".

5156a102 · blunck · 6812b516 · 5156a102
Commit 5156a102 authored 7 years ago by blunck
--- a/corpus.py
+++ b/corpus.py
@@ -2,8 +2,11 @@ import os, os.path
 import re
 import csv
 from nltk.tokenize import word_tokenize
+from nltk.stem import WordNetLemmatizer
+import nltk
 from random import shuffle

+
 def read_corpus(csv_corpus_path):
 	"""
 	Reads a csv-file and returns a list of dicts. 
@@ -21,6 +24,12 @@ def read_corpus(csv_corpus_path):
 			# tokenization
 			data["TOKENS"] = word_tokenize(row['REVIEW'])

+			# pos-tagging
+			data["POS"] = nltk.pos_tag(data["TOKENS"])
+
+			# lemmatizing
+			data["LEMMAS"] = get_lemmas(data["POS"])
+
 			corpus.append(data)

 	return corpus
@@ -92,6 +101,28 @@ def get_tag_content(tag, text):
 	return match[0].strip()


+def get_lemmas(instance_pos_tags):
+	"""
+	Lemmatizes the tokens of one instance.
+	Takes a list of (token, pos_tag) pairs and returns the list of
+	lemmas in the same order.
+	"""
+
+	# first two chars of a tag -> pos code handed to the lemmatizer
+	wordnet_pos = {"VB" : "v", "NN" : "n", "JJ" : "a"}
+	lemmatizer = WordNetLemmatizer()
+	lemmas = []
+
+	for tagged in instance_pos_tags:
+		# tags without a mapping are lemmatized as nouns ("n")
+		simple_pos = wordnet_pos.get(tagged[1][0:2], "n")
+		lemmas.append(lemmatizer.lemmatize(tagged[0], pos=simple_pos))
+
+	return lemmas
+
+
+
+
 if __name__ == '__main__':
 	"""
 	corpus_path = "../corpus/SarcasmAmazonReviewsCorpus"