From 5156a102ee80dee3857b6e6843d61d7b9a6c80de Mon Sep 17 00:00:00 2001
From: Maximilian Blunck <blunck@cl.uni-heidelberg.de>
Date: Thu, 18 Jan 2018 17:17:37 +0100
Subject: [PATCH] Add POS tags and lemmas to each corpus instance dict,
 accessible via the "POS" and "LEMMAS" keys

---
 corpus.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/corpus.py b/corpus.py
index c23b90c..76317c7 100644
--- a/corpus.py
+++ b/corpus.py
@@ -2,8 +2,11 @@ import os, os.path
 import re
 import csv
 from nltk.tokenize import word_tokenize
+from nltk.stem import WordNetLemmatizer
+import nltk
 from random import shuffle
 
+
 def read_corpus(csv_corpus_path):
 	"""
 	Reads a csv-file and returns a list of dicts. 
@@ -21,6 +24,12 @@ def read_corpus(csv_corpus_path):
 			# tokenization
 			data["TOKENS"] = word_tokenize(row['REVIEW'])
 
+			# pos-tagging
+			data["POS"] = nltk.pos_tag(data["TOKENS"])
+
+			# lemmatizing
+			data["LEMMAS"] = get_lemmas(data["POS"])
+
 			corpus.append(data)
 
 	return corpus
@@ -92,6 +101,28 @@ def get_tag_content(tag, text):
 	return match[0].strip()
 
 
+def get_lemmas(instance_pos_tags):
+	lemmatizer = WordNetLemmatizer()
+	pos_map = {"VB" : "v", "NN" : "n", "JJ" : "a"}
+	lemmas = []
+
+	for pair in instance_pos_tags:
+			token = pair[0]
+			pos_tag = pair[1][0:2]
+
+			simple_pos = "n"
+
+			if pos_tag in pos_map.keys():
+				simple_pos = pos_map[pos_tag]
+
+			lemma = lemmatizer.lemmatize(token, pos=simple_pos)
+			lemmas.append(lemma)
+
+	return lemmas
+
+
+
+
 if __name__ == '__main__':
 	"""
 	corpus_path = "../corpus/SarcasmAmazonReviewsCorpus"
-- 
GitLab