From 5156a102ee80dee3857b6e6843d61d7b9a6c80de Mon Sep 17 00:00:00 2001
From: Maximilian Blunck <blunck@cl.uni-heidelberg.de>
Date: Thu, 18 Jan 2018 17:17:37 +0100
Subject: [PATCH] Added pos_tags and lemmas to corpus instance dict: Access via
 keys: "POS" & "LEMMAS"

---
 corpus.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/corpus.py b/corpus.py
index c23b90c..76317c7 100644
--- a/corpus.py
+++ b/corpus.py
@@ -2,8 +2,11 @@ import os, os.path
 import re
 import csv
 from nltk.tokenize import word_tokenize
+from nltk.stem import WordNetLemmatizer
+import nltk
 from random import shuffle
 
+
 def read_corpus(csv_corpus_path):
     """
     Reads a csv-file and returns a list of dicts.
@@ -21,6 +24,12 @@ def read_corpus(csv_corpus_path):
 
         # tokenization
         data["TOKENS"] = word_tokenize(row['REVIEW'])
+        # pos-tagging
+        data["POS"] = nltk.pos_tag(data["TOKENS"])
+
+        # lemmatizing
+        data["LEMMAS"] = get_lemmas(data["POS"])
+
         corpus.append(data)
 
     return corpus
@@ -92,6 +101,28 @@ def get_tag_content(tag, text):
     return match[0].strip()
 
 
+
+def get_lemmas(instance_pos_tags):
+    """Lemmatize a POS-tagged token sequence with WordNet.
+
+    instance_pos_tags: list of (token, Penn-Treebank-tag) pairs,
+    as produced by nltk.pos_tag.
+
+    Returns a list of lemma strings, one per input token.
+    """
+    lemmatizer = WordNetLemmatizer()
+    # Map the first two chars of a Treebank tag to a WordNet POS code;
+    # any unmapped tag falls back to noun ("n").
+    pos_map = {"VB": "v", "NN": "n", "JJ": "a"}
+    lemmas = []
+
+    for token, tag in instance_pos_tags:
+        wordnet_pos = pos_map.get(tag[:2], "n")
+        lemmas.append(lemmatizer.lemmatize(token, pos=wordnet_pos))
+
+    return lemmas
+
+
 if __name__ == '__main__':
     """
     corpus_path = "../corpus/SarcasmAmazonReviewsCorpus"
-- 
GitLab