Skip to content
Snippets Groups Projects
Commit 5156a102 authored by blunck's avatar blunck
Browse files

Added pos_tags and lemmas to corpus instance dict: Access via keys: "POS" & "LEMMAS"

parent 6812b516
No related branches found
No related tags found
No related merge requests found
......@@ -2,8 +2,11 @@ import os, os.path
import re
import csv
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
from random import shuffle
def read_corpus(csv_corpus_path):
"""
Reads a csv-file and returns a list of dicts.
......@@ -21,6 +24,12 @@ def read_corpus(csv_corpus_path):
# tokenization
data["TOKENS"] = word_tokenize(row['REVIEW'])
# pos-tagging
data["POS"] = nltk.pos_tag(data["TOKENS"])
# lemmatizing
data["LEMMAS"] = get_lemmas(data["POS"])
corpus.append(data)
return corpus
......@@ -92,6 +101,28 @@ def get_tag_content(tag, text):
return match[0].strip()
def get_lemmas(instance_pos_tags):
    """
    Lemmatizes the tokens of one instance and returns the lemmas as a list.

    instance_pos_tags is a sequence of (token, pos_tag) pairs, e.g. the
    output of nltk.pos_tag. Only the first two characters of each tag are
    looked at: "VB" is lemmatized as a verb, "NN" as a noun and "JJ" as an
    adjective. Tokens with any other tag are lemmatized as nouns.

    The returned list has one lemma per input pair, in the same order.
    """
    # two-letter tag prefix -> pos code passed to the lemmatizer
    wordnet_pos_by_prefix = {"VB": "v", "NN": "n", "JJ": "a"}
    wordnet = WordNetLemmatizer()

    return [
        wordnet.lemmatize(
            entry[0],
            # unknown tag prefixes fall back to noun ("n")
            pos=wordnet_pos_by_prefix.get(entry[1][0:2], "n"),
        )
        for entry in instance_pos_tags
    ]
if __name__ == '__main__':
"""
corpus_path = "../corpus/SarcasmAmazonReviewsCorpus"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment