Skip to content
Snippets Groups Projects
Commit 00a46f0d authored by Maximilian Blunck's avatar Maximilian Blunck
Browse files

added tokens to corpus file. the tokenized review can now be directly accessed via the 'TOKENS' key

parent f9848244
No related branches found
No related tags found
No related merge requests found
This diff is collapsed.
import os, os.path
import re
import csv
from nltk.tokenize import word_tokenize
def read_corpus(csv_corpus_path):
"""
Reads a csv-file and returns a list of dicts.
Each dict represents one corpus file.
Keys: ['LABEL', 'FILENAME', 'STARS', 'TITLE', 'DATE', 'AUTHOR', 'PRODUCT', 'REVIEW']
Keys: ['LABEL', 'FILENAME', 'STARS', 'TITLE', 'DATE', 'AUTHOR', 'PRODUCT', 'REVIEW', 'TOKENS']
"""
corpus = []
......@@ -38,7 +39,7 @@ def convert_corpus(corpus_path, out):
with open(out, 'w') as csvfile:
fieldnames = ['LABEL', 'FILENAME', 'STARS', 'TITLE', 'DATE', 'AUTHOR', 'PRODUCT', 'REVIEW']
fieldnames = ['LABEL', 'FILENAME', 'STARS', 'TITLE', 'DATE', 'AUTHOR', 'PRODUCT', 'REVIEW', 'TOKENS']
writer = csv.DictWriter(csvfile, fieldnames)
writer.writeheader()
......@@ -60,9 +61,13 @@ def convert_corpus(corpus_path, out):
data[fieldnames[1]] = file_path.split("/")[-1]
for tag in fieldnames[2:]:
for tag in fieldnames[2:-1]:
data[tag] = get_tag_content(tag, s)
# tokenization
tokens = word_tokenize(data['REVIEW'])
data["TOKENS"] = tokens
writer.writerow(data)
print("Corpus written to: "+out)
......@@ -83,13 +88,14 @@ def get_tag_content(tag, text):
if __name__ == '__main__':
"""
corpus_path = "../corpus/SarcasmAmazonReviewsCorpus"
convert_corpus(corpus_path, "corpus.csv")
#corpus_path = "../corpus/SarcasmAmazonReviewsCorpus"
#convert_corpus(corpus_path, "corpus.csv")
#corpus = read_corpus("corpus.csv")
#print("Corpus size: "+str(len(corpus)))
corpus = read_corpus("corpus.csv")
print("Corpus size: "+str(len(corpus)))
"""
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment