# [repository blame-view residue: "Newer" / "Older"]
import os, os.path
import re
import csv
# [repository blame-view residue: "Maximilian Blunck committed"]
from nltk.tokenize import word_tokenize
def read_corpus(csv_corpus_path):
    """
    Reads a csv-file and returns a list of dicts.
    Each dict represents one corpus file (one csv row).

    For a csv with the columns listed below, the keys of each dict are:
    ['LABEL', 'FILENAME', 'STARS', 'TITLE', 'DATE', 'AUTHOR', 'PRODUCT', 'REVIEW', 'TOKENS']

    Arguments:
        csv_corpus_path: path of the csv file to read. It needs a header
            row that contains at least a 'REVIEW' column.

    Returns:
        A list with one dict per csv row. Each dict holds the csv columns
        plus a 'TOKENS' entry with the word-tokenized 'REVIEW' text.
    """
    corpus = []

    # newline='' leaves newline handling to the csv module, so line breaks
    # embedded in quoted fields (e.g. multi-line reviews) are parsed correctly.
    with open(csv_corpus_path, newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            # tokenization
            row["TOKENS"] = word_tokenize(row['REVIEW'])
            corpus.append(row)

    return corpus
def convert_corpus(corpus_path, out):
    """
    Takes root path of raw Filatrova corpus and converts it into a single csv file.

    Every ".txt" file located directly inside a directory named "Regular"
    or "Ironic" becomes one csv row. LABEL is 1 for "Ironic" and 0 for
    "Regular", FILENAME is the file's base name, and every remaining column
    is filled with the content of the equally named xml-like tag in the file.

    Arguments:
        corpus_path: root directory that is searched recursively.
        out: path of the csv file to write (overwritten if it exists).

    Raises:
        ValueError: if a file's parent directory is neither "Ironic" nor
            "Regular", or if get_tag_content() cannot find exactly one
            occurrence of a tag in a file.
    """
    fieldnames = ['LABEL', 'FILENAME', 'STARS', 'TITLE', 'DATE', 'AUTHOR', 'PRODUCT', 'REVIEW']

    # Collect the corpus files; os.path is used instead of splitting on "/"
    # so the directory check does not depend on the path separator.
    corpus_files = []
    for root, dirs, files in os.walk(corpus_path):
        if os.path.basename(root) not in ("Regular", "Ironic"):
            continue
        for name in files:
            if name.endswith(".txt"):
                corpus_files.append(os.path.join(root, name))

    # newline='' is what the csv module expects for files it writes to;
    # otherwise extra carriage returns can end up in the output.
    with open(out, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames)
        writer.writeheader()

        for file_path in corpus_files:
            # Context manager so the handle is closed after each file
            # instead of leaking one open file per review.
            with open(file_path, encoding="ISO-8859-1") as review_file:
                s = review_file.read()

            data = {}

            # The name of the parent directory determines the label.
            label = os.path.basename(os.path.dirname(file_path))
            if label == "Ironic":
                data[fieldnames[0]] = 1
            elif label == "Regular":
                data[fieldnames[0]] = 0
            else:
                raise ValueError("Label Error!")

            data[fieldnames[1]] = os.path.basename(file_path)

            # All columns after LABEL and FILENAME are read from the
            # equally named tags of the review file.
            for tag in fieldnames[2:]:
                data[tag] = get_tag_content(tag, s)

            writer.writerow(data)

    print("Corpus written to: "+out)
def get_tag_content(tag, text):
    """
    Returns the stripped text enclosed by <tag> ... </tag> in the given text.

    The tag name is escaped, so it is matched literally. Exactly one such
    tag pair has to be present in the text, otherwise a ValueError is raised.
    """
    escaped_tag = re.escape(tag)
    opening = r'<' + escaped_tag + r'>'
    closing = r'</' + escaped_tag + r'>'

    # Lazy group that may also span line breaks.
    pattern = opening + r'((?:\n|.)*?)' + closing
    found = re.compile(pattern).findall(text)

    if len(found) == 1:
        return found[0].strip()

    raise ValueError("Matching error!")
if __name__ == '__main__':
    # Example usage, currently disabled. Uncomment to convert the raw
    # corpus into "corpus.csv" and read it back:
    #
    # corpus_path = "../corpus/SarcasmAmazonReviewsCorpus"
    # convert_corpus(corpus_path, "corpus.csv")
    #
    # corpus = read_corpus("corpus.csv")
    # print("Corpus size: "+str(len(corpus)))
    # print(corpus[0].keys())
    pass