From 5864ba5ee4ccf078a2d15f8f9cfb375f4c47e4a6 Mon Sep 17 00:00:00 2001 From: Steffen Knapp <knapps@cl.uni-heidelberg.de> Date: Thu, 25 Jan 2018 15:52:09 +0100 Subject: [PATCH] added wacky html quotation marks --- punctuation_feature.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/punctuation_feature.py b/punctuation_feature.py index 904eae1..c86a0b5 100644 --- a/punctuation_feature.py +++ b/punctuation_feature.py @@ -7,6 +7,7 @@ def extract(corpus_instance): #"?!", "!?", "???", "!!!" are no lemmas allcaps = 0 excessive_punctuation = re.compile('[!?][!?]+') + html_quotes = re.compile('"') review_tokens = (word_tokenize(corpus_instance["TITLE"])) + corpus_instance["TOKENS"] review_lemmas = ((corpus_instance["TITLE"] + " " + corpus_instance["REVIEW"])) @@ -22,7 +23,7 @@ def extract(corpus_instance): corpus_instance_vector.append(len(re.findall(excessive_punctuation, review_lemmas))/len(review_lemmas)) corpus_instance_vector.append(allcaps/len(review_tokens)) - + corpus_instance_vector.append(len(re.findall(html_quotes, review_lemmas))/len(review_tokens)) return corpus_instance_vector @@ -33,5 +34,6 @@ if __name__ == '__main__': """ pass #corpus = read_corpus("minicorpus.csv") - #corpus_instance = corpus[3] + #corpus_instance = corpus[0] + #print(corpus_instance["REVIEW"]) #print(extract(corpus_instance)) -- GitLab