# File: comparatives/fetch.py
"""
Fetches all documents including comparative constructions.
"""

import os
import sys

PATH = "/resources/corpora/multilingual/ontonotes-5.0-conll-2012/conll-2012/v4/data/train/data/english/"
TARGET_PATH = "/proj/zimmermann/comparatives/corpus/train/"
# Adjectives that count as a comparative when tagged JJ and directly
# followed by a noun (NN/NNS).
COMPARATIVES = ["other", "similar", "comparable", "different", "additional", "extra"]


def _write_part(target_path, match_count, filename, part_id, lines):
    """Write the collected lines of one document part and report the target."""
    target = os.path.join(
        target_path, "{}_{}_{}".format(match_count, filename, part_id)
    )
    # Context manager guarantees the handle is flushed and closed.
    with open(target, "w") as out:
        out.write("".join(lines))
    print("Writing to " + target + ".")


def fetch(path=PATH, target_path=TARGET_PATH):
    """Copy every document part that contains a comparative construction.

    Walks ``path`` recursively and reads each ``*.v4_gold_conll`` file.
    Rows are split on whitespace; column 1 is treated as the running part
    id, column 3 as the word form and column 4 as the POS tag.  Lines that
    start with ``#`` are skipped and do not appear in the output.

    Three patterns are counted:

    * a token tagged ``JJR``;
    * a ``JJ`` token directly preceded by an ``RBR`` token;
    * an ``NN``/``NNS`` token directly preceded by a ``JJ`` token whose word
      is listed in ``COMPARATIVES``.

    A part with at least one match is written to
    ``<target_path>/<matches>_<filename>_<part id>``.

    Args:
        path: Root directory that is searched for corpus files.
        target_path: Existing directory that receives the output files.

    Returns:
        Tuple ``(jjr_count, rbr_count, jj_count)`` with the total number of
        matches per pattern; the same numbers are printed.

    Raises:
        IndexError: If a data row has fewer than five columns.
        ValueError: If column 1 of a data row is not an integer.
    """
    jjr_count = 0
    rbr_count = 0
    jj_count = 0
    comparative_words = frozenset(COMPARATIVES)

    for root, _dirs, files in os.walk(path):
        for f in files:
            if not f.endswith(".v4_gold_conll"):
                continue

            contains_comparative = 0
            part_lines = []
            current_doc_id = 0
            last_pos = ""
            last_word = ""

            with open(os.path.join(root, f), "r") as document:
                for line in document:
                    stripped = line.strip()
                    if not stripped:
                        # Blank line = sentence boundary: keep it in the
                        # output, but do not let a two-token pattern match
                        # across two sentences.
                        part_lines.append(line)
                        last_pos = ""
                        last_word = ""
                        continue
                    if line.startswith("#"):
                        continue

                    cols = stripped.split()
                    # A new part starts when the id advances by exactly one;
                    # flush the finished part first.
                    if int(cols[1]) == current_doc_id + 1:
                        if contains_comparative:
                            _write_part(target_path, contains_comparative, f,
                                        current_doc_id, part_lines)
                        contains_comparative = 0
                        part_lines = []
                        current_doc_id += 1

                    part_lines.append(line)
                    word, pos = cols[3], cols[4]
                    if pos == "JJR":
                        contains_comparative += 1
                        jjr_count += 1
                        print("JJR")
                    elif pos == "JJ" and last_pos == "RBR":
                        contains_comparative += 1
                        rbr_count += 1
                        print("RBR-JJ")
                    elif (pos in ("NN", "NNS") and last_pos == "JJ"
                          and last_word in comparative_words):
                        contains_comparative += 1
                        jj_count += 1
                        print("JJ-NN")
                    last_pos = pos
                    last_word = word

            # The last part of a file is never followed by an id change.
            if contains_comparative:
                _write_part(target_path, contains_comparative, f,
                            current_doc_id, part_lines)

    print(jjr_count, rbr_count, jj_count)
    return jjr_count, rbr_count, jj_count


if __name__ == "__main__":
    fetch()
# File: comparatives/fetch.py
"""
Fetches all documents including comparative constructions.
"""

import os
import sys

PATH = "/resources/corpora/multilingual/ontonotes-5.0-conll-2012/conll-2012/v4/data/train/data/english/"
TARGET_PATH = "/proj/zimmermann/comparatives/corpus/train/"
# Adjectives that count as a comparative when tagged JJ and directly
# followed by a noun (NN/NNS).
COMPARATIVES = ["other", "similar", "comparable", "different", "additional", "extra"]


def _write_part(target_path, match_count, filename, part_id, lines):
    """Write the collected lines of one document part and report the target."""
    target = os.path.join(
        target_path, "{}_{}_{}".format(match_count, filename, part_id)
    )
    # Context manager guarantees the handle is flushed and closed.
    with open(target, "w") as out:
        out.write("".join(lines))
    print("Writing to " + target + ".")


def fetch(path=PATH, target_path=TARGET_PATH):
    """Copy every document part that contains a comparative construction.

    Walks ``path`` recursively and reads each ``*.v4_gold_conll`` file.
    Rows are split on whitespace; column 1 is treated as the running part
    id, column 3 as the word form and column 4 as the POS tag.  Lines that
    start with ``#`` are skipped and do not appear in the output.

    Three patterns are counted:

    * a token tagged ``JJR``;
    * a ``JJ`` token directly preceded by an ``RBR`` token;
    * an ``NN``/``NNS`` token directly preceded by a ``JJ`` token whose word
      is listed in ``COMPARATIVES``.

    A part with at least one match is written to
    ``<target_path>/<matches>_<filename>_<part id>``.

    Args:
        path: Root directory that is searched for corpus files.
        target_path: Existing directory that receives the output files.

    Returns:
        Tuple ``(jjr_count, rbr_count, jj_count)`` with the total number of
        matches per pattern; the same numbers are printed.

    Raises:
        IndexError: If a data row has fewer than five columns.
        ValueError: If column 1 of a data row is not an integer.
    """
    jjr_count = 0
    rbr_count = 0
    jj_count = 0
    comparative_words = frozenset(COMPARATIVES)

    for root, _dirs, files in os.walk(path):
        for f in files:
            if not f.endswith(".v4_gold_conll"):
                continue

            contains_comparative = 0
            part_lines = []
            current_doc_id = 0
            last_pos = ""
            last_word = ""

            with open(os.path.join(root, f), "r") as document:
                for line in document:
                    stripped = line.strip()
                    if not stripped:
                        # Blank line = sentence boundary: keep it in the
                        # output, but do not let a two-token pattern match
                        # across two sentences.
                        part_lines.append(line)
                        last_pos = ""
                        last_word = ""
                        continue
                    if line.startswith("#"):
                        continue

                    cols = stripped.split()
                    # A new part starts when the id advances by exactly one;
                    # flush the finished part first.
                    if int(cols[1]) == current_doc_id + 1:
                        if contains_comparative:
                            _write_part(target_path, contains_comparative, f,
                                        current_doc_id, part_lines)
                        contains_comparative = 0
                        part_lines = []
                        current_doc_id += 1

                    part_lines.append(line)
                    word, pos = cols[3], cols[4]
                    if pos == "JJR":
                        contains_comparative += 1
                        jjr_count += 1
                        print("JJR")
                    elif pos == "JJ" and last_pos == "RBR":
                        contains_comparative += 1
                        rbr_count += 1
                        print("RBR-JJ")
                    elif (pos in ("NN", "NNS") and last_pos == "JJ"
                          and last_word in comparative_words):
                        contains_comparative += 1
                        jj_count += 1
                        print("JJ-NN")
                    last_pos = pos
                    last_word = word

            # The last part of a file is never followed by an id change.
            if contains_comparative:
                _write_part(target_path, contains_comparative, f,
                            current_doc_id, part_lines)

    print(jjr_count, rbr_count, jj_count)
    return jjr_count, rbr_count, jj_count


if __name__ == "__main__":
    fetch()