Loading comparatives/fetch.py +51 −13 Original line number Diff line number Diff line Loading @@ -4,24 +4,34 @@ Fetches all documents including comparative constructions. import os import sys import re PATH = "/resources/corpora/multilingual/ontonotes-5.0-conll-2012/conll-2012/v4/data/train/data/english/" TARGET_PATH = "/proj/zimmermann/comparatives/corpus/train/" COMPARATIVES = ["other", "similar", "comparable", "different", "additional", "extra"] def fetch(): def fetch(PATH, TARGET_PATH, qual="gold"): jjr_count = 0 rbr_count = 0 jj_count = 0 others_count = 0 for root, dirs, files in os.walk(PATH): for f in files: if f.endswith(".v4_gold_conll"): if f.endswith(".v4_{}_conll".format(qual)): with open(os.path.join(root, f), "r") as document: current_word_id = 0 contains_comparative = 0 current_doc_text = "" current_doc_id = 0 last_pos = "" last_word = "" stack = [] np_with_jj = False in_comp = False buff="" #buffer saves annotation after * for next line for line in document: if line.strip(): Loading @@ -35,7 +45,7 @@ def fetch(): if contains_comparative: target = "{}{}_{}_{}".format(TARGET_PATH,str(contains_comparative),f,current_doc_id) open(target, "w").write(current_doc_text) print("Writing to "+target+".") #print("Writing to "+target+".") contains_comparative = 0 current_doc_text = "" Loading @@ -43,22 +53,43 @@ def fetch(): current_doc_text += line if cols[4] == "JJR": synt = cols[5] new_tags = re.findall("[A-Z]*", synt) open_brackets = synt.count("(") closed_brackets = synt.count(")") stack+=new_tags if "NP" in new_tags: np_start_word_id = current_word_id np_with_jj = [] if cols[4] == "JJ" and "NP" in stack and cols[3] in COMPARATIVES: np_with_jj.append(cols[3]) if cols[4] == "JJR" and "NP" in stack: contains_comparative += 1 jjr_count += 1 print("JJR") elif cols[4] == "JJ" and last_pos == "RBR": elif cols[4] == "JJ" and last_pos == "RBR" and "NP" in stack: contains_comparative += 1 rbr_count += 1 print("RBR-JJ") elif (cols[4] == "NN" or cols[4] == "NNS") and last_pos == "JJ" and last_word in COMPARATIVES: elif (cols[4] == "NN" or cols[4] == "NNS") and np_with_jj: contains_comparative += 1 jj_count += 1 print("JJ-NN") elif cols[3].lower() == "others" and "NP" in stack: contains_comparative += 1 others_count += 1 last_pos = cols[4] last_word = cols[3] if closed_brackets > 0: del stack[-1*closed_brackets:] current_word_id += 1 else: current_doc_text += line Loading @@ -66,9 +97,16 @@ def fetch(): if contains_comparative: target = "{}{}_{}_{}".format(TARGET_PATH,str(contains_comparative),f,current_doc_id) open(target, "w").write(current_doc_text) print("Writing to "+target+".") print(jjr_count, rbr_count, jj_count) print(jjr_count, rbr_count, jj_count, others_count) if __name__ == "__main__": fetch() PATH = "/resources/corpora/multilingual/ontonotes-5.0-conll-2012/conll-2012/v4/data/train/data/english/" TARGET_PATH = "/proj/zimmermann/comparatives/corpus/train/" fetch(PATH, TARGET_PATH) PATH = "/resources/corpora/multilingual/ontonotes-5.0-conll-2012/conll-2012/v4/data/test/data/english/" TARGET_PATH = "/proj/zimmermann/comparatives/corpus/test/" fetch(PATH, TARGET_PATH, "auto") PATH = "/resources/corpora/multilingual/ontonotes-5.0-conll-2012/conll-2012/v4/data/development/data/english/" TARGET_PATH = "/proj/zimmermann/comparatives/corpus/dev/" fetch(PATH, TARGET_PATH) comparatives/format.py +114 −15 Original line number Diff line number Diff line Loading @@ -4,32 +4,52 @@ Fetches all documents including comparative constructions. import os import sys import re PATH = "/proj/zimmermann/comparatives/corpus/" TARGET_PATH = "/proj/zimmermann/comparatives/annotation/OntoNotes/" COMPARATIVES = ["other", "similar", "comparable", "different", "additional", "extra"] def format(): c = [0,0,0,0] for root, dirs, files in os.walk(PATH): for f in files: with open(os.path.join(root, f), "r") as document: current_word_id = 0 markable_id = 0 comp_id = -1 set_id = 0 jjr_count = 0 rbr_count = 0 jj_count = 0 mmax = open(TARGET_PATH+f+".mmax",'w') mmax.write('<?xml version="1.0"?>\n<mmax_project>\n<sentences></sentences>\n<words>{}.xml</words>\n<gestures></gestures>\n<keyactions></keyactions>\n</mmax_project>'.format(f)) mmax.close() xml = open(TARGET_PATH+"Basedata/"+f+".xml",'w') xml.write('<?xml version="1.0" encoding="US-ASCII"?>\n<!DOCTYPE words SYSTEM "words.dtd">\n<words>') comp_level = open(TARGET_PATH+"Markables/"+f+"_comparison_level.xml",'w') comp_level.write('<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE markables SYSTEM "markables.dtd">\n<markables xmlns="www.eml.org/NameSpaces/comparison">') coref_level = open(TARGET_PATH+"Markables/"+f+"_coref_level.xml",'w') coref_level.write('<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE markables SYSTEM "markables.dtd">\n<markables xmlns="www.eml.org/NameSpaces/coref">') last_pos = "" last_word = "" stack = list() coref_stack = dict() np_with_jj = False comp_word = None in_comp = 0 buff="" #buffer saves annotation after * for next line for line in document: if line.strip(): Loading @@ -40,23 +60,102 @@ def format(): xml.write('<word id="word_{}">{}</word>\n'.format(current_word_id,cols[3])) if cols[4] == "JJR": jjr_count += 1 print("JJR") elif cols[4] == "JJ" and last_pos == "RBR": rbr_count += 1 print("RBR-JJ") elif (cols[4] == "NN" or cols[4] == "NNS") and last_pos == "JJ" and last_word in COMPARATIVES: jj_count += 1 print("JJ-NN") synt = cols[5] new_tags = re.findall("[A-Z]+", synt) open_brackets = synt.count("(") closed_brackets = synt.count(")") stack+=new_tags if "NP" in new_tags: np_start_word_id = current_word_id np_with_jj = False if closed_brackets and "NP" in stack[-1*closed_brackets:] and in_comp and np_start_word_id <= comp_id <= current_word_id: comp_level.write('<markable id="markable_{}" span="word_{}..word_{}" comparison_type="anaphora" comparison_class="set_{}" mmax_level="comparison"/>\n'.format(markable_id, np_start_word_id, current_word_id, set_id)) in_comp = False markable_id += 1 set_id += 1 if cols[4] == "JJ" and cols[3] in COMPARATIVES: np_with_jj = True comp_id = current_word_id if cols[4] == "JJR" and "NP" in stack: set_id += 1 comp_id = current_word_id in_comp = True c[0] += 1 comp_level.write('<markable id="markable_{}" span="word_{}" comparative_type="anaphoric" comparison_type="comparative" comparison_class="set_{}" mmax_level="comparison" />\n'.format(markable_id, current_word_id, set_id)) markable_id += 1 elif cols[4] == "JJ" and last_pos == "RBR" and "NP" in stack: set_id += 1 comp_id = current_word_id in_comp = True c[1] += 1 comp_level.write('<markable id="markable_{}" span="word_{}..word_{}" comparative_type="anaphoric" comparison_type="comparative" comparison_class="set_{}" mmax_level="comparison" />\n'.format(markable_id, current_word_id-1, current_word_id ,set_id)) markable_id += 1 elif (cols[4] == "NN" or cols[4] == "NNS") and np_with_jj: set_id += 1 in_comp = True c[2] += 1 comp_level.write('<markable id="markable_{}" span="word_{}" comparative_type="anaphoric" comparison_type="comparative" comparison_class="set_{}" mmax_level="comparison" />\n'.format(markable_id, comp_id, set_id)) markable_id += 1 np_with_jj = False elif cols[3].lower() == "others": set_id += 1 comp_id = current_word_id c[3] += 1 comp_level.write('<markable id="markable_{}" span="word_{}" comparative_type="anaphoric" comparison_type="comparative" comparison_class="set_{}" mmax_level="comparison" />\n'.format(markable_id, current_word_id, set_id)) comp_level.write('<markable id="markable_{}" span="word_{}" comparison_type="anaphora" comparison_class="set_{}" mmax_level="comparison"/>\n'.format(markable_id, current_word_id, set_id)) markable_id += 1 set_id += 1 last_pos = cols[4] last_word = cols[3] if closed_brackets > 0: del stack[-1*closed_brackets:] coref_anno = cols[-1] for coref in coref_anno.split("|"): start = False end = False buffr = "" for i, char in enumerate(coref): if char == "(": start = True elif char == ")": end = True elif char in "0123456789": buffr += char if start and end: coref_level.write('<markable id="markable_{}" span="word_{}" agreement="none" np_form="none" coref_class="set_{}" mmax_level="coref" semantic_class="none" type="none" grammatical_role="none" />'.format(markable_id, current_word_id, buffr)) markable_id += 1 elif start: if buffr not in coref_stack: coref_stack[buffr] = [current_word_id] else: coref_stack[buffr].append(current_word_id) elif end: coref_level.write('<markable id="markable_{}" span="word_{}..word_{}" agreement="none" np_form="none" coref_class="set_{}" mmax_level="coref" semantic_class="none" type="none" grammatical_role="none" />'.format(markable_id, coref_stack[buffr][-1], current_word_id, buffr)) markable_id += 1 current_word_id += 1 xml.write("</words>") xml.close() comp_level.write("</markables>") comp_level.close() coref_level.write("</markables>") coref_level.close() print(jjr_count, rbr_count, jj_count) print(c) if __name__ == "__main__": format() Loading
comparatives/fetch.py +51 −13 Original line number Diff line number Diff line Loading @@ -4,24 +4,34 @@ Fetches all documents including comparative constructions. import os import sys import re PATH = "/resources/corpora/multilingual/ontonotes-5.0-conll-2012/conll-2012/v4/data/train/data/english/" TARGET_PATH = "/proj/zimmermann/comparatives/corpus/train/" COMPARATIVES = ["other", "similar", "comparable", "different", "additional", "extra"] def fetch(): def fetch(PATH, TARGET_PATH, qual="gold"): jjr_count = 0 rbr_count = 0 jj_count = 0 others_count = 0 for root, dirs, files in os.walk(PATH): for f in files: if f.endswith(".v4_gold_conll"): if f.endswith(".v4_{}_conll".format(qual)): with open(os.path.join(root, f), "r") as document: current_word_id = 0 contains_comparative = 0 current_doc_text = "" current_doc_id = 0 last_pos = "" last_word = "" stack = [] np_with_jj = False in_comp = False buff="" #buffer saves annotation after * for next line for line in document: if line.strip(): Loading @@ -35,7 +45,7 @@ def fetch(): if contains_comparative: target = "{}{}_{}_{}".format(TARGET_PATH,str(contains_comparative),f,current_doc_id) open(target, "w").write(current_doc_text) print("Writing to "+target+".") #print("Writing to "+target+".") contains_comparative = 0 current_doc_text = "" Loading @@ -43,22 +53,43 @@ def fetch(): current_doc_text += line if cols[4] == "JJR": synt = cols[5] new_tags = re.findall("[A-Z]*", synt) open_brackets = synt.count("(") closed_brackets = synt.count(")") stack+=new_tags if "NP" in new_tags: np_start_word_id = current_word_id np_with_jj = [] if cols[4] == "JJ" and "NP" in stack and cols[3] in COMPARATIVES: np_with_jj.append(cols[3]) if cols[4] == "JJR" and "NP" in stack: contains_comparative += 1 jjr_count += 1 print("JJR") elif cols[4] == "JJ" and last_pos == "RBR": elif cols[4] == "JJ" and last_pos == "RBR" and "NP" in stack: contains_comparative += 1 rbr_count += 1 print("RBR-JJ") elif (cols[4] == "NN" or cols[4] == "NNS") and last_pos == "JJ" and last_word in COMPARATIVES: elif (cols[4] == "NN" or cols[4] == "NNS") and np_with_jj: contains_comparative += 1 jj_count += 1 print("JJ-NN") elif cols[3].lower() == "others" and "NP" in stack: contains_comparative += 1 others_count += 1 last_pos = cols[4] last_word = cols[3] if closed_brackets > 0: del stack[-1*closed_brackets:] current_word_id += 1 else: current_doc_text += line Loading @@ -66,9 +97,16 @@ def fetch(): if contains_comparative: target = "{}{}_{}_{}".format(TARGET_PATH,str(contains_comparative),f,current_doc_id) open(target, "w").write(current_doc_text) print("Writing to "+target+".") print(jjr_count, rbr_count, jj_count) print(jjr_count, rbr_count, jj_count, others_count) if __name__ == "__main__": fetch() PATH = "/resources/corpora/multilingual/ontonotes-5.0-conll-2012/conll-2012/v4/data/train/data/english/" TARGET_PATH = "/proj/zimmermann/comparatives/corpus/train/" fetch(PATH, TARGET_PATH) PATH = "/resources/corpora/multilingual/ontonotes-5.0-conll-2012/conll-2012/v4/data/test/data/english/" TARGET_PATH = "/proj/zimmermann/comparatives/corpus/test/" fetch(PATH, TARGET_PATH, "auto") PATH = "/resources/corpora/multilingual/ontonotes-5.0-conll-2012/conll-2012/v4/data/development/data/english/" TARGET_PATH = "/proj/zimmermann/comparatives/corpus/dev/" fetch(PATH, TARGET_PATH)
comparatives/format.py +114 −15 Original line number Diff line number Diff line Loading @@ -4,32 +4,52 @@ Fetches all documents including comparative constructions. import os import sys import re PATH = "/proj/zimmermann/comparatives/corpus/" TARGET_PATH = "/proj/zimmermann/comparatives/annotation/OntoNotes/" COMPARATIVES = ["other", "similar", "comparable", "different", "additional", "extra"] def format(): c = [0,0,0,0] for root, dirs, files in os.walk(PATH): for f in files: with open(os.path.join(root, f), "r") as document: current_word_id = 0 markable_id = 0 comp_id = -1 set_id = 0 jjr_count = 0 rbr_count = 0 jj_count = 0 mmax = open(TARGET_PATH+f+".mmax",'w') mmax.write('<?xml version="1.0"?>\n<mmax_project>\n<sentences></sentences>\n<words>{}.xml</words>\n<gestures></gestures>\n<keyactions></keyactions>\n</mmax_project>'.format(f)) mmax.close() xml = open(TARGET_PATH+"Basedata/"+f+".xml",'w') xml.write('<?xml version="1.0" encoding="US-ASCII"?>\n<!DOCTYPE words SYSTEM "words.dtd">\n<words>') comp_level = open(TARGET_PATH+"Markables/"+f+"_comparison_level.xml",'w') comp_level.write('<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE markables SYSTEM "markables.dtd">\n<markables xmlns="www.eml.org/NameSpaces/comparison">') coref_level = open(TARGET_PATH+"Markables/"+f+"_coref_level.xml",'w') coref_level.write('<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE markables SYSTEM "markables.dtd">\n<markables xmlns="www.eml.org/NameSpaces/coref">') last_pos = "" last_word = "" stack = list() coref_stack = dict() np_with_jj = False comp_word = None in_comp = 0 buff="" #buffer saves annotation after * for next line for line in document: if line.strip(): Loading @@ -40,23 +60,102 @@ def format(): xml.write('<word id="word_{}">{}</word>\n'.format(current_word_id,cols[3])) if cols[4] == "JJR": jjr_count += 1 print("JJR") elif cols[4] == "JJ" and last_pos == "RBR": rbr_count += 1 print("RBR-JJ") elif (cols[4] == "NN" or cols[4] == "NNS") and last_pos == "JJ" and last_word in COMPARATIVES: jj_count += 1 print("JJ-NN") synt = cols[5] new_tags = re.findall("[A-Z]+", synt) open_brackets = synt.count("(") closed_brackets = synt.count(")") stack+=new_tags if "NP" in new_tags: np_start_word_id = current_word_id np_with_jj = False if closed_brackets and "NP" in stack[-1*closed_brackets:] and in_comp and np_start_word_id <= comp_id <= current_word_id: comp_level.write('<markable id="markable_{}" span="word_{}..word_{}" comparison_type="anaphora" comparison_class="set_{}" mmax_level="comparison"/>\n'.format(markable_id, np_start_word_id, current_word_id, set_id)) in_comp = False markable_id += 1 set_id += 1 if cols[4] == "JJ" and cols[3] in COMPARATIVES: np_with_jj = True comp_id = current_word_id if cols[4] == "JJR" and "NP" in stack: set_id += 1 comp_id = current_word_id in_comp = True c[0] += 1 comp_level.write('<markable id="markable_{}" span="word_{}" comparative_type="anaphoric" comparison_type="comparative" comparison_class="set_{}" mmax_level="comparison" />\n'.format(markable_id, current_word_id, set_id)) markable_id += 1 elif cols[4] == "JJ" and last_pos == "RBR" and "NP" in stack: set_id += 1 comp_id = current_word_id in_comp = True c[1] += 1 comp_level.write('<markable id="markable_{}" span="word_{}..word_{}" comparative_type="anaphoric" comparison_type="comparative" comparison_class="set_{}" mmax_level="comparison" />\n'.format(markable_id, current_word_id-1, current_word_id ,set_id)) markable_id += 1 elif (cols[4] == "NN" or cols[4] == "NNS") and np_with_jj: set_id += 1 in_comp = True c[2] += 1 comp_level.write('<markable id="markable_{}" span="word_{}" comparative_type="anaphoric" comparison_type="comparative" comparison_class="set_{}" mmax_level="comparison" />\n'.format(markable_id, comp_id, set_id)) markable_id += 1 np_with_jj = False elif cols[3].lower() == "others": set_id += 1 comp_id = current_word_id c[3] += 1 comp_level.write('<markable id="markable_{}" span="word_{}" comparative_type="anaphoric" comparison_type="comparative" comparison_class="set_{}" mmax_level="comparison" />\n'.format(markable_id, current_word_id, set_id)) comp_level.write('<markable id="markable_{}" span="word_{}" comparison_type="anaphora" comparison_class="set_{}" mmax_level="comparison"/>\n'.format(markable_id, current_word_id, set_id)) markable_id += 1 set_id += 1 last_pos = cols[4] last_word = cols[3] if closed_brackets > 0: del stack[-1*closed_brackets:] coref_anno = cols[-1] for coref in coref_anno.split("|"): start = False end = False buffr = "" for i, char in enumerate(coref): if char == "(": start = True elif char == ")": end = True elif char in "0123456789": buffr += char if start and end: coref_level.write('<markable id="markable_{}" span="word_{}" agreement="none" np_form="none" coref_class="set_{}" mmax_level="coref" semantic_class="none" type="none" grammatical_role="none" />'.format(markable_id, current_word_id, buffr)) markable_id += 1 elif start: if buffr not in coref_stack: coref_stack[buffr] = [current_word_id] else: coref_stack[buffr].append(current_word_id) elif end: coref_level.write('<markable id="markable_{}" span="word_{}..word_{}" agreement="none" np_form="none" coref_class="set_{}" mmax_level="coref" semantic_class="none" type="none" grammatical_role="none" />'.format(markable_id, coref_stack[buffr][-1], current_word_id, buffr)) markable_id += 1 current_word_id += 1 xml.write("</words>") xml.close() comp_level.write("</markables>") comp_level.close() coref_level.write("</markables>") coref_level.close() print(jjr_count, rbr_count, jj_count) print(c) if __name__ == "__main__": format()