def to_mmax(source_path, target_directory_path):
    """Convert a CoNLL file into an MMAX project (words, entity and coref levels).

    Mentions are obtained from ``from_conll(source_path)``. Output is written
    only if at least one mention has a truthy ``comp_from``; otherwise a
    notice is printed and no file is created.

    Files written below ``target_directory_path``:

    * ``NN_base.mmax``                         -- project file
    * ``Basedata/NN_base_words.xml``           -- one word element per token
    * ``Markables/NN_base_entity_level.xml``   -- one markable per mention
    * ``Markables/NN_base_coref_level.xml``    -- one markable per mention
      that has a truthy ``coref_set``

    where ``base`` is the source file name without extension and ``NN`` is
    the zero-padded number of mentions with a truthy ``comp_from``.

    Args:
        source_path: Path of the input file. Source must be in conll format.
        target_directory_path: Existing root directory of the MMAX project.
            The ``Basedata`` and ``Markables`` sub-directories are created
            on demand.

    Returns:
        None.
    """
    # Local import so the block does not depend on the file's import header.
    from xml.sax.saxutils import escape

    name = os.path.splitext(os.path.basename(source_path))
    base_name = name[0]

    print("\n[NCR] Extracting entities from \033[1m{}\033[0m.".format("".join(name)))

    mentions = from_conll(source_path)
    comp_count = sum(1 for m in mentions if m.comp_from)
    new_name = "{:0>2}_{}".format(comp_count, base_name)

    if not comp_count:
        print("[NCR] \033[1m{}\033[96m does not contain any comparatives.\033[0m".format(base_name))
        return

    # --- project file ------------------------------------------------------
    # Opened first and without creating directories, so a wrong target root
    # still fails loudly instead of silently creating a new tree.
    project_path = os.path.join(target_directory_path, new_name + ".mmax")
    with open(project_path, "w", encoding="utf-8") as mmax:
        mmax.write(
            '<?xml version="1.0"?>\n'
            '<mmax_project>\n'
            '<!--<sentences>002_htc_text.xml</sentences>-->\n'
            '<turns></turns>\n'
            '<words>{}</words>\n'
            '<gestures></gestures>\n'
            '<keyactions></keyactions>\n'
            '</mmax_project>\n'.format(new_name + "_words.xml")
        )

    basedata_dir = os.path.join(target_directory_path, "Basedata")
    # Join with a separator: plain string concatenation only worked when the
    # caller passed a path that already ended in "/".
    markables_dir = os.path.join(target_directory_path, "Markables")
    os.makedirs(basedata_dir, exist_ok=True)
    os.makedirs(markables_dir, exist_ok=True)

    # --- words level -------------------------------------------------------
    words_path = os.path.join(basedata_dir, new_name + "_words.xml")
    word_id = 0
    # The source is read with the default encoding, as before; the output is
    # written as UTF-8 because the XML declaration below promises UTF-8.
    with open(source_path, "r") as source, \
            open(words_path, "w", encoding="utf-8") as target:
        target.write(
            "<?xml version='1.0' encoding='UTF-8'?>\n"
            '<!DOCTYPE words SYSTEM "words.dtd">\n'
            "<words>\n"
        )
        for line in source:
            # Document delimiters and sentence breaks carry no token.
            if line.startswith(("#begin document", "#end document")) or line == "\n":
                continue
            # Column 3 of a token line is used as the word form. Escape XML
            # special characters anywhere in the token, not only when the
            # whole token is a single ampersand.
            token = escape(line.split()[3])
            target.write('<word id="word_{}">{}</word>\n'.format(word_id, token))
            word_id += 1
        target.write("</words>")

    print("[NCR] Writing \033[92m{:>4}\033[0m token(s) to \033[1m{}_words.xml\033[0m.".format(word_id, new_name))

    # --- markable levels ---------------------------------------------------
    entity_level_path = os.path.join(markables_dir, new_name + "_entity_level.xml")
    coref_level_path = os.path.join(markables_dir, new_name + "_coref_level.xml")

    # One counter is shared by both levels, so markable ids are unique
    # across the entity and coref files (ids within one file may have gaps).
    markable_id = 0
    coref_chains = set()

    with open(entity_level_path, "w", encoding="utf-8") as entity_level, \
            open(coref_level_path, "w", encoding="utf-8") as coref_level:
        entity_level.write(
            "<?xml version='1.0'?>\n"
            '<!DOCTYPE markables SYSTEM "markables.dtd">\n'
            '<markables xmlns="www.comp.leeds.ac.uk/markert/entity">\n'
        )
        coref_level.write(
            "<?xml version='1.0'?>\n"
            '<!DOCTYPE markables SYSTEM "markables.dtd">\n'
            '<markables xmlns="www.eml.org/NameSpaces/coref">\n'
        )

        for mention in mentions:
            span = "word_{}..word_{}".format(mention.span[0], mention.span[1])

            if mention.comp_from:
                entity_level.write(
                    '<markable id="markable_{}" information_status="mediated" '
                    'mediated_type="comparative" comparative_type="withintext" '
                    'span="{}" mmax_level="entity" />\n'.format(markable_id, span)
                )
            else:
                entity_level.write(
                    '<markable id="markable_{}" span="{}" '
                    'mmax_level="entity" />\n'.format(markable_id, span)
                )
            markable_id += 1

            if mention.coref_set:
                coref_chains.add(mention.coref_set)
                coref_level.write(
                    '<markable id="markable_{}" span="{}" coref_set="set_{}" '
                    'mmax_level="coref" />\n'.format(markable_id, span, mention.coref_set)
                )
                markable_id += 1

        entity_level.write("</markables>")
        coref_level.write("</markables>")

    print("[NCR] Writing \033[92m{:>4}\033[0m mention(s) to \033[1m{}_entity_level.xml\033[0m.".format(comp_count, new_name))
    print("[NCR] Writing \033[92m{:>4}\033[0m coreference chain(s) to \033[1m{}_coref_level.xml\033[0m.".format(len(coref_chains), new_name))
def to_mmax(source_path, target_directory_path):
    """Convert a CoNLL file into an MMAX project (words, entity and coref levels).

    Mentions are obtained from ``from_conll(source_path)``. Output is written
    only if at least one mention has a truthy ``comp_from``; otherwise a
    notice is printed and no file is created.

    Files written below ``target_directory_path``:

    * ``NN_base.mmax``                         -- project file
    * ``Basedata/NN_base_words.xml``           -- one word element per token
    * ``Markables/NN_base_entity_level.xml``   -- one markable per mention
    * ``Markables/NN_base_coref_level.xml``    -- one markable per mention
      that has a truthy ``coref_set``

    where ``base`` is the source file name without extension and ``NN`` is
    the zero-padded number of mentions with a truthy ``comp_from``.

    Args:
        source_path: Path of the input file. Source must be in conll format.
        target_directory_path: Existing root directory of the MMAX project.
            The ``Basedata`` and ``Markables`` sub-directories are created
            on demand.

    Returns:
        None.
    """
    # Local import so the block does not depend on the file's import header.
    from xml.sax.saxutils import escape

    name = os.path.splitext(os.path.basename(source_path))
    base_name = name[0]

    print("\n[NCR] Extracting entities from \033[1m{}\033[0m.".format("".join(name)))

    mentions = from_conll(source_path)
    comp_count = sum(1 for m in mentions if m.comp_from)
    new_name = "{:0>2}_{}".format(comp_count, base_name)

    if not comp_count:
        print("[NCR] \033[1m{}\033[96m does not contain any comparatives.\033[0m".format(base_name))
        return

    # --- project file ------------------------------------------------------
    # Opened first and without creating directories, so a wrong target root
    # still fails loudly instead of silently creating a new tree.
    project_path = os.path.join(target_directory_path, new_name + ".mmax")
    with open(project_path, "w", encoding="utf-8") as mmax:
        mmax.write(
            '<?xml version="1.0"?>\n'
            '<mmax_project>\n'
            '<!--<sentences>002_htc_text.xml</sentences>-->\n'
            '<turns></turns>\n'
            '<words>{}</words>\n'
            '<gestures></gestures>\n'
            '<keyactions></keyactions>\n'
            '</mmax_project>\n'.format(new_name + "_words.xml")
        )

    basedata_dir = os.path.join(target_directory_path, "Basedata")
    # Join with a separator: plain string concatenation only worked when the
    # caller passed a path that already ended in "/".
    markables_dir = os.path.join(target_directory_path, "Markables")
    os.makedirs(basedata_dir, exist_ok=True)
    os.makedirs(markables_dir, exist_ok=True)

    # --- words level -------------------------------------------------------
    words_path = os.path.join(basedata_dir, new_name + "_words.xml")
    word_id = 0
    # The source is read with the default encoding, as before; the output is
    # written as UTF-8 because the XML declaration below promises UTF-8.
    with open(source_path, "r") as source, \
            open(words_path, "w", encoding="utf-8") as target:
        target.write(
            "<?xml version='1.0' encoding='UTF-8'?>\n"
            '<!DOCTYPE words SYSTEM "words.dtd">\n'
            "<words>\n"
        )
        for line in source:
            # Document delimiters and sentence breaks carry no token.
            if line.startswith(("#begin document", "#end document")) or line == "\n":
                continue
            # Column 3 of a token line is used as the word form. Escape XML
            # special characters anywhere in the token, not only when the
            # whole token is a single ampersand.
            token = escape(line.split()[3])
            target.write('<word id="word_{}">{}</word>\n'.format(word_id, token))
            word_id += 1
        target.write("</words>")

    print("[NCR] Writing \033[92m{:>4}\033[0m token(s) to \033[1m{}_words.xml\033[0m.".format(word_id, new_name))

    # --- markable levels ---------------------------------------------------
    entity_level_path = os.path.join(markables_dir, new_name + "_entity_level.xml")
    coref_level_path = os.path.join(markables_dir, new_name + "_coref_level.xml")

    # One counter is shared by both levels, so markable ids are unique
    # across the entity and coref files (ids within one file may have gaps).
    markable_id = 0
    coref_chains = set()

    with open(entity_level_path, "w", encoding="utf-8") as entity_level, \
            open(coref_level_path, "w", encoding="utf-8") as coref_level:
        entity_level.write(
            "<?xml version='1.0'?>\n"
            '<!DOCTYPE markables SYSTEM "markables.dtd">\n'
            '<markables xmlns="www.comp.leeds.ac.uk/markert/entity">\n'
        )
        coref_level.write(
            "<?xml version='1.0'?>\n"
            '<!DOCTYPE markables SYSTEM "markables.dtd">\n'
            '<markables xmlns="www.eml.org/NameSpaces/coref">\n'
        )

        for mention in mentions:
            span = "word_{}..word_{}".format(mention.span[0], mention.span[1])

            if mention.comp_from:
                entity_level.write(
                    '<markable id="markable_{}" information_status="mediated" '
                    'mediated_type="comparative" comparative_type="withintext" '
                    'span="{}" mmax_level="entity" />\n'.format(markable_id, span)
                )
            else:
                entity_level.write(
                    '<markable id="markable_{}" span="{}" '
                    'mmax_level="entity" />\n'.format(markable_id, span)
                )
            markable_id += 1

            if mention.coref_set:
                coref_chains.add(mention.coref_set)
                coref_level.write(
                    '<markable id="markable_{}" span="{}" coref_set="set_{}" '
                    'mmax_level="coref" />\n'.format(markable_id, span, mention.coref_set)
                )
                markable_id += 1

        entity_level.write("</markables>")
        coref_level.write("</markables>")

    print("[NCR] Writing \033[92m{:>4}\033[0m mention(s) to \033[1m{}_entity_level.xml\033[0m.".format(comp_count, new_name))
    print("[NCR] Writing \033[92m{:>4}\033[0m coreference chain(s) to \033[1m{}_coref_level.xml\033[0m.".format(len(coref_chains), new_name))
source/model.py +1 −0 Original line number Diff line number Diff line Loading @@ -20,3 +20,4 @@ class Mention: return '\033[92m'+plain_repr+'\033[0m' else: return plain_repr