source/baselines.py +2 −3

@@ -18,7 +18,6 @@
except:
    model = kv.load_word2vec_format(fname, binary=False)
print("Done. ")

def next_mention(mentions, anaphora):
    # rank candidates by inverse token distance; returns the nearest preceding mention
    sims = [1/(anaphora.span[0] - mention.span[1]) if mention.span[1] < anaphora.span[0] else 0 for mention in mentions]
    return mentions[np.argmax(sims)]

@@ -57,9 +56,9 @@ if __name__ == "__main__":
    for f in files[:]:
        print("Fetching test data...")
-       mentions = extraction.get_mentions(root+f)
+       mentions = extraction.from_isnotes(root+f)
-       comparatives = [m for m in mentions if m.comp_from]
+       comparatives = [m for m in mentions if m.comp_from and m.comp_from != "outsidetext"]
        entities = []
        entity_dict = {}

source/config.py +1 −0

EMBEDDINGS_PATH = "/softpro/ss18/kernseife/kernseife/data/en/embeddings/glove.twitter.27B.100d.txt"
DATA_PATH = "/proj/zimmermann/isnotes/ISClean/"
CONLL_PATH = "/resources/corpora/multilingual/ontonotes-5.0-conll-2012/conll-2012/v4/data/train/data/english/"
+CORPUS_PATH = "/home/students/zimmermann/Projects/ncr/corpus/"

COMPARATIVES = ["other", "similar", "comparable", "different", "additional", "extra"]
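Note: model.py is not part of this diff, but every call site constructs mentions as Mention(tokens, [start, end], mention_id, coref_set, comp_from). A rough sketch of the assumed shape, for orientation only, not the PR's actual definition:

# Assumed shape of model.Mention, inferred from call sites in this diff.
class Mention:
    def __init__(self, tokens, span, mention_id, coref_set, comp_from):
        self.tokens = tokens          # token strings, or None for bare spans
        self.span = span              # [start, end] word indices, inclusive
        self.mention_id = mention_id  # markable id (negative counter for merged annotation mentions)
        self.coref_set = coref_set    # coreference chain id, or None
        self.comp_from = comp_from    # "outsidetext", "markable_N[;markable_M...]", or None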
corpus.write("\n") for c in coref_dict.items(): if c[0] != None: corpus.write("\t".join([str(x) for v in c[1] for x in v])+"\n") for c in coref_dict[None]: corpus.write("\t".join([str(x) for x in c])+"\n") return mentions def from_conll(path): Loading @@ -90,8 +141,9 @@ def from_conll(path): sentence_id = 0 #doc = nlp(" ".join(text)) doc = nlp.tokenizer.tokens_from_list(text) nlp.tagger(doc) nlp.parser(doc) for name, proc in nlp.pipeline: doc=proc(doc) #doc = spacy.tokens.doc.Doc(nlp.vocab, words=text, spaces=[True]*len(text)) for chunk in doc.noun_chunks: tokens = chunk.text.split(" ") Loading Loading @@ -284,6 +336,273 @@ def to_mmax(source_path, target_directory_path): print("[NCR] \033[1m{}\033[96m does not contain any comparatives.\033[0m".format(base_name)) def mark_comparatives(source_path, target_directory_path): """ Source must be in conll format """ name = os.path.splitext(os.path.basename(source_path)) base_name = name[0] corpus = open(config.CORPUS_PATH+basename+".txt","w") all_tokens = list() print("\n[NCR] Extracting entities from \033[1m{}\033[0m.".format("".join(name))) mentions = from_conll(source_path) comp_from_list = [m.comp_from for m in mentions] len_mentions = sum(bool(c) for c in comp_from_list) new_name = "{:0>2}_{}".format(len_mentions, base_name) if any(comp_from_list): mmax = open(os.path.join(target_directory_path, new_name+".mmax"), "w") mmax.write("""<?xml version="1.0"?> <mmax_project> <!--<sentences>002_htc_text.xml</sentences>--> <turns></turns> <words>{}</words> <gestures></gestures> <keyactions></keyactions> </mmax_project> """.format(new_name+"_words.xml")) mmax.close() target_path = os.path.join(target_directory_path, "Basedata", new_name+"_words.xml") source = open(source_path, 'r') target = open(target_path, 'w') word_id = 0 target.write(r"""<?xml version='1.0' encoding='UTF-8'?> <!DOCTYPE words SYSTEM "words.dtd"> <words>""") for line in source: if line.startswith("#begin document"): pass elif line.startswith("#end document"): pass elif line == "\n": corpus.write(" ".join(all_tokens)+"\n") else: naked_line = line.strip() split_line = naked_line.split() token = split_line[3] if token == "&": token = r"&" token = re.sub("<", "<", token) token = re.sub(">", ">", token) target.write(r'<word id="word_'+str(word_id)+r'">'+token+r'</word>'+'\n') word_id += 1 target.write(r"</words>") print("[NCR] Writing \033[92m{:>4}\033[0m token(s) to \033[1m{}_words.xml\033[0m.".format(word_id, new_name)) markable_id = 0 entity_level_path = os.path.join(target_directory_path+"Markables", new_name+"_entity_level.xml") entity_level = open(entity_level_path, 'w') entity_level.write(r"""<?xml version='1.0'?> <!DOCTYPE markables SYSTEM "markables.dtd"> <markables xmlns="www.comp.leeds.ac.uk/markert/entity">""") coref_chains = set() for mention in mentions: if mention.comp_from: entity_level.write(r'<markable id="markable_'+str(markable_id)+r'" information_status="mediated" mediated_type="comparative" comparative_type="withintext" span="word_'+str(mention.span[0])+r'..word_'+str(mention.span[1])+r'" mmax_level="entity" />'+'\n') markable_id += 1 else: pass entity_level.write(r'</markables>') entity_level.close() print("[NCR] Writing \033[92m{:>4}\033[0m mention(s) to \033[1m{}_entity_level.xml\033[0m.".format(len_mentions, new_name)) else: print("[NCR] \033[1m{}\033[96m does not contain any comparatives.\033[0m".format(base_name)) def merge_annotation(entity_file, annotation_file): path = entity_file split_path = path.split(r"/") root, file_name = "/".join(split_path[:-1]), 
@@ -284,6 +336,273 @@ def to_mmax(source_path, target_directory_path):
        print("[NCR] \033[1m{}\033[96m does not contain any comparatives.\033[0m".format(base_name))

def mark_comparatives(source_path, target_directory_path):
    """ Source must be in conll format """
    name = os.path.splitext(os.path.basename(source_path))
    base_name = name[0]
    corpus = open(config.CORPUS_PATH+base_name+".txt", "w")
    all_tokens = list()
    print("\n[NCR] Extracting entities from \033[1m{}\033[0m.".format("".join(name)))
    mentions = from_conll(source_path)
    comp_from_list = [m.comp_from for m in mentions]
    len_mentions = sum(bool(c) for c in comp_from_list)
    new_name = "{:0>2}_{}".format(len_mentions, base_name)
    if any(comp_from_list):
        mmax = open(os.path.join(target_directory_path, new_name+".mmax"), "w")
        mmax.write("""<?xml version="1.0"?>
<mmax_project>
<!--<sentences>002_htc_text.xml</sentences>-->
<turns></turns>
<words>{}</words>
<gestures></gestures>
<keyactions></keyactions>
</mmax_project>
""".format(new_name+"_words.xml"))
        mmax.close()

        target_path = os.path.join(target_directory_path, "Basedata", new_name+"_words.xml")
        source = open(source_path, 'r')
        target = open(target_path, 'w')
        word_id = 0
        target.write(r"""<?xml version='1.0' encoding='UTF-8'?>
<!DOCTYPE words SYSTEM "words.dtd">
<words>""")
        for line in source:
            if line.startswith("#begin document"):
                pass
            elif line.startswith("#end document"):
                pass
            elif line == "\n":
                # note: all_tokens is never populated, so this writes blank lines
                corpus.write(" ".join(all_tokens)+"\n")
            else:
                naked_line = line.strip()
                split_line = naked_line.split()
                token = split_line[3]
                # escape XML reserved characters in the token
                if token == "&":
                    token = r"&amp;"
                token = re.sub("<", "&lt;", token)
                token = re.sub(">", "&gt;", token)
                target.write(r'<word id="word_'+str(word_id)+r'">'+token+r'</word>'+'\n')
                word_id += 1
        target.write(r"</words>")
        print("[NCR] Writing \033[92m{:>4}\033[0m token(s) to \033[1m{}_words.xml\033[0m.".format(word_id, new_name))

        markable_id = 0
        entity_level_path = os.path.join(target_directory_path+"Markables", new_name+"_entity_level.xml")
        entity_level = open(entity_level_path, 'w')
        entity_level.write(r"""<?xml version='1.0'?>
<!DOCTYPE markables SYSTEM "markables.dtd">
<markables xmlns="www.comp.leeds.ac.uk/markert/entity">""")
        coref_chains = set()
        for mention in mentions:
            if mention.comp_from:
                entity_level.write(r'<markable id="markable_'+str(markable_id)+r'" information_status="mediated" mediated_type="comparative" comparative_type="withintext" span="word_'+str(mention.span[0])+r'..word_'+str(mention.span[1])+r'" mmax_level="entity" />'+'\n')
                markable_id += 1
            else:
                pass
        entity_level.write(r'</markables>')
        entity_level.close()
        print("[NCR] Writing \033[92m{:>4}\033[0m mention(s) to \033[1m{}_entity_level.xml\033[0m.".format(len_mentions, new_name))
    else:
        print("[NCR] \033[1m{}\033[96m does not contain any comparatives.\033[0m".format(base_name))
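Note: the three substitutions above cover exactly the characters that xml.sax.saxutils.escape handles, so the standard library could do the same job (a suggestion, not what the PR uses):

# Sketch: stdlib XML escaping instead of manual re.sub calls.
from xml.sax.saxutils import escape

print(escape("AT&T <phone>"))  # AT&amp;T &lt;phone&gt;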
def merge_annotation(entity_file, annotation_file):
    path = entity_file
    split_path = path.split(r"/")
    root, file_name = "/".join(split_path[:-1]), split_path[-1]
    processed = []
    for path, subdirs, files in os.walk(config.CORPUS_PATH):
        for name in files:
            processed.append(name)
    mentions = []
    file_root = file_name[:-5]
    new_root = "_".join(file_name[:-5].split("_")[1:])
    if file_name.endswith(".mmax") and new_root+".ncr" not in processed:
        basedata = BeautifulSoup(open(root+"/Basedata/"+file_root+"_words.xml", "r").read(), features="lxml")
        entities = BeautifulSoup(open(root+"/Markables/"+file_root+"_entity_level.xml", "r").read(), features="lxml")
        coref = BeautifulSoup(open(root+"/Markables/"+file_root+"_coref_level.xml", "r").read(), features="lxml")
        spans = dict()
        word_dict = dict()
        coref_dict = {None: []}
        span_set = set()
        match_count = 0
        total_count = 0
        bd = basedata.contents[2].contents[0].contents[0].find_all()
        for entity in tqdm(entities.contents[2].contents[0].contents[0].find_all()):
            span = entity["span"]
            tokens = []
            mention_id = entity["id"]
            # comparative: either "outsidetext" or antecedent id
            comp_from = None
            try:
                coref_set = coref.find("markable", {"span": span})["coref_set"]
                if coref_set not in coref_dict:
                    coref_dict[coref_set] = list()
            except:
                coref_set = None
            if ".." in span:
                split_span = span.split("..")
                span_ids = [int(re.sub("word_", "", word)) for word in [split_span[0], split_span[-1]]]
                for i in range(span_ids[0], span_ids[1]+1):
                    token = basedata.find("word", {"id": "word_"+str(i)}).contents[0]
                    tokens.append(token)
                mentions.append(Mention(tokens, [span_ids[0], span_ids[1]], mention_id, coref_set, comp_from))
                spans[mention_id] = [span_ids[0]-1, span_ids[1]-1]
                span_set.add((span_ids[0]-1, span_ids[1]-1))
                if coref_set in coref_dict:
                    coref_dict[coref_set].append([span_ids[0]-1, span_ids[1]-1])
                else:
                    coref_dict[None].append([span_ids[0]-1, span_ids[1]-1])
            else:
                token = basedata.find("word", {"id": span}).contents[0]
                tokens.append(token)
                word_id = int(span.split("_")[-1])
                mentions.append(Mention(tokens, [word_id, word_id], mention_id, coref_set, comp_from))
                spans[mention_id] = [word_id-1, word_id-1]
                span_set.add((word_id-1, word_id-1))
                if coref_set in coref_dict:
                    coref_dict[coref_set].append([word_id-1, word_id-1])
                else:
                    coref_dict[None].append([word_id-1, word_id-1])

        path = annotation_file
        split_path = path.split(r"/")
        root, file_name = "/".join(split_path[:-1]), split_path[-1]
        file_root = file_name[:-5]
        new_root = "_".join(file_name[:-5].split("_")[1:])
        new_spans = dict()
        entities = BeautifulSoup(open(root+"/Markables/"+file_root+"_entity_level.xml", "r").read(), features="lxml")
        word_dict = dict()
        mention_ids = -1  # negative ids for mentions that exist only in the annotation file
        for entity in entities.contents[2].contents[0].contents[0].find_all():
            span = entity["span"]
            tokens = []
            mention_id = entity["id"]
            comp_from = None
            if entity["information_status"] == "mediated":
                try:
                    if entity["mediated_type"] == "comparative":
                        if entity["comparative_type"] == "withintext" and entity["comp_from"] != None:
                            comp_from = entity["comp_from"]
                        elif entity["comparative_type"] == "outsidetext":
                            comp_from = "outsidetext"
                except:
                    pass
            if ".." in span:
                split_span = span.split("..")
                span_ids = [int(re.sub("word_", "", word)) for word in [split_span[0], split_span[-1]]]
                mentions.append(Mention(None, [span_ids[0], span_ids[1]], mention_ids, None, comp_from))
                new_spans[mention_id] = [span_ids[0]-1, span_ids[1]-1]
                spans[mention_ids] = [span_ids[0]-1, span_ids[1]-1]
                test_span = [span_ids[0]-1, span_ids[1]-1]
                mention_ids -= 1
            else:
                word_id = int(span.split("_")[-1])
                mentions.append(Mention(None, [word_id, word_id], mention_ids, None, comp_from))
                new_spans[mention_id] = [word_id-1, word_id-1]
                spans[mention_ids] = [word_id-1, word_id-1]
                test_span = [word_id-1, word_id-1]
                mention_ids -= 1
            if not comp_from:
                total_count += 1
                if (test_span[0], test_span[1]) in span_set:
                    match_count += 1

        comp_froms = [m.comp_from for m in mentions]
        if any(comp_froms):
            corpus = open(config.CORPUS_PATH+new_root+".ncr", "w")
            for i, word in enumerate(bd):
                corpus.write(str(i)+"\t"+word.contents[0]+"\n")
            corpus.write("\n")
            for m in mentions:
                if m.comp_from:
                    if m.comp_from == "outsidetext":
                        corpline = spans[m.mention_id]
                        corpus.write("\t".join([str(c) for c in corpline])+"\n")
                    else:
                        if m.comp_from.startswith("markable"):
                            corpline = spans[m.mention_id]
                            cfroms = m.comp_from.split(";")
                            for cfrom in cfroms:
                                corpline += new_spans[cfrom]
                            corpus.write("\t".join([str(c) for c in corpline])+"\n")
            corpus.write("\n")
            for c in coref_dict.items():
                if c[0] != None:
                    corpus.write("\t".join([str(x) for v in c[1] for x in v])+"\n")
            for c in coref_dict[None]:
                corpus.write("\t".join([str(x) for x in c])+"\n")
        return match_count, total_count
    return 0, 0

if __name__ == "__main__":
    # ISnotes
    #root = config.DATA_PATH

@@ -295,14 +614,32 @@ if __name__ == "__main__":
    # conll/OntoNotes
    root = config.CONLL_PATH
-   files = []
-   for x in os.walk(root):
-       for y in x[2]:
-           #print(y)
-           if y.endswith(".v4_gold_conll"):
-               files.append(x[0]+"/"+y)
+   #files = []
+   #for x in os.walk(root):
+       #for y in x[2]:
+           ##print(y)
+           #if y.endswith(".v4_gold_conll"):
+               #files.append(x[0]+"/"+y)
+   #for f in files:
+       #to_mmax(f, "/home/students/zimmermann/Projects/ncr/annotation/Entity/")
+   other_files = os.listdir("/home/students/zimmermann/Projects/ncr/annotation/Entity/")
+   root = "/home/students/zimmermann/Projects/ncr/annotation/Freestyle/"
+   files = os.listdir(root)
+   mc, tc = 0, 0
-   for f in files:
-       to_mmax(f, "/home/students/zimmermann/Projects/ncr/annotation/Entity/")
+   for f in sorted(files)[::-1]:
+       if f.endswith(".mmax"):
+           split_path = f.split(r"/")
+           _, file_name = "/".join(split_path[:-1]), split_path[-1]
+           print(f)
+           m, t = merge_annotation("/home/students/zimmermann/Projects/ncr/annotation/Entity/"+[s for s in other_files if s.endswith("_".join(f.split("_")[1:]))][0], root+f)
+           mc += m
+           tc += t
+   print(mc, tc, 0 if tc == 0 else mc/tc)
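Note: both halves of merge_annotation split paths by hand on "/". An equivalent sketch with os.path (a suggested simplification; the file name here is invented):

# Sketch: stdlib path handling instead of manual split(r"/").
import os

path = "/home/students/zimmermann/Projects/ncr/annotation/Entity/03_doc.mmax"
root, file_name = os.path.split(path)          # "/.../Entity", "03_doc.mmax"
file_root = os.path.splitext(file_name)[0]     # "03_doc"
new_root = "_".join(file_root.split("_")[1:])  # drop the count prefix: "doc"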
") def closest_preceding_mention(toks, ents, ana): best_ment = (0,0) for ent in ents: for ment in ent: if ana[0] - ment[1] < ana[0] - best_ment[1] and ment[1] < ana[0]: best_ment = ment return {best_ment} def head_match(toks, ents, ana, window=20): best_ment = (0,0) ana_toks = toks[ana[0]:ana[1]+1] ana_toks = ["unlike" if tok=="like" else tok for tok in ana_toks] doc = nlp.tokenizer.tokens_from_list(ana_toks) for name, proc in nlp.pipeline: doc=proc(doc) ana_head = [token for token in doc if token.head == token][0].lemma_ for ent in ents: for ment in ent: ment_toks = toks[ment[0]:ment[1]+1] ment_toks = ["unlike" if tok=="like" else tok for tok in ment_toks] doc = nlp.tokenizer.tokens_from_list(ment_toks) for name, proc in nlp.pipeline: doc=proc(doc) try: ment_head = [token for token in doc if token.head == token][0].lemma_ except IndexError: continue if ana_head == ment_head: if ana[0] - ment[1] < ana[0] - best_ment[1] and ment[1] < ana[0] and ana[0] - ment[1] < window: best_ment = ment if best_ment == (0,0): return closest_preceding_mention(toks, ents, ana) else: return {best_ment} def highest_embedding_similarity(toks, ents, ana): best_ment = (0,0) best_sim = 0 ana_toks = toks[ana[0]:ana[1]+1] ana_toks = ["unlike" if tok=="like" else tok for tok in ana_toks] doc = nlp.tokenizer.tokens_from_list(ana_toks) for name, proc in nlp.pipeline: doc=proc(doc) ana_head = [token for token in doc if token.head == token][0].text for ent in ents: for ment in ent: ment_toks = toks[ment[0]:ment[1]+1] ment_toks = ["unlike" if tok=="like" else tok for tok in ment_toks] doc = nlp.tokenizer.tokens_from_list(ment_toks) for name, proc in nlp.pipeline: doc=proc(doc) try: ment_head = [token for token in doc if token.head == token][0].text except IndexError: continue sim = 1/(model.wv.wmdistance(ment_head, ana_head)+1) if best_sim <= sim and ment[1] < ana[0]: print(ment_head, sim, ana_head) best_ment = ment best_sim = sim if best_ment == (0,0): return closest_preceding_mention(toks, ents, ana) else: return {best_ment} def most_salient_entity(toks, ents, ana): best_ent = {(0,0)} max_ent = 0 for ent in ents: if len(ent) >= max_ent and any([ment[1] < ana[0] for ment in ent]): max_ent = len(ent) best_ent = ent return best_ent def largest_entity_span(toks, ents, ana): best_ent = {(0,0)} max_span = 0 for ent in ents: if ent: start = min([x for x,y in ent]) end = max([y for x,y in ent]) span = end-start if span >= max_span and any([ment[1] < ana[0] for ment in ent]): max_span = span best_ent = ent if best_ment == {(0,0)}: return closest_preceding_mention(toks, ents, ana) else: return {best_ment} def import_doc(path): doc = open(path, 'r').read() tokens, comps, coref = doc.split("\n\n") tokens = [line.strip().split()[1] for line in tokens.split("\n")] anaphora = dict() for line in comps.split("\n"): line = [int(l) for l in line.strip().split()] if len(line) > 2: anaphora[(line[0],line[1])] = {(line[i],line[i+1]) for i in range(2,len(line),2)} else: anaphora[(line[0],line[1])] = set() entities = list() for line in coref.split("\n"): line = [int(l) for l in line.strip().split()] entities.append([(line[i],line[i+1]) for i in range(0,len(line),2)]) return tokens, anaphora, entities if __name__ == "__main__": root = "/home/students/zimmermann/Projects/ncr/corpus/" files = os.listdir(root) baseline = head_match correct = 0 incorrect = 0 for f in files: path = root+f print(path) toks, anas, ents = import_doc(path) for ana in anas.keys(): if anas[ana]: #ents = set.union(*anas.values(), *ents) system = baseline(toks, ents, ana) 
if __name__ == "__main__":
    root = "/home/students/zimmermann/Projects/ncr/corpus/"
    files = os.listdir(root)
    baseline = head_match
    correct = 0
    incorrect = 0
    for f in files:
        path = root+f
        print(path)
        toks, anas, ents = import_doc(path)
        for ana in anas.keys():
            if anas[ana]:
                #ents = set.union(*anas.values(), *ents)
                system = baseline(toks, ents, ana)
                #if anas[ana] == system:
                if anas[ana] & set(system) != set():
                    correct += 1
                else:
                    incorrect += 1
    print(correct, incorrect, correct/(correct+incorrect))
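Note: the evaluation loop above counts a prediction as correct whenever it overlaps the gold antecedent set, not only on an exact match. A toy check of that criterion (values invented):

gold = {(0, 2), (5, 5)}          # annotated antecedent spans for one anaphor
system = {(5, 5)}                # baseline prediction
print(bool(gold & set(system)))  # True, so this counts as correct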
source/remove_coordinated_syntactic_anaphora.py 0 → 100644 +77 −0

File added (diff preview collapsed; contents not shown).