Commit 8c25f2d7 authored by axtimhaus

Major update to annotation and preprocessing.

parent e1613144
+2 −3
@@ -18,7 +18,6 @@ except:
    model = kv.load_word2vec_format(fname, binary=False)
print("Done.         ")


def next_mention(mentions, anaphora):
    sims = [1/(anaphora.span[0] - mention.span[1]) if mention.span[1] < anaphora.span[0] else 0 for mention in mentions]
    return mentions[np.argmax(sims)]
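
For reference, next_mention scores every candidate by the inverse token distance between its end and the anaphor's start, so the nearest preceding mention wins. A minimal sketch run alongside the function above, with a hypothetical Mention stub (the real class lives in model.py; only .span matters here):

from collections import namedtuple

Mention = namedtuple("Mention", ["tokens", "span"])  # hypothetical stand-in

mentions = [Mention(["the", "old", "car"], [2, 4]),
            Mention(["a", "bike"], [7, 8])]
anaphor = Mention(["a", "similar", "vehicle"], [12, 14])

# Inverse distances: [1/(12-4), 1/(12-8)] = [0.125, 0.25] -> "a bike" wins.
print(next_mention(mentions, anaphor).tokens)  # ['a', 'bike']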
@@ -57,9 +56,9 @@ if __name__ == "__main__":
    for f in files[:]:
        
        print("Fetching test data...")
        mentions = extraction.get_mentions(root+f)
        mentions = extraction.from_isnotes(root+f)
        
        comparatives = [m for m in mentions if m.comp_from]
        comparatives = [m for m in mentions if m.comp_from and m.comp_from != "outsidetext"]
        
        entities = []
        entity_dict = {}
+1 −0
EMBEDDINGS_PATH = "/softpro/ss18/kernseife/kernseife/data/en/embeddings/glove.twitter.27B.100d.txt"
DATA_PATH = "/proj/zimmermann/isnotes/ISClean/"
CONLL_PATH = "/resources/corpora/multilingual/ontonotes-5.0-conll-2012/conll-2012/v4/data/train/data/english/"
CORPUS_PATH = "/home/students/zimmermann/Projects/ncr/corpus/"

COMPARATIVES = ["other", "similar", "comparable", "different", "additional", "extra"]
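
COMPARATIVES holds the lexical markers that signal comparative anaphora ("other", "similar", ...). A minimal sketch of marker-based spotting; the has_marker helper is hypothetical and only illustrates how the list is meant to be consumed:

import config

def has_marker(tokens):
    # True if any token is a comparative marker such as "other" or "similar"
    return any(tok.lower() in config.COMPARATIVES for tok in tokens)

print(has_marker(["a", "similar", "deal"]))  # True
print(has_marker(["the", "deal"]))           # False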
+350 −13
@@ -3,11 +3,14 @@ import os
import config
import re
import spacy
import en_core_web_sm
import numpy as np

from model import Mention, Entity
from tqdm import tqdm

nlp = spacy.load("en_core_web_lg")
nlp = en_core_web_sm.load()
#nlp = spacy.load("en")

def from_isnotes(path):
    
@@ -26,6 +29,17 @@ def from_isnotes(path):
        entities = BeautifulSoup(open(root+"/markables/"+file_root+"_entity_level.xml", "r").read(), features="lxml")
        coref = BeautifulSoup(open(root+"/markables/"+file_root+"_coref_level.xml", "r").read(), features="lxml")
        
        spans = dict()
        word_dict = dict()
        coref_dict = {None:[]}
        
        corpus = open(config.CORPUS_PATH+file_root+".ncr","w")
        
        for i, word in enumerate(basedata.contents[2].contents[0].contents[0].find_all()):
            corpus.write(str(i)+"\t"+word.contents[0]+"\n")
        
        corpus.write("\n")
        
        for entity in entities.contents[2].contents[0].contents[0].find_all():
            
            span = entity["span"]
@@ -45,6 +59,8 @@ def from_isnotes(path):
            
            try:
                coref_set = coref.find("markable", {"span": span})["coref_set"]
                if coref_set not in coref_dict:
                    coref_dict[coref_set] = list()
            except:
                coref_set = None
            
@@ -57,6 +73,12 @@ def from_isnotes(path):
                
                mentions.append(Mention( tokens, [span_ids[0],span_ids[1]], mention_id, coref_set, comp_from))
                
                spans[mention_id] = [span_ids[0]-1,span_ids[1]-1]
                if coref_set in coref_dict:
                    coref_dict[coref_set].append([span_ids[0]-1,span_ids[1]-1])
                else:
                    coref_dict[None].append([span_ids[0]-1,span_ids[1]-1])
                
            else:
                
                token = basedata.find("word", {"id": span}).contents[0]
@@ -65,6 +87,35 @@ def from_isnotes(path):
                word_id = int(span.split("_")[-1])
                mentions.append(Mention(tokens, [word_id, word_id], mention_id, coref_set, comp_from))
                
                spans[mention_id] = [word_id-1, word_id-1]
                if coref_set in coref_dict:
                    coref_dict[coref_set].append([word_id-1, word_id-1])
                else:
                    coref_dict[None].append([word_id-1, word_id-1])
            
        for m in mentions:
            if m.comp_from:
                if m.comp_from == "outsidetext":
                    corpline = spans[m.mention_id]
                    corpus.write("\t".join([str(c) for c in corpline])+"\n")
                else:
                    if m.comp_from.startswith("markable"):
                        corpline = spans[m.mention_id]
                        cfroms = m.comp_from.split(";")
                        for cfrom in cfroms:
                            corpline += spans[cfrom]
                        corpus.write("\t".join([str(c) for c in corpline])+"\n")
        
        corpus.write("\n")
        
        
        for c in coref_dict.items():
            if c[0] != None:
                corpus.write("\t".join([str(x) for v in c[1] for x in v])+"\n")
        for c in coref_dict[None]:
            corpus.write("\t".join([str(x) for x in c])+"\n")
        
    
    return mentions

def from_conll(path):
@@ -90,8 +141,9 @@ def from_conll(path):
            sentence_id = 0
            #doc = nlp(" ".join(text)) 
            doc = nlp.tokenizer.tokens_from_list(text)
            nlp.tagger(doc)
            nlp.parser(doc)
            for name, proc in nlp.pipeline:
                doc=proc(doc)
            #doc = spacy.tokens.doc.Doc(nlp.vocab, words=text, spaces=[True]*len(text))
            
            for chunk in doc.noun_chunks:
                tokens = chunk.text.split(" ")
@@ -284,6 +336,273 @@ def to_mmax(source_path, target_directory_path):
        print("[NCR] \033[1m{}\033[96m does not contain any comparatives.\033[0m".format(base_name))


def mark_comparatives(source_path, target_directory_path):
    """ Source must be in conll format
    
    """
    
    name = os.path.splitext(os.path.basename(source_path))
    base_name = name[0]
    
    corpus = open(config.CORPUS_PATH+base_name+".txt","w")
    all_tokens = list()
    
    print("\n[NCR] Extracting entities from \033[1m{}\033[0m.".format("".join(name)))
    
    mentions = from_conll(source_path)
    
    comp_from_list = [m.comp_from for m in mentions]
    
    len_mentions = sum(bool(c) for c in comp_from_list)
        
    new_name = "{:0>2}_{}".format(len_mentions, base_name)
    
    if any(comp_from_list):
        
        mmax = open(os.path.join(target_directory_path,  new_name+".mmax"), "w")
        mmax.write("""<?xml version="1.0"?>
<mmax_project>
<!--<sentences>002_htc_text.xml</sentences>-->
<turns></turns>
<words>{}</words>
<gestures></gestures>
<keyactions></keyactions>
</mmax_project>
""".format(new_name+"_words.xml"))
        mmax.close()
        
        target_path = os.path.join(target_directory_path, "Basedata", new_name+"_words.xml")
        
        source = open(source_path, 'r')
        target = open(target_path, 'w')
        
        word_id = 0
        
        
        target.write(r"""<?xml version='1.0' encoding='UTF-8'?>
<!DOCTYPE words SYSTEM "words.dtd">
<words>""")
        
        for line in source:
            if line.startswith("#begin document"):
                pass
            elif line.startswith("#end document"):
                pass
            elif line == "\n":
                corpus.write(" ".join(all_tokens)+"\n")
            else:
                naked_line = line.strip()
                split_line = naked_line.split()
                
                token = split_line[3]
                all_tokens.append(token)  # collect the raw token for the plain-text corpus
                if token == "&":
                    token = r"&amp;"
                token = re.sub("<", "&lt;", token)
                token = re.sub(">", "&gt;", token)
                
                target.write(r'<word id="word_'+str(word_id)+r'">'+token+r'</word>'+'\n')
                
                word_id += 1
        
        target.write(r"</words>")
        
        print("[NCR] Writing \033[92m{:>4}\033[0m token(s) to \033[1m{}_words.xml\033[0m.".format(word_id, new_name))
        
        markable_id = 0

        entity_level_path = os.path.join(target_directory_path, "Markables", new_name+"_entity_level.xml")
        entity_level = open(entity_level_path, 'w')
        entity_level.write(r"""<?xml version='1.0'?>
<!DOCTYPE markables SYSTEM "markables.dtd">
<markables xmlns="www.comp.leeds.ac.uk/markert/entity">""")
        
        coref_chains = set()
        
        for mention in mentions:
            if mention.comp_from:
                entity_level.write(r'<markable id="markable_'+str(markable_id)+r'" information_status="mediated" mediated_type="comparative" comparative_type="withintext" span="word_'+str(mention.span[0])+r'..word_'+str(mention.span[1])+r'" mmax_level="entity" />'+'\n')
                markable_id += 1
            else:
                pass
        
        entity_level.write(r'</markables>')
        
        entity_level.close()
    
        print("[NCR] Writing \033[92m{:>4}\033[0m mention(s) to \033[1m{}_entity_level.xml\033[0m.".format(len_mentions, new_name))
        
    else:
        print("[NCR] \033[1m{}\033[96m does not contain any comparatives.\033[0m".format(base_name))
        

def merge_annotation(entity_file, annotation_file):
    
    path = entity_file
    
    split_path = path.split(r"/")
    root, file_name = "/".join(split_path[:-1]), split_path[-1]
    
    processed = []
    for path, subdirs, files in os.walk(config.CORPUS_PATH):
        for name in files:
            processed.append(name)
        
    mentions = []
    file_root = file_name[:-5]
    new_root = "_".join(file_name[:-5].split("_")[1:])
    
    
    if file_name.endswith(".mmax") and new_root+".ncr" not in processed:
        basedata = BeautifulSoup(open(root+"/Basedata/"+file_root+"_words.xml", "r").read(), features="lxml")
        entities = BeautifulSoup(open(root+"/Markables/"+file_root+"_entity_level.xml", "r").read(), features="lxml")
        coref = BeautifulSoup(open(root+"/Markables/"+file_root+"_coref_level.xml", "r").read(), features="lxml")
        
        spans = dict()
        word_dict = dict()
        coref_dict = {None:[]}
        span_set = set()
        match_count = 0
        total_count = 0
        
        bd = basedata.contents[2].contents[0].contents[0].find_all()
        
        for entity in tqdm(entities.contents[2].contents[0].contents[0].find_all()):
            
            span = entity["span"]
            tokens = []
            mention_id = entity["id"]
            
            # comparative: either "outsidetext" or antecedent id
            comp_from = None
            
            try:
                coref_set = coref.find("markable", {"span": span})["coref_set"]
                if coref_set not in coref_dict:
                    coref_dict[coref_set] = list()
            except:
                coref_set = None
            
            if ".." in span:
                    
                split_span = span.split("..")
                span_ids = [int(re.sub("word_", "", word)) for word in [split_span[0], split_span[-1]]]
                for i in range(span_ids[0],span_ids[1]+1):
                    token = basedata.find("word", {"id": "word_"+str(i)}).contents[0]
                    tokens.append(token)
                
                mentions.append(Mention( tokens, [span_ids[0],span_ids[1]], mention_id, coref_set, comp_from))
                
                spans[mention_id] = [span_ids[0]-1,span_ids[1]-1]
                span_set.add((span_ids[0]-1,span_ids[1]-1))
                if coref_set in coref_dict:
                    coref_dict[coref_set].append([span_ids[0]-1,span_ids[1]-1])
                else:
                    coref_dict[None].append([span_ids[0]-1,span_ids[1]-1])
                
            else:
                
                token = basedata.find("word", {"id": span}).contents[0]
                tokens.append(token)
                
                word_id = int(span.split("_")[-1])
                mentions.append(Mention(tokens, [word_id, word_id], mention_id, coref_set, comp_from))
                
                spans[mention_id] = [word_id-1, word_id-1]
                span_set.add((word_id-1, word_id-1))
                if coref_set in coref_dict:
                    coref_dict[coref_set].append([word_id-1, word_id-1])
                else:
                    coref_dict[None].append([word_id-1, word_id-1])

        path = annotation_file
        
        split_path = path.split(r"/")
        root, file_name = "/".join(split_path[:-1]), split_path[-1]
        file_root = file_name[:-5]
        new_root = "_".join(file_name[:-5].split("_")[1:])
        new_spans = dict()
        
        entities = BeautifulSoup(open(root+"/Markables/"+file_root+"_entity_level.xml", "r").read(), features="lxml")
        word_dict = dict()
            
        mention_ids = -1
        for entity in entities.contents[2].contents[0].contents[0].find_all():
                
            span = entity["span"]
            tokens = []
            mention_id = entity["id"]
                
            comp_from = None
            if entity["information_status"] == "mediated":
                try:
                    if entity["mediated_type"] == "comparative":
                        if entity["comparative_type"] == "withintext" and entity["comp_from"] != None:
                            comp_from = entity["comp_from"]
                        elif entity["comparative_type"] == "outsidetext":
                            comp_from = "outsidetext"
                except:
                    pass
                
                
            if ".." in span:
                    
                split_span = span.split("..")
                span_ids = [int(re.sub("word_", "", word)) for word in [split_span[0], split_span[-1]]]
                    
                mentions.append(Mention(None, [span_ids[0],span_ids[1]], mention_ids, None, comp_from))
                    
                new_spans[mention_id] = [span_ids[0]-1,span_ids[1]-1]
                spans[mention_ids] = [span_ids[0]-1,span_ids[1]-1]
                test_span = [span_ids[0]-1,span_ids[1]-1]
                mention_ids -= 1
                    
            else:
                    
                word_id = int(span.split("_")[-1])
                mentions.append(Mention(None, [word_id, word_id], mention_ids, None, comp_from))
                    
                new_spans[mention_id] = [word_id-1, word_id-1]
                spans[mention_ids] = [word_id-1, word_id-1]
                test_span = [word_id-1, word_id-1]
                mention_ids -= 1
            
            if not comp_from:
                total_count += 1
                if (test_span[0],test_span[1]) in span_set:
                    match_count += 1
        
        comp_froms = [m.comp_from for m in mentions]
        if any(comp_froms):
            corpus = open(config.CORPUS_PATH+new_root+".ncr","w")
            for i, word in enumerate(bd):
                corpus.write(str(i)+"\t"+word.contents[0]+"\n")
            corpus.write("\n")
            
            for m in mentions:
                if m.comp_from:
                    if m.comp_from == "outsidetext":
                        corpline = spans[m.mention_id]
                        corpus.write("\t".join([str(c) for c in corpline])+"\n")
                    else:
                        if m.comp_from.startswith("markable"):
                            corpline = spans[m.mention_id]
                            cfroms = m.comp_from.split(";")
                            for cfrom in cfroms:
                                corpline += new_spans[cfrom]
                            corpus.write("\t".join([str(c) for c in corpline])+"\n")
            
            corpus.write("\n")
                
            for c in coref_dict.items():
                if c[0] != None:
                    corpus.write("\t".join([str(x) for v in c[1] for x in v])+"\n")
            for c in coref_dict[None]:
                corpus.write("\t".join([str(x) for x in c])+"\n")
        
            return match_count, total_count
    return 0, 0
    
    
if __name__ == "__main__":
    # ISnotes
    #root = config.DATA_PATH
@@ -295,14 +614,32 @@ if __name__ == "__main__":
    # conll/OntoNotes
    root = config.CONLL_PATH
    
    files = []
    for x in os.walk(root):
        for y in x[2]:
            #print(y)
            if y.endswith(".v4_gold_conll"):
                files.append(x[0]+"/"+y)
    #files = []
    #for x in os.walk(root):
        #for y in x[2]:
            ##print(y)
            #if y.endswith(".v4_gold_conll"):
                #files.append(x[0]+"/"+y)
    
    #for f in files:
        #to_mmax(f, "/home/students/zimmermann/Projects/ncr/annotation/Entity/")
    
    other_files = os.listdir("/home/students/zimmermann/Projects/ncr/annotation/Entity/")
    
    root = "/home/students/zimmermann/Projects/ncr/annotation/Freestyle/"
    
    files = os.listdir(root)
    
    mc, tc = 0,0 
    
    for f in files:
        to_mmax(f, "/home/students/zimmermann/Projects/ncr/annotation/Entity/")
    for f in sorted(files)[::-1]:
        if f.endswith(".mmax"):
            split_path = f.split(r"/")
            _, file_name = "/".join(split_path[:-1]), split_path[-1]
            print(f)
            m,t = merge_annotation("/home/students/zimmermann/Projects/ncr/annotation/Entity/"+[s for s in other_files if s.endswith("_".join(f.split("_")[1:]))][0], root+f)
            mc += m
            tc += t
            
            print(mc, tc, 0 if tc==0 else mc/tc)
        
+184 −0
import os
import argparse
import config
import spacy
import en_core_web_sm
import gensim
import numpy as np

from gensim.models import word2vec
from gensim.models import KeyedVectors as kv
    
nlp = en_core_web_sm.load()

fname = config.EMBEDDINGS_PATH
print("Loading model.", end="\r")
try:
    model = kv.load(fname)
except: 
    model = kv.load_word2vec_format(fname, binary=False)
print("Done.         ")

def closest_preceding_mention(toks, ents, ana):
    best_ment = (0,0)
    for ent in ents:
        for ment in ent:
            if ana[0] - ment[1] <  ana[0] - best_ment[1] and ment[1] < ana[0]:
                best_ment = ment
    return {best_ment}

def head_match(toks, ents, ana, window=20):
    best_ment = (0,0)
    
    ana_toks = toks[ana[0]:ana[1]+1]
    ana_toks = ["unlike" if tok=="like" else tok for tok in ana_toks]
    
    doc = nlp.tokenizer.tokens_from_list(ana_toks)
    for name, proc in nlp.pipeline:
        doc=proc(doc)
        
    ana_head = [token for token in doc if token.head == token][0].lemma_
    
    for ent in ents:
        for ment in ent:
            ment_toks = toks[ment[0]:ment[1]+1]
            ment_toks = ["unlike" if tok=="like" else tok for tok in ment_toks]
            
            doc = nlp.tokenizer.tokens_from_list(ment_toks)
            for name, proc in nlp.pipeline:
                doc=proc(doc)
                
            try:
                ment_head = [token for token in doc if token.head == token][0].lemma_
            except IndexError:
                continue
            
            if ana_head == ment_head:
                if ana[0] - ment[1] <  ana[0] - best_ment[1] and ment[1] < ana[0] and ana[0] - ment[1] < window:
                    best_ment = ment
    
    if best_ment == (0,0):
        return closest_preceding_mention(toks, ents, ana)
    else:
        return {best_ment}
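
head_match compares mentions by the lemma of their syntactic root, found with the token.head == token test (only the parse root is its own head). A standalone sketch, assuming the small English model is installed:

import spacy

nlp = spacy.load("en_core_web_sm")

doc = nlp("a similar transaction")
root = [tok for tok in doc if tok.head == tok][0]
print(root.lemma_)  # "transaction" -- the head that gets matched across mentions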
                
def highest_embedding_similarity(toks, ents, ana):
    best_ment = (0,0)
    best_sim = 0
    
    ana_toks = toks[ana[0]:ana[1]+1]
    ana_toks = ["unlike" if tok=="like" else tok for tok in ana_toks]
    
    doc = nlp.tokenizer.tokens_from_list(ana_toks)
    for name, proc in nlp.pipeline:
        doc=proc(doc)
        
    ana_head = [token for token in doc if token.head == token][0].text
    
    for ent in ents:
        for ment in ent:
            ment_toks = toks[ment[0]:ment[1]+1]
            ment_toks = ["unlike" if tok=="like" else tok for tok in ment_toks]
            
            doc = nlp.tokenizer.tokens_from_list(ment_toks)
            for name, proc in nlp.pipeline:
                doc=proc(doc)
                
            try:
                ment_head = [token for token in doc if token.head == token][0].text
            except IndexError:
                continue
            
            # wmdistance expects token lists; a bare string would be iterated character by character
            sim = 1/(model.wmdistance([ment_head], [ana_head])+1)
            
            if best_sim <= sim and ment[1] < ana[0]:
                print(ment_head, sim, ana_head)
                best_ment = ment
                best_sim = sim
    
    if best_ment == (0,0):
        return closest_preceding_mention(toks, ents, ana)
    else:
        return {best_ment}
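
The Word Mover's Distance d is mapped to a similarity with 1/(d+1), so identical heads (d = 0) score 1.0 and the score decays toward 0 as the heads drift apart. A minimal sketch, assuming model holds the KeyedVectors loaded at the top of the file:

d = model.wmdistance(["car"], ["vehicle"])  # documents are lists of tokens
sim = 1 / (d + 1)                           # 1.0 for identical heads, -> 0 for distant ones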
    

def most_salient_entity(toks, ents, ana):
    best_ent = {(0,0)}
    max_ent = 0
    for ent in ents:
        if len(ent) >= max_ent and any([ment[1] < ana[0] for ment in ent]):
            max_ent = len(ent)
            best_ent = ent
    return best_ent
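
most_salient_entity simply prefers the longest coreference chain that has at least one mention preceding the anaphor. A quick call in the same module, with toy (start, end) spans:

ents = [[(0, 1)], [(3, 4), (8, 9), (15, 16)]]
print(most_salient_entity(None, ents, (20, 21)))  # the three-mention chain wins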

def largest_entity_span(toks, ents, ana):
    best_ent = {(0,0)}
    max_span = 0
    for ent in ents:
        if ent:
            start = min([x for x,y in ent])
            end = max([y for x,y in ent])
            span = end-start
            if span >= max_span and any([ment[1] < ana[0] for ment in ent]):
                max_span = span
                best_ent = ent
                
    if best_ent == {(0,0)}:
        return closest_preceding_mention(toks, ents, ana)
    else:
        return best_ent  # the whole winning entity (a collection of spans)

def import_doc(path):
    doc = open(path, 'r').read()
    
    tokens, comps, coref = doc.split("\n\n")
    tokens = [line.strip().split()[1] for line in tokens.split("\n")]
    anaphora = dict()
    for line in comps.split("\n"):
        line = [int(l) for l in line.strip().split()]
        if len(line) > 2:
            anaphora[(line[0],line[1])] = {(line[i],line[i+1]) for i in range(2,len(line),2)}
        else:
            anaphora[(line[0],line[1])] = set()
    entities = list()
    for line in coref.split("\n"):
        line = [int(l) for l in line.strip().split()]
        entities.append([(line[i],line[i+1]) for i in range(0,len(line),2)])
        
    return tokens, anaphora, entities
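
import_doc expects the three blank-line-separated blocks that from_isnotes and merge_annotation write: numbered tokens, one line per comparative anaphor (its span followed by any antecedent spans), and one line per coreference chain. A minimal round trip run in the same module (the /tmp path is illustrative):

sample = ("0\tThe\n1\tcar\n2\tand\n3\tanother\n4\tvehicle"
          "\n\n"
          "3\t4\t0\t1"   # anaphor (3,4) with antecedent (0,1)
          "\n\n"
          "0\t1")        # a single coreference chain
with open("/tmp/sample.ncr", "w") as f:
    f.write(sample)

toks, anas, ents = import_doc("/tmp/sample.ncr")
print(toks)  # ['The', 'car', 'and', 'another', 'vehicle']
print(anas)  # {(3, 4): {(0, 1)}}
print(ents)  # [[(0, 1)]]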


    

if __name__ == "__main__":
    
    root = "/home/students/zimmermann/Projects/ncr/corpus/"
    files = os.listdir(root)
    
    baseline = head_match
    
    correct = 0
    incorrect = 0
    
    for f in files:
        path = root+f
        print(path)
        toks, anas, ents = import_doc(path)
        
        for ana in anas.keys():
            if anas[ana]:
                #ents = set.union(*anas.values(), *ents)
                system = baseline(toks, ents, ana)
                #if anas[ana] == system:
                if anas[ana] & set(system) != set():
                    correct += 1
                else:
                    incorrect += 1
            
        print(correct, incorrect, correct/(correct+incorrect))
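
Scoring is lenient: a prediction counts as correct as soon as it shares one antecedent span with the gold set. An illustration of that criterion:

gold = {(0, 1), (5, 6)}
system = {(5, 6)}
print(bool(gold & system))  # True -> correct, although (0, 1) was never proposed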
        
    
    
        
        
        
+77 −0

File added.
