Update extraction algorithm (0dc96118) · Commits · Victor Zimmermann / Neural Resolution of Comparative Anaphora

source/extraction.py

+125 −19

Original line number	Diff line number	Diff line
		@@ -16,7 +16,7 @@ def from_isnotes(path):

		if file_name.endswith(".mmax"):

		print(file_name)
		#print(file_name)

		file_root = file_name[:-5]

		@@ -81,15 +81,14 @@ def from_conll(path):
		comp_count = 0
		ment_count = 0

		for line in conll:
		if line.startswith("#begin document"):
		stack = list()
		coref_stack = dict()

		for line in conll:
		if line.startswith("#begin document"):
		pass
		elif line.startswith("#end document"):
		if comp_count > 0:
		for m in mentions:
		print(m)
		return (ment_count, comp_count, mentions)
		pass
		elif line == "\n":
		stack = list()
		else:
		@@ -130,6 +129,7 @@ def from_conll(path):
		rules = list()

		rules.append(start[3]=="JJR")

		rules.append(text[start[0]].lower() == "more"
		and len(poss) > start[0]+1
		and poss[start[0]+1] == "JJ")
		@@ -159,7 +159,121 @@ def from_conll(path):

		word_id += 1

		return (ment_count, comp_count, mentions)
		return mentions


		def to_mmax(source_path, target_directory_path):
		    """Convert a CoNLL file into an MMAX2 project with entity and coref levels.

		    The mentions come from ``from_conll(source_path)``. If at least one of
		    them has a truthy ``comp_from``, these files are written below
		    ``target_directory_path`` (the ``Basedata`` and ``Markables``
		    sub-directories are not created here and must already exist):

		    * ``<new_name>.mmax`` -- the project file,
		    * ``Basedata/<new_name>_words.xml`` -- one ``<word>`` per token line,
		    * ``Markables/<new_name>_entity_level.xml`` -- one markable per mention,
		    * ``Markables/<new_name>_coref_level.xml`` -- one markable per mention
		      that has a truthy ``coref_set``.

		    ``<new_name>`` is the source base name prefixed with the zero-padded
		    count of mentions whose ``comp_from`` is truthy. If there is no such
		    mention, nothing is written and a notice is printed instead.

		    Args:
		        source_path: Path to the source file; must be in conll format.
		        target_directory_path: Root directory of the MMAX2 project.

		    Returns:
		        None.
		    """
		    # Function-scope import: keeps this block independent of the file header.
		    from xml.sax.saxutils import escape

		    name = os.path.splitext(os.path.basename(source_path))
		    base_name = name[0]

		    print("\n[NCR] Extracting entities from \033[1m{}\033[0m.".format("".join(name)))

		    mentions = from_conll(source_path)

		    # Number of mentions flagged as comparative (truthy ``comp_from``).
		    len_mentions = sum(bool(m.comp_from) for m in mentions)

		    if not len_mentions:
		        print("[NCR] \033[1m{}\033[96m does not contain any comparatives.\033[0m".format(base_name))
		        return

		    new_name = "{:0>2}_{}".format(len_mentions, base_name)

		    # --- project file -----------------------------------------------------
		    project_xml = (
		        '<?xml version="1.0"?>\n'
		        '<mmax_project>\n'
		        '<!--<sentences>002_htc_text.xml</sentences>-->\n'
		        '<turns></turns>\n'
		        '<words>{}</words>\n'
		        '<gestures></gestures>\n'
		        '<keyactions></keyactions>\n'
		        '</mmax_project>\n'
		    ).format(new_name + "_words.xml")
		    with open(os.path.join(target_directory_path, new_name + ".mmax"), "w") as mmax:
		        mmax.write(project_xml)

		    # --- base data (words) ------------------------------------------------
		    target_path = os.path.join(target_directory_path, "Basedata", new_name + "_words.xml")
		    word_id = 0

		    # NOTE(review): both files use the platform default encoding although the
		    # header declares UTF-8 -- confirm and pass ``encoding`` explicitly.
		    with open(source_path, "r") as source, open(target_path, "w") as target:
		        target.write(
		            "<?xml version='1.0' encoding='UTF-8'?>\n"
		            '<!DOCTYPE words SYSTEM "words.dtd">\n'
		            "<words>"
		        )

		        for line in source:
		            # Document delimiters and sentence breaks carry no token.
		            if line.startswith(("#begin document", "#end document")) or line == "\n":
		                continue

		            # The token is the fourth whitespace-separated column. Escape
		            # every '&', '<' and '>' so the output stays well-formed XML
		            # (e.g. for tokens such as "AT&T").
		            token = escape(line.split()[3])

		            target.write('<word id="word_{}">{}</word>\n'.format(word_id, token))
		            word_id += 1

		        target.write("</words>")

		    print("[NCR] Writing \033[92m{:>4}\033[0m token(s) to \033[1m{}_words.xml\033[0m.".format(word_id, new_name))

		    # --- markables (entity + coref level) ---------------------------------
		    # Join path components properly so a missing trailing separator on
		    # ``target_directory_path`` does not produce ".../EntityMarkables".
		    markables_dir = os.path.join(target_directory_path, "Markables")
		    entity_level_path = os.path.join(markables_dir, new_name + "_entity_level.xml")
		    coref_level_path = os.path.join(markables_dir, new_name + "_coref_level.xml")

		    # The id counter is shared by both levels, so ids are unique project-wide.
		    markable_id = 0
		    coref_chains = set()

		    with open(entity_level_path, "w") as entity_level, open(coref_level_path, "w") as coref_level:
		        entity_level.write(
		            "<?xml version='1.0'?>\n"
		            '<!DOCTYPE markables SYSTEM "markables.dtd">\n'
		            '<markables xmlns="www.comp.leeds.ac.uk/markert/entity">'
		        )
		        coref_level.write(
		            "<?xml version='1.0'?>\n"
		            '<!DOCTYPE markables SYSTEM "markables.dtd">\n'
		            '<markables xmlns="www.eml.org/NameSpaces/coref">'
		        )

		        for mention in mentions:
		            span = "word_{}..word_{}".format(mention.span[0], mention.span[1])

		            # Comparative mentions get the extra information-status attributes.
		            if mention.comp_from:
		                attributes = ('information_status="mediated" '
		                              'mediated_type="comparative" '
		                              'comparative_type="withintext" ')
		            else:
		                attributes = ""
		            entity_level.write(
		                '<markable id="markable_{}" {}span="{}" mmax_level="entity" />\n'.format(
		                    markable_id, attributes, span))
		            markable_id += 1

		            # NOTE(review): a falsy ``coref_set`` (e.g. integer 0) is skipped
		            # here -- confirm that set ids can never be 0.
		            if mention.coref_set:
		                coref_chains.add(mention.coref_set)
		                coref_level.write(
		                    '<markable id="markable_{}" span="{}" coref_set="set_{}" mmax_level="coref" />\n'.format(
		                        markable_id, span, mention.coref_set))
		                markable_id += 1

		        entity_level.write('</markables>')
		        coref_level.write('</markables>')

		    print("[NCR] Writing \033[92m{:>4}\033[0m mention(s) to \033[1m{}_entity_level.xml\033[0m.".format(len_mentions, new_name))
		    print("[NCR] Writing \033[92m{:>4}\033[0m coreference chain(s) to \033[1m{}_coref_level.xml\033[0m.".format(len(coref_chains), new_name))


		if __name__ == "__main__":
		@@ -176,19 +290,11 @@ if __name__ == "__main__":
		files = []
		for x in os.walk(root):
		for y in x[2]:
		print(y)
		#print(y)
		if y.endswith(".v4_gold_conll"):
		files.append(x[0]+"/"+y)

		mcount = 0
		ccount = 0
		dist = np.zeros(26)
		for f in files:
		plus_m, plus_c, mentions = from_conll(f)
		if plus_c > 0:
		dist[plus_c] += 1
		mcount += plus_m
		ccount += plus_c
		print(mcount, ccount)
		print(dist[1:])
		to_mmax(f, "/home/students/zimmermann/Projects/ncr/annotation/Entity/")

source/model.py

+1 −0

Original line number	Diff line number	Diff line
		@@ -20,3 +20,4 @@ class Mention:
		return '\033[92m'+plain_repr+'\033[0m'
		else:
		return plain_repr