Update candidate extraction files (3b9b2179) · Commits · Victor Zimmermann / Neural Resolution of Comparative Anaphora

comparatives/fetch.py

+51 −13

Original line number	Diff line number	Diff line
		@@ -4,24 +4,34 @@ Fetches all documents including comparative constructions.

		import os
		import sys
		import re

		PATH = "/resources/corpora/multilingual/ontonotes-5.0-conll-2012/conll-2012/v4/data/train/data/english/"
		TARGET_PATH = "/proj/zimmermann/comparatives/corpus/train/"
		COMPARATIVES = ["other", "similar", "comparable", "different", "additional", "extra"]

		def fetch():

		def fetch(PATH, TARGET_PATH, qual="gold"):
		jjr_count = 0
		rbr_count = 0
		jj_count = 0
		others_count = 0
		for root, dirs, files in os.walk(PATH):
		for f in files:
		if f.endswith(".v4_gold_conll"):
		if f.endswith(".v4_{}_conll".format(qual)):
		with open(os.path.join(root, f), "r") as document:
		current_word_id = 0
		contains_comparative = 0
		current_doc_text = ""
		current_doc_id = 0

		last_pos = ""
		last_word = ""
		stack = []
		np_with_jj = False
		in_comp = False
		buff="" #buffer saves annotation after * for next line

		for line in document:
		if line.strip():

		@@ -35,7 +45,7 @@ def fetch():
		if contains_comparative:
		target = "{}{}_{}_{}".format(TARGET_PATH,str(contains_comparative),f,current_doc_id)
		open(target, "w").write(current_doc_text)
		print("Writing to "+target+".")
		#print("Writing to "+target+".")
		contains_comparative = 0

		current_doc_text = ""
		@@ -43,22 +53,43 @@ def fetch():

		current_doc_text += line

		if cols[4] == "JJR":
		synt = cols[5]

		new_tags = re.findall("[A-Z]*", synt)
		open_brackets = synt.count("(")
		closed_brackets = synt.count(")")

		stack+=new_tags

		if "NP" in new_tags:
		np_start_word_id = current_word_id
		np_with_jj = []

		if cols[4] == "JJ" and "NP" in stack and cols[3] in COMPARATIVES:
		np_with_jj.append(cols[3])


		if cols[4] == "JJR" and "NP" in stack:
		contains_comparative += 1
		jjr_count += 1
		print("JJR")
		elif cols[4] == "JJ" and last_pos == "RBR":
		elif cols[4] == "JJ" and last_pos == "RBR" and "NP" in stack:
		contains_comparative += 1
		rbr_count += 1
		print("RBR-JJ")
		elif (cols[4] == "NN" or cols[4] == "NNS") and last_pos == "JJ" and last_word in COMPARATIVES:
		elif (cols[4] == "NN" or cols[4] == "NNS") and np_with_jj:
		contains_comparative += 1
		jj_count += 1
		print("JJ-NN")
		elif cols[3].lower() == "others" and "NP" in stack:
		contains_comparative += 1
		others_count += 1

		last_pos = cols[4]
		last_word = cols[3]

		if closed_brackets > 0:
		del stack[-1*closed_brackets:]

		current_word_id += 1

		else:
		current_doc_text += line

		@@ -66,9 +97,16 @@ def fetch():
		if contains_comparative:
		target = "{}{}_{}_{}".format(TARGET_PATH,str(contains_comparative),f,current_doc_id)
		open(target, "w").write(current_doc_text)
		print("Writing to "+target+".")

		print(jjr_count, rbr_count, jj_count)
		print(jjr_count, rbr_count, jj_count, others_count)

		if __name__ == "__main__":
		fetch()
		PATH = "/resources/corpora/multilingual/ontonotes-5.0-conll-2012/conll-2012/v4/data/train/data/english/"
		TARGET_PATH = "/proj/zimmermann/comparatives/corpus/train/"
		fetch(PATH, TARGET_PATH)
		PATH = "/resources/corpora/multilingual/ontonotes-5.0-conll-2012/conll-2012/v4/data/test/data/english/"
		TARGET_PATH = "/proj/zimmermann/comparatives/corpus/test/"
		fetch(PATH, TARGET_PATH, "auto")
		PATH = "/resources/corpora/multilingual/ontonotes-5.0-conll-2012/conll-2012/v4/data/development/data/english/"
		TARGET_PATH = "/proj/zimmermann/comparatives/corpus/dev/"
		fetch(PATH, TARGET_PATH)

comparatives/format.py

+114 −15

Original line number	Diff line number	Diff line
		@@ -4,32 +4,52 @@ Fetches all documents including comparative constructions.

		import os
		import sys
		import re

		PATH = "/proj/zimmermann/comparatives/corpus/"
		TARGET_PATH = "/proj/zimmermann/comparatives/annotation/OntoNotes/"
		COMPARATIVES = ["other", "similar", "comparable", "different", "additional", "extra"]

		def format():

		c = [0,0,0,0]

		for root, dirs, files in os.walk(PATH):
		for f in files:
		with open(os.path.join(root, f), "r") as document:
		current_word_id = 0
		markable_id = 0
		comp_id = -1
		set_id = 0

		jjr_count = 0
		rbr_count = 0
		jj_count = 0

		mmax = open(TARGET_PATH+f+".mmax",'w')

		mmax.write('<?xml version="1.0"?>\n<mmax_project>\n<sentences></sentences>\n<words>{}.xml</words>\n<gestures></gestures>\n<keyactions></keyactions>\n</mmax_project>'.format(f))

		mmax.close()

		xml = open(TARGET_PATH+"Basedata/"+f+".xml",'w')

		xml.write('<?xml version="1.0" encoding="US-ASCII"?>\n<!DOCTYPE words SYSTEM "words.dtd">\n<words>')

		comp_level = open(TARGET_PATH+"Markables/"+f+"_comparison_level.xml",'w')
		comp_level.write('<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE markables SYSTEM "markables.dtd">\n<markables xmlns="www.eml.org/NameSpaces/comparison">')

		coref_level = open(TARGET_PATH+"Markables/"+f+"_coref_level.xml",'w')
		coref_level.write('<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE markables SYSTEM "markables.dtd">\n<markables xmlns="www.eml.org/NameSpaces/coref">')



		last_pos = ""
		last_word = ""
		stack = list()
		coref_stack = dict()
		np_with_jj = False
		comp_word = None
		in_comp = 0
		buff="" #buffer saves annotation after * for next line

		for line in document:
		if line.strip():

		@@ -40,23 +60,102 @@ def format():

		xml.write('<word id="word_{}">{}</word>\n'.format(current_word_id,cols[3]))

		if cols[4] == "JJR":
		jjr_count += 1
		print("JJR")
		elif cols[4] == "JJ" and last_pos == "RBR":
		rbr_count += 1
		print("RBR-JJ")
		elif (cols[4] == "NN" or cols[4] == "NNS") and last_pos == "JJ" and last_word in COMPARATIVES:
		jj_count += 1
		print("JJ-NN")

		synt = cols[5]

		new_tags = re.findall("[A-Z]+", synt)
		open_brackets = synt.count("(")
		closed_brackets = synt.count(")")

		stack+=new_tags

		if "NP" in new_tags:
		np_start_word_id = current_word_id
		np_with_jj = False

		if closed_brackets and "NP" in stack[-1*closed_brackets:] and in_comp and np_start_word_id <= comp_id <= current_word_id:
		comp_level.write('<markable id="markable_{}" span="word_{}..word_{}" comparison_type="anaphora" comparison_class="set_{}" mmax_level="comparison"/>\n'.format(markable_id, np_start_word_id, current_word_id, set_id))
		in_comp = False
		markable_id += 1
		set_id += 1


		if cols[4] == "JJ" and cols[3] in COMPARATIVES:
		np_with_jj = True
		comp_id = current_word_id

		if cols[4] == "JJR" and "NP" in stack:
		set_id += 1
		comp_id = current_word_id
		in_comp = True
		c[0] += 1
		comp_level.write('<markable id="markable_{}" span="word_{}" comparative_type="anaphoric" comparison_type="comparative" comparison_class="set_{}" mmax_level="comparison" />\n'.format(markable_id, current_word_id, set_id))
		markable_id += 1
		elif cols[4] == "JJ" and last_pos == "RBR" and "NP" in stack:
		set_id += 1
		comp_id = current_word_id
		in_comp = True
		c[1] += 1
		comp_level.write('<markable id="markable_{}" span="word_{}..word_{}" comparative_type="anaphoric" comparison_type="comparative" comparison_class="set_{}" mmax_level="comparison" />\n'.format(markable_id, current_word_id-1, current_word_id ,set_id))
		markable_id += 1
		elif (cols[4] == "NN" or cols[4] == "NNS") and np_with_jj:
		set_id += 1
		in_comp = True
		c[2] += 1
		comp_level.write('<markable id="markable_{}" span="word_{}" comparative_type="anaphoric" comparison_type="comparative" comparison_class="set_{}" mmax_level="comparison" />\n'.format(markable_id, comp_id, set_id))
		markable_id += 1
		np_with_jj = False
		elif cols[3].lower() == "others":
		set_id += 1
		comp_id = current_word_id
		c[3] += 1
		comp_level.write('<markable id="markable_{}" span="word_{}" comparative_type="anaphoric" comparison_type="comparative" comparison_class="set_{}" mmax_level="comparison" />\n'.format(markable_id, current_word_id, set_id))
		comp_level.write('<markable id="markable_{}" span="word_{}" comparison_type="anaphora" comparison_class="set_{}" mmax_level="comparison"/>\n'.format(markable_id, current_word_id, set_id))
		markable_id += 1
		set_id += 1

		last_pos = cols[4]
		last_word = cols[3]

		if closed_brackets > 0:
		del stack[-1*closed_brackets:]


		coref_anno = cols[-1]

		for coref in coref_anno.split("\|"):
		start = False
		end = False
		buffr = ""
		for i, char in enumerate(coref):
		if char == "(":
		start = True
		elif char == ")":
		end = True
		elif char in "0123456789":
		buffr += char
		if start and end:
		coref_level.write('<markable id="markable_{}" span="word_{}" agreement="none" np_form="none" coref_class="set_{}" mmax_level="coref" semantic_class="none" type="none" grammatical_role="none" />'.format(markable_id, current_word_id, buffr))
		markable_id += 1
		elif start:
		if buffr not in coref_stack:
		coref_stack[buffr] = [current_word_id]
		else:
		coref_stack[buffr].append(current_word_id)
		elif end:
		coref_level.write('<markable id="markable_{}" span="word_{}..word_{}" agreement="none" np_form="none" coref_class="set_{}" mmax_level="coref" semantic_class="none" type="none" grammatical_role="none" />'.format(markable_id, coref_stack[buffr][-1], current_word_id, buffr))
		markable_id += 1

		current_word_id += 1

		xml.write("</words>")
		xml.close()
		comp_level.write("</markables>")
		comp_level.close()
		coref_level.write("</markables>")
		coref_level.close()

		print(jjr_count, rbr_count, jj_count)
		print(c)

		if __name__ == "__main__":
		format()