Commit e1613144 authored by axtimhaus's avatar axtimhaus
Browse files

Update extraction.py with noun chunking for entity extraction

parent 1b3336a7
Loading
Loading
Loading
Loading
+76 −68
Original line number Diff line number Diff line
@@ -2,10 +2,12 @@ from bs4 import BeautifulSoup
import os
import config
import re
import spacy
import numpy as np

from model import Mention, Entity

nlp = spacy.load("en_core_web_lg")

def from_isnotes(path):
    
@@ -66,31 +68,84 @@ def from_isnotes(path):
    return mentions

def from_conll(path):
    """Extract Mention objects from a CoNLL-formatted (tab-separated) file.

    Tokens are accumulated sentence by sentence; on each sentence boundary
    (blank line) the gold tokenization is handed to spaCy for tagging and
    parsing, and every noun chunk becomes a Mention.  Coreference spans are
    read from the last CoNLL column ("(id" opens a span, "id)" closes it)
    and matched against chunk spans by document-level token offsets.

    Parameters
    ----------
    path : str
        Path to the CoNLL source file (tsv format).

    Returns
    -------
    list
        Mention objects annotated with a coreference-set id (or None) and a
        comparative-anaphora flag.

    Raises
    ------
    Exception
        If a coreference span is closed that was never opened.
    """
    mentions = []      # return list of Mention objects
    word_id = 0        # document-level offset of the current sentence start
    mention_id = 0
    sentence_id = 0    # token index within the current sentence
    coref_spans = {}   # (start, end) -> coref set id; reset per document
    coref_stack = {}   # open coref parens: set id -> start offset
    text = []          # word forms of the current sentence
    tags = []          # POS tags of the current sentence

    with open(path, 'r') as conll:  # source in tsv format
        for line in conll:
            if line.startswith("#begin document"):
                sentence_id = 0
                coref_spans = {}  # coref spans are not cross document
                text = []         # new document also means new sentence
                tags = []         # and new pos tags
                coref_stack = {}
                mention_id = 0

            elif line.startswith("#end document"):
                pass

            elif line == "\n":
                # Sentence boundary: run spaCy on the gold tokenization so
                # token indices line up with the CoNLL columns.
                sentence_id = 0
                doc = nlp.tokenizer.tokens_from_list(text)
                nlp.tagger(doc)
                nlp.parser(doc)

                for chunk in doc.noun_chunks:
                    tokens = chunk.text.split(" ")
                    left_edge = word_id + chunk.root.left_edge.i
                    right_edge = word_id + chunk.root.right_edge.i

                    # Coref set id for this chunk span, if any was annotated.
                    coref_set = coref_spans.get((left_edge, right_edge))

                    # A head of "others" marks a comparative-anaphora
                    # candidate from the start.
                    comp_from = chunk.root.head.text.lower() == "others"

                    for token in chunk:
                        # Possessive modifiers of the chunk head become
                        # their own (embedded, non-comparative) mention.
                        if token.dep_ == "poss" and token.head == chunk.root:
                            left_poss = word_id + token.left_edge.i
                            right_poss = word_id + token.right_edge.i
                            poss_coref = coref_spans.get((left_poss, right_poss))

                            new_mention = Mention(
                                [child.text for child in token.children],
                                [left_poss, right_poss], mention_id,
                                poss_coref, False)
                            mentions.append(new_mention)
                            mention_id += 1

                        # Comparative-anaphora cues; an explicit "than"
                        # cancels the flag for this chunk outright.
                        if token.text.lower() == "than":
                            comp_from = False
                            break
                        elif tags[token.i] == "JJR":
                            comp_from = True
                        elif token.text.lower() == "more" and token.head.pos_ == "ADJ":
                            comp_from = True
                        elif token.text.lower() == "less" and token.head.pos_ == "ADJ":
                            comp_from = True
                        elif token.text.lower() in config.COMPARATIVES and token.head == chunk.root:
                            comp_from = True

                    new_mention = Mention(tokens, [left_edge, right_edge],
                                          mention_id, coref_set, comp_from)
                    mentions.append(new_mention)
                    mention_id += 1

                word_id += len(text)
                text = []  # linebreak means new sentence
                tags = []
                coref_stack = {}

            else:
                # Regular token line: columns are whitespace-separated,
                # word form in column 3, POS tag in column 4, coref in the
                # last column.
                split_line = line.strip().split()

                # Opening parens "(id" start a coref span at this token.
                coref_stack.update(
                    {int(c): word_id + sentence_id
                     for c in re.findall(r"\(([0-9]+)", split_line[-1])})

                # Closing parens "id)" complete a span.
                for c in re.findall(r"([0-9]+)\)", split_line[-1]):
                    if int(c) not in coref_stack:
                        raise Exception(
                            "coref span %s closed without being opened" % c)
                    coref_spans[(coref_stack[int(c)], word_id + sentence_id)] = int(c)

                text.append(split_line[3])
                tags.append(split_line[4])

                sentence_id += 1

    return mentions