from bs4 import BeautifulSoup
import os
import re

import numpy as np
import spacy

import config
from model import Mention, Entity

# Heavyweight model load at module import time; shared by the extractors below.
nlp = spacy.load("en_core_web_lg")

# NOTE(review): this file was recovered from a garbled GitHub diff paste.
# The body of from_isnotes() was collapsed ("Loading ...") in that paste and
# could not be reproduced here -- restore it from version control.


def from_conll(path):
    """Extract Mention objects from a CoNLL-formatted (tab-separated) file.

    Token lines are buffered one sentence at a time; at each sentence
    boundary (blank line) the buffered tokens are run through the spaCy
    tokenizer/tagger/parser and a Mention is emitted for every noun chunk
    (plus an extra, non-comparative Mention for each possessive modifier of
    a chunk root).  A chunk is flagged as a comparative anaphor (comp_from)
    unless an explicit "than" comparison is present.

    :param path: path to the CoNLL file (word form in column 3, POS tag in
                 column 4, coreference column last).
    :return: list of Mention objects.
    :raises ValueError: if a coreference chain is closed that was never opened.

    NOTE(review): reconstructed from diff residue -- verify against version
    control before relying on exact behavior.
    """
    mentions = []    # collected Mention objects (returned)
    word_id = 0      # document-level token offset of the current sentence start
    mention_id = 0   # running id handed to each Mention
    # `with` guarantees the handle is closed (the original leaked it).
    with open(path, "r") as conll:  # source in tsv format
        for line in conll:
            if line.startswith("#begin document"):
                # Coref chains are not cross-document: reset per-document and
                # per-sentence state.  NOTE(review): word_id is *not* reset
                # here in the original -- confirm that is intended.
                sentence_id = 0
                coref_spans = dict()  # (start, end) -> chain id
                text = []             # tokens of the sentence being accumulated
                tags = []             # POS tags, parallel to `text`
                coref_stack = {}      # chain id -> position where "(id" opened
                mention_id = 0
            elif line.startswith("#end document"):
                pass
            elif line == "\n":
                # Blank line = sentence boundary: parse the buffered sentence.
                # NOTE(review): tokens_from_list / nlp.tagger / nlp.parser is
                # the spaCy 1.x API -- confirm the pinned spaCy version.
                sentence_id = 0
                doc = nlp.tokenizer.tokens_from_list(text)
                nlp.tagger(doc)
                nlp.parser(doc)
                for chunk in doc.noun_chunks:
                    tokens = chunk.text.split(" ")
                    left_edge = word_id + chunk.root.left_edge.i
                    right_edge = word_id + chunk.root.right_edge.i
                    coref_set = coref_spans.get((left_edge, right_edge))
                    # A chunk governed by "others" starts out comparative.
                    comp_from = chunk.root.head.text.lower() == "others"
                    for token in chunk:
                        if token.dep_ == "poss" and token.head == chunk.root:
                            # Possessive modifier of the chunk root gets its
                            # own (never comparative) mention.
                            left_poss = word_id + token.left_edge.i
                            right_poss = word_id + token.right_edge.i
                            poss_coref = coref_spans.get((left_poss, right_poss))
                            mentions.append(Mention(
                                [child.text for child in token.children],
                                [left_poss, right_poss],
                                mention_id, poss_coref, False))
                            mention_id += 1
                        if token.text.lower() == "than":
                            # Explicit "than" comparison: not anaphoric.
                            comp_from = False
                            break
                        elif tags[token.i] == "JJR":  # comparative adjective tag
                            comp_from = True
                        elif token.text.lower() == "more" and token.head.pos_ == "ADJ":
                            comp_from = True
                        elif token.text.lower() == "less" and token.head.pos_ == "ADJ":
                            comp_from = True
                        elif token.text.lower() in config.COMPARATIVES and token.head == chunk.root:
                            comp_from = True
                    mentions.append(Mention(tokens, [left_edge, right_edge],
                                            mention_id, coref_set, comp_from))
                    mention_id += 1
                word_id += len(text)
                text = []  # linebreak means new sentence
                tags = []
                coref_stack = {}
            else:
                split_line = line.strip().split()
                # Record the position at which every "(id" opens ...
                coref_stack = {**coref_stack,
                               **{int(c): word_id + sentence_id
                                  for c in re.findall(r"\(([0-9]+)", split_line[-1])}}
                # ... and close a span on every "id)".
                for c in re.findall(r"([0-9]+)\)", split_line[-1]):
                    if int(c) not in coref_stack:
                        # Was a bare `raise Exception` in the original.
                        raise ValueError(
                            "coref chain %s closed before it was opened" % c)
                    coref_spans[(coref_stack[int(c)], word_id + sentence_id)] = int(c)
                text.append(split_line[3])  # column 3: the word form
                tags.append(split_line[4])  # column 4: its POS tag
                sentence_id += 1
    return mentions
# NOTE(review): this span of the paste was a second, byte-identical copy of
# the same GitHub diff residue.  It is replaced with the identical clean
# reconstruction of from_conll so the duplicate is a harmless redefinition;
# delete one copy once the file has been reconciled with version control.
def from_conll(path):
    """Extract Mention objects from a CoNLL-formatted (tab-separated) file.

    Token lines are buffered one sentence at a time; at each sentence
    boundary (blank line) the buffered tokens are run through the spaCy
    tokenizer/tagger/parser and a Mention is emitted for every noun chunk
    (plus an extra, non-comparative Mention for each possessive modifier of
    a chunk root).  A chunk is flagged as a comparative anaphor (comp_from)
    unless an explicit "than" comparison is present.

    :param path: path to the CoNLL file (word form in column 3, POS tag in
                 column 4, coreference column last).
    :return: list of Mention objects.
    :raises ValueError: if a coreference chain is closed that was never opened.

    NOTE(review): reconstructed from diff residue -- verify against version
    control before relying on exact behavior.
    """
    mentions = []    # collected Mention objects (returned)
    word_id = 0      # document-level token offset of the current sentence start
    mention_id = 0   # running id handed to each Mention
    # `with` guarantees the handle is closed (the original leaked it).
    with open(path, "r") as conll:  # source in tsv format
        for line in conll:
            if line.startswith("#begin document"):
                # Coref chains are not cross-document: reset per-document and
                # per-sentence state.  NOTE(review): word_id is *not* reset
                # here in the original -- confirm that is intended.
                sentence_id = 0
                coref_spans = dict()  # (start, end) -> chain id
                text = []             # tokens of the sentence being accumulated
                tags = []             # POS tags, parallel to `text`
                coref_stack = {}      # chain id -> position where "(id" opened
                mention_id = 0
            elif line.startswith("#end document"):
                pass
            elif line == "\n":
                # Blank line = sentence boundary: parse the buffered sentence.
                # NOTE(review): tokens_from_list / nlp.tagger / nlp.parser is
                # the spaCy 1.x API -- confirm the pinned spaCy version.
                sentence_id = 0
                doc = nlp.tokenizer.tokens_from_list(text)
                nlp.tagger(doc)
                nlp.parser(doc)
                for chunk in doc.noun_chunks:
                    tokens = chunk.text.split(" ")
                    left_edge = word_id + chunk.root.left_edge.i
                    right_edge = word_id + chunk.root.right_edge.i
                    coref_set = coref_spans.get((left_edge, right_edge))
                    # A chunk governed by "others" starts out comparative.
                    comp_from = chunk.root.head.text.lower() == "others"
                    for token in chunk:
                        if token.dep_ == "poss" and token.head == chunk.root:
                            # Possessive modifier of the chunk root gets its
                            # own (never comparative) mention.
                            left_poss = word_id + token.left_edge.i
                            right_poss = word_id + token.right_edge.i
                            poss_coref = coref_spans.get((left_poss, right_poss))
                            mentions.append(Mention(
                                [child.text for child in token.children],
                                [left_poss, right_poss],
                                mention_id, poss_coref, False))
                            mention_id += 1
                        if token.text.lower() == "than":
                            # Explicit "than" comparison: not anaphoric.
                            comp_from = False
                            break
                        elif tags[token.i] == "JJR":  # comparative adjective tag
                            comp_from = True
                        elif token.text.lower() == "more" and token.head.pos_ == "ADJ":
                            comp_from = True
                        elif token.text.lower() == "less" and token.head.pos_ == "ADJ":
                            comp_from = True
                        elif token.text.lower() in config.COMPARATIVES and token.head == chunk.root:
                            comp_from = True
                    mentions.append(Mention(tokens, [left_edge, right_edge],
                                            mention_id, coref_set, comp_from))
                    mention_id += 1
                word_id += len(text)
                text = []  # linebreak means new sentence
                tags = []
                coref_stack = {}
            else:
                split_line = line.strip().split()
                # Record the position at which every "(id" opens ...
                coref_stack = {**coref_stack,
                               **{int(c): word_id + sentence_id
                                  for c in re.findall(r"\(([0-9]+)", split_line[-1])}}
                # ... and close a span on every "id)".
                for c in re.findall(r"([0-9]+)\)", split_line[-1]):
                    if int(c) not in coref_stack:
                        # Was a bare `raise Exception` in the original.
                        raise ValueError(
                            "coref chain %s closed before it was opened" % c)
                    coref_spans[(coref_stack[int(c)], word_id + sentence_id)] = int(c)
                text.append(split_line[3])  # column 3: the word form
                tags.append(split_line[4])  # column 4: its POS tag
                sentence_id += 1
    return mentions