Commit e1613144 authored by axtimhaus's avatar axtimhaus
Browse files

Update extraction.py with noun chunking for entity extraction

parent 1b3336a7
Loading
Loading
Loading
Loading
+76 −68
Original line number Diff line number Diff line
@@ -2,10 +2,12 @@ from bs4 import BeautifulSoup
import os
import config
import re
import spacy
import numpy as np

from model import Mention, Entity

nlp = spacy.load("en_core_web_lg")

def from_isnotes(path):
    
@@ -66,31 +68,84 @@ def from_isnotes(path):
    return mentions

def from_conll(path):
    """Extract Mention objects from a CoNLL-formatted (tab-separated) file.

    Tokens are accumulated sentence by sentence; on each sentence boundary
    (blank line) the gold tokenization is handed to spaCy for tagging and
    parsing, and every noun chunk becomes a Mention.  Coreference spans are
    read from the last CoNLL column ("(id" opens a span, "id)" closes it)
    and matched against chunk spans by document-level token offsets.

    Parameters
    ----------
    path : str
        Path to the CoNLL source file (tsv format).

    Returns
    -------
    list
        Mention objects annotated with a coreference-set id (or None) and a
        comparative-anaphora flag.

    Raises
    ------
    Exception
        If a coreference span is closed that was never opened.
    """
    mentions = []      # return list of Mention objects
    word_id = 0        # document-level offset of the current sentence start
    mention_id = 0
    sentence_id = 0    # token index within the current sentence
    coref_spans = {}   # (start, end) -> coref set id; reset per document
    coref_stack = {}   # open coref parens: set id -> start offset
    text = []          # word forms of the current sentence
    tags = []          # POS tags of the current sentence

    with open(path, 'r') as conll:  # source in tsv format
        for line in conll:
            if line.startswith("#begin document"):
                sentence_id = 0
                coref_spans = {}  # coref spans are not cross document
                text = []         # new document also means new sentence
                tags = []         # and new pos tags
                coref_stack = {}
                mention_id = 0

            elif line.startswith("#end document"):
                pass

            elif line == "\n":
                # Sentence boundary: run spaCy on the gold tokenization so
                # token indices line up with the CoNLL columns.
                sentence_id = 0
                doc = nlp.tokenizer.tokens_from_list(text)
                nlp.tagger(doc)
                nlp.parser(doc)

                for chunk in doc.noun_chunks:
                    tokens = chunk.text.split(" ")
                    left_edge = word_id + chunk.root.left_edge.i
                    right_edge = word_id + chunk.root.right_edge.i

                    # Coref set id for this chunk span, if any was annotated.
                    coref_set = coref_spans.get((left_edge, right_edge))

                    # A head of "others" marks a comparative-anaphora
                    # candidate from the start.
                    comp_from = chunk.root.head.text.lower() == "others"

                    for token in chunk:
                        # Possessive modifiers of the chunk head become
                        # their own (embedded, non-comparative) mention.
                        if token.dep_ == "poss" and token.head == chunk.root:
                            left_poss = word_id + token.left_edge.i
                            right_poss = word_id + token.right_edge.i
                            poss_coref = coref_spans.get((left_poss, right_poss))

                            new_mention = Mention(
                                [child.text for child in token.children],
                                [left_poss, right_poss], mention_id,
                                poss_coref, False)
                            mentions.append(new_mention)
                            mention_id += 1

                        # Comparative-anaphora cues; an explicit "than"
                        # cancels the flag for this chunk outright.
                        if token.text.lower() == "than":
                            comp_from = False
                            break
                        elif tags[token.i] == "JJR":
                            comp_from = True
                        elif token.text.lower() == "more" and token.head.pos_ == "ADJ":
                            comp_from = True
                        elif token.text.lower() == "less" and token.head.pos_ == "ADJ":
                            comp_from = True
                        elif token.text.lower() in config.COMPARATIVES and token.head == chunk.root:
                            comp_from = True

                    new_mention = Mention(tokens, [left_edge, right_edge],
                                          mention_id, coref_set, comp_from)
                    mentions.append(new_mention)
                    mention_id += 1

                word_id += len(text)
                text = []  # linebreak means new sentence
                tags = []
                coref_stack = {}

            else:
                # Regular token line: columns are whitespace-separated,
                # word form in column 3, POS tag in column 4, coref in the
                # last column.
                split_line = line.strip().split()

                # Opening parens "(id" start a coref span at this token.
                coref_stack.update(
                    {int(c): word_id + sentence_id
                     for c in re.findall(r"\(([0-9]+)", split_line[-1])})

                # Closing parens "id)" complete a span.
                for c in re.findall(r"([0-9]+)\)", split_line[-1]):
                    if int(c) not in coref_stack:
                        raise Exception(
                            "coref span %s closed without being opened" % c)
                    coref_spans[(coref_stack[int(c)], word_id + sentence_id)] = int(c)

                text.append(split_line[3])
                tags.append(split_line[4])

                sentence_id += 1

    return mentions