Commit 8c25f2d7 authored by axtimhaus

Major update to annotation and preprocessing.

parent e1613144
+2 −3
@@ -18,7 +18,6 @@ except:
    model = kv.load_word2vec_format(fname, binary=False)
print("Done.         ")


def next_mention(mentions, anaphora):
    sims = [1/(anaphora.span[0] - mention.span[1]) if mention.span[1] < anaphora.span[0] else 0 for mention in mentions]
    return mentions[np.argmax(sims)]
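
For reference, next_mention scores every candidate by the inverse token distance between its end and the anaphor's start, so the nearest preceding mention wins. A minimal sketch run alongside the function above, with a hypothetical Mention stub (the real class lives in model.py; only .span matters here):

from collections import namedtuple

Mention = namedtuple("Mention", ["tokens", "span"])  # hypothetical stand-in

mentions = [Mention(["the", "old", "car"], [2, 4]),
            Mention(["a", "bike"], [7, 8])]
anaphor = Mention(["a", "similar", "vehicle"], [12, 14])

# Inverse distances: [1/(12-4), 1/(12-8)] = [0.125, 0.25] -> "a bike" wins.
print(next_mention(mentions, anaphor).tokens)  # ['a', 'bike']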
@@ -57,9 +56,9 @@ if __name__ == "__main__":
    for f in files[:]:
        
        print("Fetching test data...")
        mentions = extraction.get_mentions(root+f)
        mentions = extraction.from_isnotes(root+f)
        
        comparatives = [m for m in mentions if m.comp_from]
        comparatives = [m for m in mentions if m.comp_from and m.comp_from != "outsidetext"]
        
        entities = []
        entity_dict = {}
+1 −0
EMBEDDINGS_PATH = "/softpro/ss18/kernseife/kernseife/data/en/embeddings/glove.twitter.27B.100d.txt"
DATA_PATH = "/proj/zimmermann/isnotes/ISClean/"
CONLL_PATH = "/resources/corpora/multilingual/ontonotes-5.0-conll-2012/conll-2012/v4/data/train/data/english/"
CORPUS_PATH = "/home/students/zimmermann/Projects/ncr/corpus/"

COMPARATIVES = ["other", "similar", "comparable", "different", "additional", "extra"]
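
COMPARATIVES holds the lexical markers that signal comparative anaphora ("other", "similar", ...). A minimal sketch of marker-based spotting; the has_marker helper is hypothetical and only illustrates how the list is meant to be consumed:

import config

def has_marker(tokens):
    # True if any token is a comparative marker such as "other" or "similar"
    return any(tok.lower() in config.COMPARATIVES for tok in tokens)

print(has_marker(["a", "similar", "deal"]))  # True
print(has_marker(["the", "deal"]))           # False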
+350 −13
@@ -3,11 +3,14 @@ import os
import config
import re
import spacy
import en_core_web_sm
import numpy as np

from model import Mention, Entity
from tqdm import tqdm

nlp = spacy.load("en_core_web_lg")
nlp = en_core_web_sm.load()
#nlp = spacy.load("en")

def from_isnotes(path):
    
@@ -26,6 +29,17 @@ def from_isnotes(path):
        entities = BeautifulSoup(open(root+"/markables/"+file_root+"_entity_level.xml", "r").read(), features="lxml")
        coref = BeautifulSoup(open(root+"/markables/"+file_root+"_coref_level.xml", "r").read(), features="lxml")
        
        spans = dict()
        word_dict = dict()
        coref_dict = {None:[]}
        
        corpus = open(config.CORPUS_PATH+file_root+".ncr","w")
        
        for i, word in enumerate(basedata.contents[2].contents[0].contents[0].find_all()):
            corpus.write(str(i)+"\t"+word.contents[0]+"\n")
        
        corpus.write("\n")
        
        for entity in entities.contents[2].contents[0].contents[0].find_all():
            
            span = entity["span"]
@@ -45,6 +59,8 @@ def from_isnotes(path):
            
            try:
                coref_set = coref.find("markable", {"span": span})["coref_set"]
                if coref_set not in coref_dict:
                    coref_dict[coref_set] = list()
            except:
                coref_set = None
            
@@ -57,6 +73,12 @@ def from_isnotes(path):
                
                mentions.append(Mention( tokens, [span_ids[0],span_ids[1]], mention_id, coref_set, comp_from))
                
                spans[mention_id] = [span_ids[0]-1,span_ids[1]-1]
                if coref_set in coref_dict:
                    coref_dict[coref_set].append([span_ids[0]-1,span_ids[1]-1])
                else:
                    coref_dict[None].append([span_ids[0]-1,span_ids[1]-1])
                
            else:
                
                token = basedata.find("word", {"id": span}).contents[0]
@@ -65,6 +87,35 @@ def from_isnotes(path):
                word_id = int(span.split("_")[-1])
                mentions.append(Mention(tokens, [word_id, word_id], mention_id, coref_set, comp_from))
                
                spans[mention_id] = [word_id-1, word_id-1]
                if coref_set in coref_dict:
                    coref_dict[coref_set].append([word_id-1, word_id-1])
                else:
                    coref_dict[None].append([word_id-1, word_id-1])
            
        for m in mentions:
            if m.comp_from:
                if m.comp_from == "outsidetext":
                    corpline = spans[m.mention_id]
                    corpus.write("\t".join([str(c) for c in corpline])+"\n")
                else:
                    if m.comp_from.startswith("markable"):
                        corpline = spans[m.mention_id]
                        cfroms = m.comp_from.split(";")
                        for cfrom in cfroms:
                            corpline += spans[cfrom]
                        corpus.write("\t".join([str(c) for c in corpline])+"\n")
        
        corpus.write("\n")
        
        
        for c in coref_dict.items():
            if c[0] != None:
                corpus.write("\t".join([str(x) for v in c[1] for x in v])+"\n")
        for c in coref_dict[None]:
            corpus.write("\t".join([str(x) for x in c])+"\n")
        
    
    return mentions

def from_conll(path):
@@ -90,8 +141,9 @@ def from_conll(path):
            sentence_id = 0
            #doc = nlp(" ".join(text)) 
            doc = nlp.tokenizer.tokens_from_list(text)
            nlp.tagger(doc)
            nlp.parser(doc)
            for name, proc in nlp.pipeline:
                doc=proc(doc)
            #doc = spacy.tokens.doc.Doc(nlp.vocab, words=text, spaces=[True]*len(text))
            
            for chunk in doc.noun_chunks:
                tokens = chunk.text.split(" ")
@@ -284,6 +336,273 @@ def to_mmax(source_path, target_directory_path):
        print("[NCR] \033[1m{}\033[96m does not contain any comparatives.\033[0m".format(base_name))


def mark_comparatives(source_path, target_directory_path):
    """ Source must be in conll format
    
    """
    
    name = os.path.splitext(os.path.basename(source_path))
    base_name = name[0]
    
    corpus = open(config.CORPUS_PATH+base_name+".txt","w")
    all_tokens = list()
    
    print("\n[NCR] Extracting entities from \033[1m{}\033[0m.".format("".join(name)))
    
    mentions = from_conll(source_path)
    
    comp_from_list = [m.comp_from for m in mentions]
    
    len_mentions = sum(bool(c) for c in comp_from_list)
        
    new_name = "{:0>2}_{}".format(len_mentions, base_name)
    
    if any(comp_from_list):
        
        mmax = open(os.path.join(target_directory_path,  new_name+".mmax"), "w")
        mmax.write("""<?xml version="1.0"?>
<mmax_project>
<!--<sentences>002_htc_text.xml</sentences>-->
<turns></turns>
<words>{}</words>
<gestures></gestures>
<keyactions></keyactions>
</mmax_project>
""".format(new_name+"_words.xml"))
        mmax.close()
        
        target_path = os.path.join(target_directory_path, "Basedata", new_name+"_words.xml")
        
        source = open(source_path, 'r')
        target = open(target_path, 'w')
        
        word_id = 0
        
        
        target.write(r"""<?xml version='1.0' encoding='UTF-8'?>
<!DOCTYPE words SYSTEM "words.dtd">
<words>""")
        
        for line in source:
            if line.startswith("#begin document"):
                pass
            elif line.startswith("#end document"):
                pass
            elif line == "\n":
                corpus.write(" ".join(all_tokens)+"\n")
            else:
                naked_line = line.strip()
                split_line = naked_line.split()
                
                token = split_line[3]
                all_tokens.append(token)  # collect the raw token for the plain-text corpus
                if token == "&":
                    token = r"&amp;"
                token = re.sub("<", "&lt;", token)
                token = re.sub(">", "&gt;", token)
                
                target.write(r'<word id="word_'+str(word_id)+r'">'+token+r'</word>'+'\n')
                
                word_id += 1
        
        target.write(r"</words>")
        
        print("[NCR] Writing \033[92m{:>4}\033[0m token(s) to \033[1m{}_words.xml\033[0m.".format(word_id, new_name))
        
        markable_id = 0

        entity_level_path = os.path.join(target_directory_path, "Markables", new_name+"_entity_level.xml")
        entity_level = open(entity_level_path, 'w')
        entity_level.write(r"""<?xml version='1.0'?>
<!DOCTYPE markables SYSTEM "markables.dtd">
<markables xmlns="www.comp.leeds.ac.uk/markert/entity">""")
        
        coref_chains = set()
        
        for mention in mentions:
            if mention.comp_from:
                entity_level.write(r'<markable id="markable_'+str(markable_id)+r'" information_status="mediated" mediated_type="comparative" comparative_type="withintext" span="word_'+str(mention.span[0])+r'..word_'+str(mention.span[1])+r'" mmax_level="entity" />'+'\n')
                markable_id += 1
            else:
                pass
        
        entity_level.write(r'</markables>')
        
        entity_level.close()
    
        print("[NCR] Writing \033[92m{:>4}\033[0m mention(s) to \033[1m{}_entity_level.xml\033[0m.".format(len_mentions, new_name))
        
    else:
        print("[NCR] \033[1m{}\033[96m does not contain any comparatives.\033[0m".format(base_name))
        

def merge_annotation(entity_file, annotation_file):
    
    path = entity_file
    
    split_path = path.split(r"/")
    root, file_name = "/".join(split_path[:-1]), split_path[-1]
    
    processed = []
    for path, subdirs, files in os.walk(config.CORPUS_PATH):
        for name in files:
            processed.append(name)
        
    mentions = []
    file_root = file_name[:-5]
    new_root = "_".join(file_name[:-5].split("_")[1:])
    
    
    if file_name.endswith(".mmax") and new_root+".ncr" not in processed:
        basedata = BeautifulSoup(open(root+"/Basedata/"+file_root+"_words.xml", "r").read(), features="lxml")
        entities = BeautifulSoup(open(root+"/Markables/"+file_root+"_entity_level.xml", "r").read(), features="lxml")
        coref = BeautifulSoup(open(root+"/Markables/"+file_root+"_coref_level.xml", "r").read(), features="lxml")
        
        spans = dict()
        word_dict = dict()
        coref_dict = {None:[]}
        span_set = set()
        match_count = 0
        total_count = 0
        
        bd = basedata.contents[2].contents[0].contents[0].find_all()
        
        for entity in tqdm(entities.contents[2].contents[0].contents[0].find_all()):
            
            span = entity["span"]
            tokens = []
            mention_id = entity["id"]
            
            # comparative: either "outsidetext" or antecedent id
            comp_from = None
            
            try:
                coref_set = coref.find("markable", {"span": span})["coref_set"]
                if coref_set not in coref_dict:
                    coref_dict[coref_set] = list()
            except:
                coref_set = None
            
            if ".." in span:
                    
                split_span = span.split("..")
                span_ids = [int(re.sub("word_", "", word)) for word in [split_span[0], split_span[-1]]]
                for i in range(span_ids[0],span_ids[1]+1):
                    token = basedata.find("word", {"id": "word_"+str(i)}).contents[0]
                    tokens.append(token)
                
                mentions.append(Mention( tokens, [span_ids[0],span_ids[1]], mention_id, coref_set, comp_from))
                
                spans[mention_id] = [span_ids[0]-1,span_ids[1]-1]
                span_set.add((span_ids[0]-1,span_ids[1]-1))
                if coref_set in coref_dict:
                    coref_dict[coref_set].append([span_ids[0]-1,span_ids[1]-1])
                else:
                    coref_dict[None].append([span_ids[0]-1,span_ids[1]-1])
                
            else:
                
                token = basedata.find("word", {"id": span}).contents[0]
                tokens.append(token)
                
                word_id = int(span.split("_")[-1])
                mentions.append(Mention(tokens, [word_id, word_id], mention_id, coref_set, comp_from))
                
                spans[mention_id] = [word_id-1, word_id-1]
                span_set.add((word_id-1, word_id-1))
                if coref_set in coref_dict:
                    coref_dict[coref_set].append([word_id-1, word_id-1])
                else:
                    coref_dict[None].append([word_id-1, word_id-1])

        path = annotation_file
        
        split_path = path.split(r"/")
        root, file_name = "/".join(split_path[:-1]), split_path[-1]
        file_root = file_name[:-5]
        new_root = "_".join(file_name[:-5].split("_")[1:])
        new_spans = dict()
        
        entities = BeautifulSoup(open(root+"/Markables/"+file_root+"_entity_level.xml", "r").read(), features="lxml")
        word_dict = dict()
            
        mention_ids = -1
        for entity in entities.contents[2].contents[0].contents[0].find_all():
                
            span = entity["span"]
            tokens = []
            mention_id = entity["id"]
                
            comp_from = None
            if entity["information_status"] == "mediated":
                try:
                    if entity["mediated_type"] == "comparative":
                        if entity["comparative_type"] == "withintext" and entity["comp_from"] != None:
                            comp_from = entity["comp_from"]
                        elif entity["comparative_type"] == "outsidetext":
                            comp_from = "outsidetext"
                except:
                    pass
                
                
            if ".." in span:
                    
                split_span = span.split("..")
                span_ids = [int(re.sub("word_", "", word)) for word in [split_span[0], split_span[-1]]]
                    
                mentions.append(Mention(None, [span_ids[0],span_ids[1]], mention_ids, None, comp_from))
                    
                new_spans[mention_id] = [span_ids[0]-1,span_ids[1]-1]
                spans[mention_ids] = [span_ids[0]-1,span_ids[1]-1]
                test_span = [span_ids[0]-1,span_ids[1]-1]
                mention_ids -= 1
                    
            else:
                    
                word_id = int(span.split("_")[-1])
                mentions.append(Mention(None, [word_id, word_id], mention_ids, None, comp_from))
                    
                new_spans[mention_id] = [word_id-1, word_id-1]
                spans[mention_ids] = [word_id-1, word_id-1]
                test_span = [word_id-1, word_id-1]
                mention_ids -= 1
            
            if not comp_from:
                total_count += 1
                if (test_span[0],test_span[1]) in span_set:
                    match_count += 1
        
        comp_froms = [m.comp_from for m in mentions]
        if any(comp_froms):
            corpus = open(config.CORPUS_PATH+new_root+".ncr","w")
            for i, word in enumerate(bd):
                corpus.write(str(i)+"\t"+word.contents[0]+"\n")
            corpus.write("\n")
            
            for m in mentions:
                if m.comp_from:
                    if m.comp_from == "outsidetext":
                        corpline = spans[m.mention_id]
                        corpus.write("\t".join([str(c) for c in corpline])+"\n")
                    else:
                        if m.comp_from.startswith("markable"):
                            corpline = spans[m.mention_id]
                            cfroms = m.comp_from.split(";")
                            for cfrom in cfroms:
                                corpline += new_spans[cfrom]
                            corpus.write("\t".join([str(c) for c in corpline])+"\n")
            
            corpus.write("\n")
                
            for c in coref_dict.items():
                if c[0] != None:
                    corpus.write("\t".join([str(x) for v in c[1] for x in v])+"\n")
            for c in coref_dict[None]:
                corpus.write("\t".join([str(x) for x in c])+"\n")
        
            return match_count, total_count
    return 0, 0
    
    
if __name__ == "__main__":
    # ISnotes
    #root = config.DATA_PATH
@@ -295,14 +614,32 @@ if __name__ == "__main__":
    # conll/OntoNotes
    root = config.CONLL_PATH
    
    files = []
    for x in os.walk(root):
        for y in x[2]:
            #print(y)
            if y.endswith(".v4_gold_conll"):
                files.append(x[0]+"/"+y)
    #files = []
    #for x in os.walk(root):
        #for y in x[2]:
            ##print(y)
            #if y.endswith(".v4_gold_conll"):
                #files.append(x[0]+"/"+y)
    
    #for f in files:
        #to_mmax(f, "/home/students/zimmermann/Projects/ncr/annotation/Entity/")
    
    other_files = os.listdir("/home/students/zimmermann/Projects/ncr/annotation/Entity/")
    
    root = "/home/students/zimmermann/Projects/ncr/annotation/Freestyle/"
    
    files = os.listdir(root)
    
    mc, tc = 0,0 
    
    for f in files:
        to_mmax(f, "/home/students/zimmermann/Projects/ncr/annotation/Entity/")
    for f in sorted(files)[::-1]:
        if f.endswith(".mmax"):
            split_path = f.split(r"/")
            _, file_name = "/".join(split_path[:-1]), split_path[-1]
            print(f)
            m,t = merge_annotation("/home/students/zimmermann/Projects/ncr/annotation/Entity/"+[s for s in other_files if s.endswith("_".join(f.split("_")[1:]))][0], root+f)
            mc += m
            tc += t
            
            print(mc, tc, 0 if tc==0 else mc/tc)
        
+184 −0
import os
import argparse
import config
import spacy
import en_core_web_sm
import gensim
import numpy as np

from gensim.models import word2vec
from gensim.models import KeyedVectors as kv
    
nlp = en_core_web_sm.load()

fname = config.EMBEDDINGS_PATH
print("Loading model.", end="\r")
try:
    model = kv.load(fname)
except: 
    model = kv.load_word2vec_format(fname, binary=False)
print("Done.         ")

def closest_preceding_mention(toks, ents, ana):
    best_ment = (0,0)
    for ent in ents:
        for ment in ent:
            if ana[0] - ment[1] <  ana[0] - best_ment[1] and ment[1] < ana[0]:
                best_ment = ment
    return {best_ment}

def head_match(toks, ents, ana, window=20):
    best_ment = (0,0)
    
    ana_toks = toks[ana[0]:ana[1]+1]
    ana_toks = ["unlike" if tok=="like" else tok for tok in ana_toks]
    
    doc = nlp.tokenizer.tokens_from_list(ana_toks)
    for name, proc in nlp.pipeline:
        doc=proc(doc)
        
    ana_head = [token for token in doc if token.head == token][0].lemma_
    
    for ent in ents:
        for ment in ent:
            ment_toks = toks[ment[0]:ment[1]+1]
            ment_toks = ["unlike" if tok=="like" else tok for tok in ment_toks]
            
            doc = nlp.tokenizer.tokens_from_list(ment_toks)
            for name, proc in nlp.pipeline:
                doc=proc(doc)
                
            try:
                ment_head = [token for token in doc if token.head == token][0].lemma_
            except IndexError:
                continue
            
            if ana_head == ment_head:
                if ana[0] - ment[1] <  ana[0] - best_ment[1] and ment[1] < ana[0] and ana[0] - ment[1] < window:
                    best_ment = ment
    
    if best_ment == (0,0):
        return closest_preceding_mention(toks, ents, ana)
    else:
        return {best_ment}
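
head_match compares mentions by the lemma of their syntactic root, found with the token.head == token test (only the parse root is its own head). A standalone sketch, assuming the small English model is installed:

import spacy

nlp = spacy.load("en_core_web_sm")

doc = nlp("a similar transaction")
root = [tok for tok in doc if tok.head == tok][0]
print(root.lemma_)  # "transaction" -- the head that gets matched across mentions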
                
def highest_embedding_similarity(toks, ents, ana):
    best_ment = (0,0)
    best_sim = 0
    
    ana_toks = toks[ana[0]:ana[1]+1]
    ana_toks = ["unlike" if tok=="like" else tok for tok in ana_toks]
    
    doc = nlp.tokenizer.tokens_from_list(ana_toks)
    for name, proc in nlp.pipeline:
        doc=proc(doc)
        
    ana_head = [token for token in doc if token.head == token][0].text
    
    for ent in ents:
        for ment in ent:
            ment_toks = toks[ment[0]:ment[1]+1]
            ment_toks = ["unlike" if tok=="like" else tok for tok in ment_toks]
            
            doc = nlp.tokenizer.tokens_from_list(ment_toks)
            for name, proc in nlp.pipeline:
                doc=proc(doc)
                
            try:
                ment_head = [token for token in doc if token.head == token][0].text
            except IndexError:
                continue
            
            # wmdistance expects token lists; a bare string would be iterated character by character
            sim = 1/(model.wmdistance([ment_head], [ana_head])+1)
            
            if best_sim <= sim and ment[1] < ana[0]:
                print(ment_head, sim, ana_head)
                best_ment = ment
                best_sim = sim
    
    if best_ment == (0,0):
        return closest_preceding_mention(toks, ents, ana)
    else:
        return {best_ment}
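
The Word Mover's Distance d is mapped to a similarity with 1/(d+1), so identical heads (d = 0) score 1.0 and the score decays toward 0 as the heads drift apart. A minimal sketch, assuming model holds the KeyedVectors loaded at the top of the file:

d = model.wmdistance(["car"], ["vehicle"])  # documents are lists of tokens
sim = 1 / (d + 1)                           # 1.0 for identical heads, -> 0 for distant ones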
    

def most_salient_entity(toks, ents, ana):
    best_ent = {(0,0)}
    max_ent = 0
    for ent in ents:
        if len(ent) >= max_ent and any([ment[1] < ana[0] for ment in ent]):
            max_ent = len(ent)
            best_ent = ent
    return best_ent
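
most_salient_entity simply prefers the longest coreference chain that has at least one mention preceding the anaphor. A quick call in the same module, with toy (start, end) spans:

ents = [[(0, 1)], [(3, 4), (8, 9), (15, 16)]]
print(most_salient_entity(None, ents, (20, 21)))  # the three-mention chain wins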

def largest_entity_span(toks, ents, ana):
    best_ent = {(0,0)}
    max_span = 0
    for ent in ents:
        if ent:
            start = min([x for x,y in ent])
            end = max([y for x,y in ent])
            span = end-start
            if span >= max_span and any([ment[1] < ana[0] for ment in ent]):
                max_span = span
                best_ent = ent
                
    if best_ent == {(0,0)}:
        return closest_preceding_mention(toks, ents, ana)
    else:
        return best_ent  # the whole winning entity (a collection of spans)

def import_doc(path):
    doc = open(path, 'r').read()
    
    tokens, comps, coref = doc.split("\n\n")
    tokens = [line.strip().split()[1] for line in tokens.split("\n")]
    anaphora = dict()
    for line in comps.split("\n"):
        line = [int(l) for l in line.strip().split()]
        if len(line) > 2:
            anaphora[(line[0],line[1])] = {(line[i],line[i+1]) for i in range(2,len(line),2)}
        else:
            anaphora[(line[0],line[1])] = set()
    entities = list()
    for line in coref.split("\n"):
        line = [int(l) for l in line.strip().split()]
        entities.append([(line[i],line[i+1]) for i in range(0,len(line),2)])
        
    return tokens, anaphora, entities
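
import_doc expects the three blank-line-separated blocks that from_isnotes and merge_annotation write: numbered tokens, one line per comparative anaphor (its span followed by any antecedent spans), and one line per coreference chain. A minimal round trip run in the same module (the /tmp path is illustrative):

sample = ("0\tThe\n1\tcar\n2\tand\n3\tanother\n4\tvehicle"
          "\n\n"
          "3\t4\t0\t1"   # anaphor (3,4) with antecedent (0,1)
          "\n\n"
          "0\t1")        # a single coreference chain
with open("/tmp/sample.ncr", "w") as f:
    f.write(sample)

toks, anas, ents = import_doc("/tmp/sample.ncr")
print(toks)  # ['The', 'car', 'and', 'another', 'vehicle']
print(anas)  # {(3, 4): {(0, 1)}}
print(ents)  # [[(0, 1)]]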


    

if __name__ == "__main__":
    
    root = "/home/students/zimmermann/Projects/ncr/corpus/"
    files = os.listdir(root)
    
    baseline = head_match
    
    correct = 0
    incorrect = 0
    
    for f in files:
        path = root+f
        print(path)
        toks, anas, ents = import_doc(path)
        
        for ana in anas.keys():
            if anas[ana]:
                #ents = set.union(*anas.values(), *ents)
                system = baseline(toks, ents, ana)
                #if anas[ana] == system:
                if anas[ana] & set(system) != set():
                    correct += 1
                else:
                    incorrect += 1
            
        print(correct, incorrect, correct/(correct+incorrect))
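
Scoring is lenient: a prediction counts as correct as soon as it shares one antecedent span with the gold set. An illustration of that criterion:

gold = {(0, 1), (5, 6)}
system = {(5, 6)}
print(bool(gold & system))  # True -> correct, although (0, 1) was never proposed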
        
    
    
        
        
        
+77 −0

File added.
