Commit 0dc96118 authored by axtimhaus's avatar axtimhaus
Browse files

Update extraction algorithm

parent e0cd4bea
Loading
Loading
Loading
Loading
+125 −19
Original line number Diff line number Diff line
@@ -16,7 +16,7 @@ def from_isnotes(path):
    
    if file_name.endswith(".mmax"):
        
        print(file_name)
        #print(file_name)
        
        file_root = file_name[:-5]
        
@@ -81,15 +81,14 @@ def from_conll(path):
    comp_count = 0
    ment_count = 0
    
    for line in conll:
        if line.startswith("#begin document"):
    stack = list()
    coref_stack = dict()
    
    for line in conll:
        if line.startswith("#begin document"):
            pass
        elif line.startswith("#end document"):
            if comp_count > 0:
                for m in mentions:
                    print(m)
            return (ment_count, comp_count, mentions)
            pass
        elif line == "\n":
            stack = list()
        else:
@@ -130,6 +129,7 @@ def from_conll(path):
                    rules = list()
                    
                    rules.append(start[3]=="JJR")
                    
                    rules.append(text[start[0]].lower() == "more" 
                                 and len(poss) > start[0]+1 
                                 and poss[start[0]+1] == "JJ")
@@ -159,7 +159,121 @@ def from_conll(path):
            
            word_id += 1
            
    return (ment_count, comp_count, mentions)
    return mentions
            
            
def to_mmax(source_path, target_directory_path):
    """Convert a CoNLL file into an MMAX2 project inside a target directory.

    The mentions are obtained from ``from_conll(source_path)``.  If at least
    one mention has a truthy ``comp_from`` attribute, four files are written:

    * ``<target>/<new_name>.mmax``                      -- project file
    * ``<target>/Basedata/<new_name>_words.xml``        -- one <word> per token
    * ``<target>/Markables/<new_name>_entity_level.xml`` -- one markable per mention
    * ``<target>/Markables/<new_name>_coref_level.xml``  -- one markable per
      mention that has a truthy ``coref_set``

    ``new_name`` is the source base name prefixed with the zero-padded number
    of comparative mentions.  If no mention is comparative nothing is written
    and only a notice is printed.

    Parameters:
        source_path: path of the input file; must be in CoNLL format.
        target_directory_path: root directory of the MMAX2 project.  It may
            be given with or without a trailing path separator.

    Returns:
        None.
    """
    name = os.path.splitext(os.path.basename(source_path))
    base_name = name[0]

    print("\n[NCR] Extracting entities from \033[1m{}\033[0m.".format("".join(name)))

    mentions = from_conll(source_path)

    # Only mentions flagged as comparatives are counted here; the count is
    # used both for the file name prefix and for the summary message.
    len_mentions = sum(bool(m.comp_from) for m in mentions)

    if not len_mentions:
        print("[NCR] \033[1m{}\033[96m does not contain any comparatives.\033[0m".format(base_name))
        return

    new_name = "{:0>2}_{}".format(len_mentions, base_name)

    basedata_dir = os.path.join(target_directory_path, "Basedata")
    # Joined as a separate component so the result does not depend on
    # whether target_directory_path ends with a path separator.
    markables_dir = os.path.join(target_directory_path, "Markables")
    os.makedirs(basedata_dir, exist_ok=True)
    os.makedirs(markables_dir, exist_ok=True)

    _write_mmax_project(os.path.join(target_directory_path, new_name+".mmax"),
                        new_name+"_words.xml")

    word_count = _write_mmax_words(source_path,
                                   os.path.join(basedata_dir, new_name+"_words.xml"))
    print("[NCR] Writing \033[92m{:>4}\033[0m token(s) to \033[1m{}_words.xml\033[0m.".format(word_count, new_name))

    chain_count = _write_mmax_markables(
        mentions,
        os.path.join(markables_dir, new_name+"_entity_level.xml"),
        os.path.join(markables_dir, new_name+"_coref_level.xml"))

    print("[NCR] Writing \033[92m{:>4}\033[0m mention(s) to \033[1m{}_entity_level.xml\033[0m.".format(len_mentions, new_name))
    print("[NCR] Writing \033[92m{:>4}\033[0m coreference chain(s) to \033[1m{}_coref_level.xml\033[0m.".format(chain_count, new_name))


def _write_mmax_project(project_path, words_file_name):
    """Write the .mmax project file that points at the given words file."""
    with open(project_path, "w", encoding="utf-8") as mmax:
        mmax.write("""<?xml version="1.0"?>
<mmax_project>
<!--<sentences>002_htc_text.xml</sentences>-->
<turns></turns>
<words>{}</words>
<gestures></gestures>
<keyactions></keyactions>
</mmax_project>
""".format(words_file_name))


def _write_mmax_words(source_path, target_path):
    """Write one <word> element per CoNLL token line; return the token count."""
    # Local import: the module's top-level import block is not changed here.
    from xml.sax.saxutils import escape

    word_id = 0

    # The XML declaration below promises UTF-8, so write exactly that.
    with open(source_path, 'r', encoding="utf-8") as source, \
            open(target_path, 'w', encoding="utf-8") as target:

        target.write(r"""<?xml version='1.0' encoding='UTF-8'?>
<!DOCTYPE words SYSTEM "words.dtd">
<words>""")

        for line in source:
            # Document delimiters and sentence breaks carry no token.
            if line.startswith(("#begin document", "#end document")) or line == "\n":
                continue

            # The word form is the fourth whitespace-separated column.
            # escape() handles '&', '<' and '>' anywhere inside the token,
            # not just a token that consists of a single '&'.
            token = escape(line.split()[3])

            target.write(r'<word id="word_'+str(word_id)+r'">'+token+r'</word>'+'\n')
            word_id += 1

        target.write(r"</words>")

    return word_id


def _write_mmax_markables(mentions, entity_level_path, coref_level_path):
    """Write the entity and coref markable files; return the number of coref sets."""
    coref_chains = set()
    # A single counter is shared by both levels, so ids are unique across
    # the two files rather than consecutive within each one.
    markable_id = 0

    with open(entity_level_path, 'w', encoding="utf-8") as entity_level, \
            open(coref_level_path, 'w', encoding="utf-8") as coref_level:

        entity_level.write(r"""<?xml version='1.0'?>
<!DOCTYPE markables SYSTEM "markables.dtd">
<markables xmlns="www.comp.leeds.ac.uk/markert/entity">""")

        coref_level.write(r"""<?xml version='1.0'?>
<!DOCTYPE markables SYSTEM "markables.dtd">
<markables xmlns="www.eml.org/NameSpaces/coref">""")

        for mention in mentions:
            span = r'span="word_'+str(mention.span[0])+r'..word_'+str(mention.span[1])+r'"'

            if mention.comp_from:
                # Comparative mentions carry the extra annotation attributes.
                entity_level.write(r'<markable id="markable_'+str(markable_id)+r'" information_status="mediated" mediated_type="comparative" comparative_type="withintext" '+span+r' mmax_level="entity" />'+'\n')
            else:
                entity_level.write(r'<markable id="markable_'+str(markable_id)+r'" '+span+r' mmax_level="entity" />'+'\n')
            markable_id += 1

            if mention.coref_set:
                coref_chains.add(mention.coref_set)
                coref_level.write(r'<markable id="markable_'+str(markable_id)+r'" '+span+r' coref_set="set_'+str(mention.coref_set)+r'" mmax_level="coref" />'+'\n')
                markable_id += 1

        entity_level.write(r'</markables>')
        coref_level.write(r'</markables>')

    return len(coref_chains)
        
    
if __name__ == "__main__":
@@ -176,19 +290,11 @@ if __name__ == "__main__":
    files = []
    for x in os.walk(root):
        for y in x[2]:
            print(y)
            #print(y)
            if y.endswith(".v4_gold_conll"):
                files.append(x[0]+"/"+y)
    
    mcount = 0
    ccount = 0
    dist = np.zeros(26)
    for f in files:
        plus_m, plus_c, mentions = from_conll(f)
        if plus_c > 0:
            dist[plus_c] += 1
            mcount += plus_m
            ccount += plus_c
    print(mcount, ccount)
    print(dist[1:])
        to_mmax(f, "/home/students/zimmermann/Projects/ncr/annotation/Entity/")
    
    
+1 −0
Original line number Diff line number Diff line
@@ -20,3 +20,4 @@ class Mention:
            return '\033[92m'+plain_repr+'\033[0m'
        else:
            return plain_repr