Commit 3b9b2179 authored by axtimhaus's avatar axtimhaus
Browse files

Update candidate extraction files

parent ff2ef42c
Loading
Loading
Loading
Loading
+51 −13
Original line number Diff line number Diff line
@@ -4,24 +4,34 @@ Fetches all documents including comparative constructions.

import os
import sys
import re

# NOTE(review): in this rendered diff the signature `fetch(PATH, TARGET_PATH, qual="gold")`
# and the __main__ section further down pass both paths explicitly; presumably these two
# module-level path constants are the removed side of the change — confirm against the repo.
PATH = "/resources/corpora/multilingual/ontonotes-5.0-conll-2012/conll-2012/v4/data/train/data/english/"
TARGET_PATH = "/proj/zimmermann/comparatives/corpus/train/"
# Word forms that fetch() treats as comparative markers when they carry the POS tag "JJ".
COMPARATIVES = ["other", "similar", "comparable", "different", "additional", "extra"]

# NOTE(review): this listing is a rendered diff without +/- markers and with elided
# context (the `@@ ... @@` lines). Where two near-identical lines follow each other,
# presumably the first is the removed version and the second the added one — confirm.
#
# fetch walks PATH, reads every file whose name ends in ".v4_<qual>_conll", counts
# comparative constructions per document and writes the accumulated document text to
# "<TARGET_PATH><count>_<file name>_<doc id>" whenever the count is non-zero.
# Totals per construction type are printed at the end.
def fetch():

def fetch(PATH, TARGET_PATH, qual="gold"):
    # Corpus-wide totals, one per detection branch below.
    jjr_count = 0
    rbr_count = 0
    jj_count = 0
    others_count = 0
    for root, dirs, files in os.walk(PATH):
        for f in files:
            if f.endswith(".v4_gold_conll"):
            if f.endswith(".v4_{}_conll".format(qual)):
                with open(os.path.join(root, f), "r") as document:
                    current_word_id = 0
                    # Number of comparatives found in the current document; also used as a flag.
                    contains_comparative = 0
                    current_doc_text = ""
                    current_doc_id = 0
                    
                    last_pos = ""
                    last_word = ""
                    # Constituent labels collected from the parse column (see `synt` below).
                    stack = []
                    # Starts as False; replaced by a list each time a line opens an "NP".
                    np_with_jj = False
                    # NOTE(review): in_comp and buff are not read anywhere in the visible lines.
                    in_comp = False
                    buff="" #buffer saves annotation after * for next line
                    
                    for line in document:
                        if line.strip():
                            
@@ -35,7 +45,7 @@ def fetch():
                                # NOTE(review): `cols` and the condition guarding this branch are
                                # assigned in lines elided from the diff.
                                if contains_comparative:
                                    target = "{}{}_{}_{}".format(TARGET_PATH,str(contains_comparative),f,current_doc_id)
                                    # NOTE(review): the file object is never closed explicitly.
                                    open(target, "w").write(current_doc_text)
                                    print("Writing to "+target+".")
                                    #print("Writing to "+target+".")
                                    contains_comparative = 0
                                    
                                current_doc_text = ""
@@ -43,22 +53,43 @@ def fetch():
                                
                            current_doc_text += line
                            
                            if cols[4] == "JJR":
                            # cols[5] is used as the bracketed parse annotation of this line,
                            # cols[4] as the POS tag and cols[3] as the word form.
                            synt = cols[5]
                        
                            # NOTE(review): "[A-Z]*" also matches the empty string, so
                            # re.findall returns empty strings that are pushed onto the stack
                            # as well; the second script in this commit uses "[A-Z]+".
                            new_tags = re.findall("[A-Z]*", synt)
                            # open_brackets is not read in the visible lines.
                            open_brackets = synt.count("(")
                            closed_brackets = synt.count(")")
                            
                            stack+=new_tags
                            
                            # A newly opened NP resets the list of comparative adjectives.
                            if "NP" in new_tags:
                                np_start_word_id = current_word_id
                                np_with_jj = []
                                    
                            # Remember comparative adjectives (COMPARATIVES) seen while an NP is open.
                            if cols[4] == "JJ" and "NP" in stack and cols[3] in COMPARATIVES:
                                np_with_jj.append(cols[3])
                                
                            
                            # Detection branches; each hit increments the per-document count
                            # and the matching corpus-wide counter.
                            if cols[4] == "JJR" and "NP" in stack:
                                contains_comparative += 1
                                jjr_count += 1
                                print("JJR")
                            elif cols[4] == "JJ" and last_pos == "RBR":
                            elif cols[4] == "JJ" and last_pos == "RBR" and "NP" in stack:
                                contains_comparative += 1
                                rbr_count += 1
                                print("RBR-JJ")
                            elif (cols[4] == "NN" or cols[4] == "NNS") and last_pos == "JJ" and last_word in COMPARATIVES:
                            # NOTE(review): np_with_jj is only reset when a later line opens an
                            # NP, so every NN/NNS reaching this branch before then counts again.
                            elif (cols[4] == "NN" or cols[4] == "NNS") and np_with_jj:
                                contains_comparative += 1
                                jj_count += 1
                                print("JJ-NN")
                            elif cols[3].lower() == "others" and "NP" in stack:
                                contains_comparative += 1
                                others_count += 1
                                
                            last_pos = cols[4]
                            last_word = cols[3]
                            
                            # Drop one stack entry per ")" found on this line.
                            if closed_brackets > 0:
                                del stack[-1*closed_brackets:]
                            
                            current_word_id += 1
                            
                        else: 
                            # Blank lines are kept in the document text unchanged.
                            current_doc_text += line
                    
@@ -66,9 +97,16 @@ def fetch():
                    # Write out the text still pending after the last line of the file.
                    if contains_comparative:
                        target = "{}{}_{}_{}".format(TARGET_PATH,str(contains_comparative),f,current_doc_id)
                        open(target, "w").write(current_doc_text)
                        print("Writing to "+target+".")
                        
    print(jjr_count, rbr_count, jj_count)
    print(jjr_count, rbr_count, jj_count, others_count)

# Script entry point: runs fetch() once per corpus split.
# NOTE(review): the bare `fetch()` call is presumably the removed side of the diff; it
# would not match the three-parameter signature shown above — confirm.
if __name__ == "__main__":
    fetch()
    # Train split, default qual="gold".
    PATH = "/resources/corpora/multilingual/ontonotes-5.0-conll-2012/conll-2012/v4/data/train/data/english/"
    TARGET_PATH = "/proj/zimmermann/comparatives/corpus/train/"
    fetch(PATH, TARGET_PATH)
    # Test split, reading the "auto" files instead of the "gold" ones.
    PATH = "/resources/corpora/multilingual/ontonotes-5.0-conll-2012/conll-2012/v4/data/test/data/english/"
    TARGET_PATH = "/proj/zimmermann/comparatives/corpus/test/"
    fetch(PATH, TARGET_PATH, "auto")
    # Development split, default qual="gold".
    PATH = "/resources/corpora/multilingual/ontonotes-5.0-conll-2012/conll-2012/v4/data/development/data/english/"
    TARGET_PATH = "/proj/zimmermann/comparatives/corpus/dev/"
    fetch(PATH, TARGET_PATH)
+114 −15
Original line number Diff line number Diff line
@@ -4,32 +4,52 @@ Fetches all documents including comparative constructions.

import os
import sys
import re

# format() reads PATH (input directory walked with os.walk) and TARGET_PATH (prefix of
# every output file it opens) as module-level globals.
PATH = "/proj/zimmermann/comparatives/corpus/"
TARGET_PATH = "/proj/zimmermann/comparatives/annotation/OntoNotes/"
# Word forms treated as comparative markers when they carry the POS tag "JJ".
COMPARATIVES = ["other", "similar", "comparable", "different", "additional", "extra"]

# NOTE(review): this listing is a rendered diff without +/- markers and with elided
# context (the `@@ ... @@` line). Where an older and a newer variant of the same logic
# both appear, presumably the first is the removed side — confirm against the repo.
#
# format walks PATH and, for every file found, writes four files under TARGET_PATH:
# "<f>.mmax", "Basedata/<f>.xml" (one <word> element per token),
# "Markables/<f>_comparison_level.xml" and "Markables/<f>_coref_level.xml".
# NOTE(review): the function name shadows the builtin `format` in this module.
def format():
    
    # Hit counters, cumulative over all files:
    # c[0] JJR, c[1] RBR followed by JJ, c[2] NN/NNS after a comparative JJ, c[3] "others".
    c = [0,0,0,0]
    
    for root, dirs, files in os.walk(PATH):
        for f in files:
            with open(os.path.join(root, f), "r") as document:
                current_word_id = 0
                # markable_id is shared by the comparison and the coref markable files.
                markable_id = 0
                # Word id of the most recent comparative word; -1 until one is seen.
                comp_id = -1
                set_id = 0
                
                jjr_count = 0
                rbr_count = 0
                jj_count = 0
                
                # NOTE(review): the output files are opened without `with`; they are closed
                # explicitly further down, but not if an exception is raised in between.
                mmax = open(TARGET_PATH+f+".mmax",'w')
                
                mmax.write('<?xml version="1.0"?>\n<mmax_project>\n<sentences></sentences>\n<words>{}.xml</words>\n<gestures></gestures>\n<keyactions></keyactions>\n</mmax_project>'.format(f))
                
                mmax.close()
                
                xml = open(TARGET_PATH+"Basedata/"+f+".xml",'w')
                
                xml.write('<?xml version="1.0" encoding="US-ASCII"?>\n<!DOCTYPE words SYSTEM "words.dtd">\n<words>')
                
                comp_level = open(TARGET_PATH+"Markables/"+f+"_comparison_level.xml",'w')
                comp_level.write('<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE markables SYSTEM "markables.dtd">\n<markables xmlns="www.eml.org/NameSpaces/comparison">')
                
                coref_level = open(TARGET_PATH+"Markables/"+f+"_coref_level.xml",'w')
                coref_level.write('<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE markables SYSTEM "markables.dtd">\n<markables xmlns="www.eml.org/NameSpaces/coref">')
                
                
                
                last_pos = ""
                last_word = ""
                # Constituent labels collected from the parse column.
                stack = list()
                # Maps a coref chain id (digit string) to the word ids where a mention opened.
                coref_stack = dict()
                np_with_jj = False
                # NOTE(review): comp_word and buff are not read anywhere in the visible lines.
                comp_word = None
                # True between a comparative hit and the close of the NP that contains it.
                in_comp = 0
                buff="" #buffer saves annotation after * for next line
                
                for line in document:
                    if line.strip():
                        
@@ -40,23 +60,102 @@ def format():

                        # NOTE(review): `cols` is assigned in lines elided from the diff.
                        # cols[3] is written without XML escaping, so a token such as "&" or
                        # "<" would presumably yield malformed XML — confirm.
                        xml.write('<word id="word_{}">{}</word>\n'.format(current_word_id,cols[3]))
                        
                        # NOTE(review): the next three branches only count and print; they
                        # repeat the tag tests of the c[...] branches further down and are
                        # presumably the removed side of the diff.
                        if cols[4] == "JJR":
                            jjr_count += 1
                            print("JJR")
                        elif cols[4] == "JJ" and last_pos == "RBR":
                            rbr_count += 1
                            print("RBR-JJ")
                        elif (cols[4] == "NN" or cols[4] == "NNS") and last_pos == "JJ" and last_word in COMPARATIVES:
                            jj_count += 1
                            print("JJ-NN")
                        
                        # cols[5] is used as the bracketed parse annotation of this line.
                        synt = cols[5]
                        
                        new_tags = re.findall("[A-Z]+", synt)
                        # open_brackets is not read in the visible lines.
                        open_brackets = synt.count("(")
                        closed_brackets = synt.count(")")
                        
                        stack+=new_tags
                        
                        if "NP" in new_tags:
                            np_start_word_id = current_word_id
                            np_with_jj = False
                        
                        # When this line closes an NP that contains the pending comparative
                        # word, emit an "anaphora" markable spanning the whole NP. It uses the
                        # current set_id (the one the comparative markable was written with)
                        # and then advances set_id.
                        if closed_brackets and "NP" in stack[-1*closed_brackets:] and in_comp and np_start_word_id <= comp_id <= current_word_id:
                            comp_level.write('<markable id="markable_{}" span="word_{}..word_{}" comparison_type="anaphora" comparison_class="set_{}" mmax_level="comparison"/>\n'.format(markable_id, np_start_word_id, current_word_id, set_id))
                            in_comp = False
                            markable_id += 1
                            set_id += 1
                    
                        
                        # Remember a comparative adjective (COMPARATIVES) for the NN/NNS branch below.
                        if cols[4] == "JJ" and cols[3] in COMPARATIVES:
                            np_with_jj = True
                            comp_id = current_word_id
                                
                        # Each branch below writes one "comparative" markable for its hit.
                        if cols[4] == "JJR" and "NP" in stack:
                            set_id += 1
                            comp_id = current_word_id
                            in_comp = True
                            c[0] += 1
                            comp_level.write('<markable id="markable_{}" span="word_{}" comparative_type="anaphoric"  comparison_type="comparative"  comparison_class="set_{}"  mmax_level="comparison" />\n'.format(markable_id, current_word_id, set_id))
                            markable_id += 1
                        elif cols[4] == "JJ" and last_pos == "RBR" and "NP" in stack:
                            set_id += 1
                            comp_id = current_word_id
                            in_comp = True
                            c[1] += 1
                            # The span covers the preceding RBR token and this JJ token.
                            comp_level.write('<markable id="markable_{}" span="word_{}..word_{}" comparative_type="anaphoric"  comparison_type="comparative"  comparison_class="set_{}"  mmax_level="comparison" />\n'.format(markable_id, current_word_id-1, current_word_id ,set_id))
                            markable_id += 1
                        elif (cols[4] == "NN" or cols[4] == "NNS") and np_with_jj:
                            set_id += 1
                            in_comp = True
                            c[2] += 1
                            # The span is the remembered adjective (comp_id), not this noun.
                            comp_level.write('<markable id="markable_{}" span="word_{}" comparative_type="anaphoric"  comparison_type="comparative"  comparison_class="set_{}"  mmax_level="comparison" />\n'.format(markable_id, comp_id, set_id))
                            markable_id += 1
                            np_with_jj = False
                        elif cols[3].lower() == "others":
                            set_id += 1
                            comp_id = current_word_id
                            c[3] += 1
                            # NOTE(review): both markables below are written with the same
                            # markable_id, because it is incremented only once afterwards.
                            comp_level.write('<markable id="markable_{}" span="word_{}" comparative_type="anaphoric"  comparison_type="comparative"  comparison_class="set_{}"  mmax_level="comparison" />\n'.format(markable_id, current_word_id, set_id))
                            comp_level.write('<markable id="markable_{}" span="word_{}" comparison_type="anaphora" comparison_class="set_{}" mmax_level="comparison"/>\n'.format(markable_id, current_word_id, set_id))
                            markable_id += 1
                            set_id += 1
                            
                        last_pos = cols[4]
                        last_word = cols[3]
                        
                        # Drop one stack entry per ")" found on this line.
                        if closed_brackets > 0:
                            del stack[-1*closed_brackets:]
                        
                        
                        # The last column is parsed as "|"-separated coref annotations, each
                        # made of a chain id in digits plus an optional "(" and/or ")".
                        coref_anno = cols[-1]
                        
                        for coref in coref_anno.split("|"):
                            start = False
                            end = False
                            buffr = ""
                            for i, char in enumerate(coref):
                                if char == "(":
                                    start = True
                                elif char == ")":
                                    end = True
                                elif char in "0123456789":
                                    buffr += char
                            # NOTE(review): unlike the comparison markables, the coref markables
                            # are written without a trailing newline.
                            if start and end:
                                # Mention opens and closes on this token: single-word markable.
                                coref_level.write('<markable id="markable_{}" span="word_{}" agreement="none"  np_form="none"  coref_class="set_{}"  mmax_level="coref"  semantic_class="none"  type="none"  grammatical_role="none" />'.format(markable_id, current_word_id, buffr))
                                markable_id += 1
                            elif start:
                                # Mention opens here: remember the start word id for this chain.
                                if buffr not in coref_stack:
                                    coref_stack[buffr] = [current_word_id]
                                else:
                                    coref_stack[buffr].append(current_word_id)
                            elif end:
                                # NOTE(review): the start id is read with [-1] but never removed,
                                # so two nested open mentions of the same chain would both
                                # resolve to the inner start; a ")" with no recorded start for
                                # that chain id raises KeyError.
                                coref_level.write('<markable id="markable_{}" span="word_{}..word_{}" agreement="none"  np_form="none"  coref_class="set_{}"  mmax_level="coref"  semantic_class="none"  type="none"  grammatical_role="none" />'.format(markable_id, coref_stack[buffr][-1], current_word_id, buffr))
                                markable_id += 1
                        
                        current_word_id += 1
                        
                # Close the root elements and the three per-file outputs.
                xml.write("</words>")
                xml.close()
                comp_level.write("</markables>")
                comp_level.close()
                coref_level.write("</markables>")
                coref_level.close()
                
    # NOTE(review): the two print lines below are at different indentation levels;
    # presumably the first is the removed line and the second the added one. As shown,
    # `print(c)` sits at the indentation of the body of the outer `for root, ...` loop,
    # so it would run once per directory visited by os.walk — confirm this is intended.
    print(jjr_count, rbr_count, jj_count)
        print(c)
    
# Script entry point: run the conversion when the file is executed directly.
# Here `format` refers to the function defined in this module, not the builtin.
if __name__ == "__main__":
    format()