Commit 58377f71 authored by axtimhaus's avatar axtimhaus
Browse files

Add formatting script for conll to annotation format

parent 5798238d
Loading
Loading
Loading
Loading

comparatives/format.py

0 → 100644
+62 −0
Original line number Diff line number Diff line
"""
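Converts CoNLL-formatted corpus documents into MMAX annotation files
(.mmax project files plus Basedata word files) and counts the
comparative constructions found along the way.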
"""

import os
import sys

# Root directory that is walked recursively for input documents.
PATH = "/proj/zimmermann/comparatives/corpus/"
# Output directory: receives the .mmax project files; the per-document word
# files are written below its "Basedata/" subdirectory.
TARGET_PATH = "/proj/zimmermann/comparatives/annotation/OntoNotes/"
# Word forms that, when tagged JJ and directly followed by an NN/NNS token,
# are counted as a "JJ-NN" hit.
COMPARATIVES = ["other", "similar", "comparable", "different", "additional", "extra"]

def _comparative_kind(pos, last_pos, last_word, comparatives):
    """Return the label of the pattern completed by the current token.

    Args:
        pos: Part-of-speech tag of the current token.
        last_pos: Tag of the preceding token ("" at the start of a document).
        last_word: Word form of the preceding token.
        comparatives: Word forms that qualify a JJ + NN/NNS pair as a hit.

    Returns:
        "JJR", "RBR-JJ" or "JJ-NN" when a pattern matches, otherwise None.
    """
    if pos == "JJR":
        return "JJR"
    if pos == "JJ" and last_pos == "RBR":
        return "RBR-JJ"
    if pos in ("NN", "NNS") and last_pos == "JJ" and last_word in comparatives:
        return "JJ-NN"
    return None


def format(path=None, target_path=None, comparatives=None):
    """Convert every document below *path* into MMAX project and word files.

    For each input file ``name`` two files are written:
    ``<target_path>/name.mmax`` (the project file) and
    ``<target_path>/Basedata/name.xml`` (one ``<word>`` element per token,
    numbered ``word_0``, ``word_1``, ... within the document). While
    converting, three token patterns are counted and their label is printed
    on every hit: ``JJR``, ``RBR-JJ`` (JJ preceded by RBR) and ``JJ-NN``
    (NN/NNS preceded by a JJ token whose word form is in *comparatives*).
    The totals over all documents are printed at the end.

    Input lines are split on whitespace; column 3 is used as the word form
    and column 4 as the part-of-speech tag. Blank lines and lines starting
    with ``#`` are ignored. Lines with fewer than five columns are reported
    on stderr and skipped.

    NOTE: the name shadows the builtin ``format``; it is kept because it is
    the module's public entry point.
    NOTE(review): output files are named after the bare file name, so two
    inputs with the same name in different subdirectories overwrite each
    other -- confirm the corpus layout rules this out.
    NOTE(review): the "previous token" is not reset at sentence boundaries
    (blank lines), so a pattern can span two sentences -- confirm intended.

    Args:
        path: Root directory of the input corpus. Defaults to ``PATH``.
        target_path: Output directory. Defaults to ``TARGET_PATH``.
        comparatives: Word forms for the ``JJ-NN`` pattern. Defaults to
            ``COMPARATIVES``.

    Returns:
        None. Results are written to disk and to stdout.

    Raises:
        OSError: If an input file cannot be read or an output file or
            directory cannot be created.
    """
    # Local import so the block does not depend on a file-level import.
    from xml.sax.saxutils import escape

    source_root = PATH if path is None else path
    target_root = TARGET_PATH if target_path is None else target_path
    wanted = COMPARATIVES if comparatives is None else comparatives

    basedata_dir = os.path.join(target_root, "Basedata")
    os.makedirs(basedata_dir, exist_ok=True)

    # Totals are kept outside the file loop so the final report covers the
    # whole corpus and is defined even when no file is found.
    counts = {"JJR": 0, "RBR-JJ": 0, "JJ-NN": 0}

    for root, _dirs, files in os.walk(source_root):
        for f in files:
            with open(os.path.join(target_root, f + ".mmax"), "w") as mmax:
                mmax.write('<?xml version="1.0"?>\n<mmax_project>\n<sentences></sentences>\n<words>{}.xml</words>\n<gestures></gestures>\n<keyactions></keyactions>\n</mmax_project>'.format(escape(f)))

            source_file = os.path.join(root, f)
            words_file = os.path.join(basedata_dir, f + ".xml")
            # The XML header declares US-ASCII, so anything outside ASCII is
            # written as a numeric character reference to keep the file valid.
            with open(source_file, "r") as document, \
                    open(words_file, "w", encoding="ascii",
                         errors="xmlcharrefreplace") as words:
                words.write('<?xml version="1.0" encoding="US-ASCII"?>\n<!DOCTYPE words SYSTEM "words.dtd">\n<words>')

                current_word_id = 0
                last_pos = ""
                last_word = ""
                for line_number, line in enumerate(document, 1):
                    stripped = line.strip()
                    if not stripped or line.startswith("#"):
                        continue

                    cols = stripped.split()
                    if len(cols) < 5:
                        print("{}:{}: skipping line with fewer than 5 columns".format(
                            source_file, line_number), file=sys.stderr)
                        continue
                    word, pos = cols[3], cols[4]

                    # Escape &, < and > so the token cannot break the markup.
                    words.write('<word id="word_{}">{}</word>\n'.format(
                        current_word_id, escape(word)))
                    current_word_id += 1  # every token needs its own id

                    kind = _comparative_kind(pos, last_pos, last_word, wanted)
                    if kind is not None:
                        counts[kind] += 1
                        print(kind)

                    last_pos = pos
                    last_word = word

                words.write("</words>")

    print(counts["JJR"], counts["RBR-JJ"], counts["JJ-NN"])

# Run the conversion only when the file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    format()