"""
Fetches all documents including comparative constructions.

Each corpus document is converted into an MMAX2-style project: a
``<name>.mmax`` project file plus a ``Basedata/<name>.xml`` word list.
While converting, candidate comparative constructions are counted from
the part-of-speech column.
"""
import os
import sys
from xml.sax.saxutils import escape

PATH = "/proj/zimmermann/comparatives/corpus/"
TARGET_PATH = "/proj/zimmermann/comparatives/annotation/OntoNotes/"
COMPARATIVES = ["other", "similar", "comparable", "different", "additional", "extra"]

# A token row is indexed at columns 3 (word form) and 4 (POS tag), so it
# needs at least this many whitespace-separated columns.
_MIN_COLUMNS = 5


def _write_mmax_project(target_path, name):
    """Write the ``<name>.mmax`` project file that points at ``<name>.xml``."""
    with open(os.path.join(target_path, name + ".mmax"), 'w') as mmax:
        mmax.write('<?xml version="1.0"?>\n<mmax_project>\n<sentences></sentences>\n<words>{}.xml</words>\n<gestures></gestures>\n<keyactions></keyactions>\n</mmax_project>'.format(name))


def _convert_document(document, words_out):
    """Write the word list for one document and count comparative candidates.

    Args:
        document: iterable of text lines; rows are split on whitespace,
            column 3 is written as the word and column 4 is read as its
            POS tag. Lines starting with ``#`` and rows with fewer than
            five columns are skipped.
        words_out: writable text stream receiving the ``<words>`` XML.

    Returns:
        A ``(jjr_count, rbr_count, jj_count)`` tuple: tokens tagged JJR,
        JJ tokens directly preceded by an RBR token, and NN/NNS tokens
        directly preceded by a JJ token whose form is in COMPARATIVES.
    """
    jjr_count = rbr_count = jj_count = 0
    word_id = 0
    last_pos = ""
    last_word = ""

    words_out.write('<?xml version="1.0" encoding="US-ASCII"?>\n<!DOCTYPE words SYSTEM "words.dtd">\n<words>')
    for line in document:
        if line.startswith("#"):
            continue
        cols = line.split()
        # Blank lines and malformed rows carry no token; skipping them
        # avoids an IndexError that would leave a truncated XML file.
        if len(cols) < _MIN_COLUMNS:
            continue
        word, pos = cols[3], cols[4]

        # Escape XML metacharacters and turn non-ASCII characters into
        # character references so the output matches the declared
        # US-ASCII encoding and stays well-formed.
        text = escape(word).encode("ascii", "xmlcharrefreplace").decode("ascii")
        words_out.write('<word id="word_{}">{}</word>\n'.format(word_id, text))
        # Every token needs its own id; the counter was previously never
        # advanced, so all words ended up as "word_0".
        word_id += 1

        if pos == "JJR":
            jjr_count += 1
            print("JJR")
        elif pos == "JJ" and last_pos == "RBR":
            rbr_count += 1
            print("RBR-JJ")
        elif pos in ("NN", "NNS") and last_pos == "JJ" and last_word in COMPARATIVES:
            jj_count += 1
            print("JJ-NN")

        # The previous-token state is not reset at blank lines, so a pair
        # may span the boundary between two groups of rows.
        last_pos = pos
        last_word = word
    words_out.write("</words>")
    return jjr_count, rbr_count, jj_count


def format(path=PATH, target_path=TARGET_PATH):
    """Convert every file below *path* into MMAX2 files under *target_path*.

    For each file ``f`` found by walking *path*, this writes
    ``<target_path>/f.mmax`` and ``<target_path>/Basedata/f.xml`` and
    prints the three comparative counts for that document.

    Args:
        path: root directory of the input corpus (defaults to PATH).
        target_path: output directory (defaults to TARGET_PATH); its
            ``Basedata`` subdirectory is created when missing.
    """
    basedata_dir = os.path.join(target_path, "Basedata")
    if not os.path.isdir(basedata_dir):
        os.makedirs(basedata_dir)

    for root, dirs, files in os.walk(path):
        for f in files:
            _write_mmax_project(target_path, f)
            words_path = os.path.join(basedata_dir, f + ".xml")
            # Context managers close both files even if a row fails.
            with open(os.path.join(root, f), "r") as document, \
                    open(words_path, 'w') as words_out:
                counts = _convert_document(document, words_out)
            # Counters are per document, so they are reported per document.
            print(*counts)


if __name__ == "__main__":
    format()
"""
Fetches all documents including comparative constructions.

Each corpus document is converted into an MMAX2-style project: a
``<name>.mmax`` project file plus a ``Basedata/<name>.xml`` word list.
While converting, candidate comparative constructions are counted from
the part-of-speech column.
"""
import os
import sys
from xml.sax.saxutils import escape

PATH = "/proj/zimmermann/comparatives/corpus/"
TARGET_PATH = "/proj/zimmermann/comparatives/annotation/OntoNotes/"
COMPARATIVES = ["other", "similar", "comparable", "different", "additional", "extra"]

# A token row is indexed at columns 3 (word form) and 4 (POS tag), so it
# needs at least this many whitespace-separated columns.
_MIN_COLUMNS = 5


def _write_mmax_project(target_path, name):
    """Write the ``<name>.mmax`` project file that points at ``<name>.xml``."""
    with open(os.path.join(target_path, name + ".mmax"), 'w') as mmax:
        mmax.write('<?xml version="1.0"?>\n<mmax_project>\n<sentences></sentences>\n<words>{}.xml</words>\n<gestures></gestures>\n<keyactions></keyactions>\n</mmax_project>'.format(name))


def _convert_document(document, words_out):
    """Write the word list for one document and count comparative candidates.

    Args:
        document: iterable of text lines; rows are split on whitespace,
            column 3 is written as the word and column 4 is read as its
            POS tag. Lines starting with ``#`` and rows with fewer than
            five columns are skipped.
        words_out: writable text stream receiving the ``<words>`` XML.

    Returns:
        A ``(jjr_count, rbr_count, jj_count)`` tuple: tokens tagged JJR,
        JJ tokens directly preceded by an RBR token, and NN/NNS tokens
        directly preceded by a JJ token whose form is in COMPARATIVES.
    """
    jjr_count = rbr_count = jj_count = 0
    word_id = 0
    last_pos = ""
    last_word = ""

    words_out.write('<?xml version="1.0" encoding="US-ASCII"?>\n<!DOCTYPE words SYSTEM "words.dtd">\n<words>')
    for line in document:
        if line.startswith("#"):
            continue
        cols = line.split()
        # Blank lines and malformed rows carry no token; skipping them
        # avoids an IndexError that would leave a truncated XML file.
        if len(cols) < _MIN_COLUMNS:
            continue
        word, pos = cols[3], cols[4]

        # Escape XML metacharacters and turn non-ASCII characters into
        # character references so the output matches the declared
        # US-ASCII encoding and stays well-formed.
        text = escape(word).encode("ascii", "xmlcharrefreplace").decode("ascii")
        words_out.write('<word id="word_{}">{}</word>\n'.format(word_id, text))
        # Every token needs its own id; the counter was previously never
        # advanced, so all words ended up as "word_0".
        word_id += 1

        if pos == "JJR":
            jjr_count += 1
            print("JJR")
        elif pos == "JJ" and last_pos == "RBR":
            rbr_count += 1
            print("RBR-JJ")
        elif pos in ("NN", "NNS") and last_pos == "JJ" and last_word in COMPARATIVES:
            jj_count += 1
            print("JJ-NN")

        # The previous-token state is not reset at blank lines, so a pair
        # may span the boundary between two groups of rows.
        last_pos = pos
        last_word = word
    words_out.write("</words>")
    return jjr_count, rbr_count, jj_count


def format(path=PATH, target_path=TARGET_PATH):
    """Convert every file below *path* into MMAX2 files under *target_path*.

    For each file ``f`` found by walking *path*, this writes
    ``<target_path>/f.mmax`` and ``<target_path>/Basedata/f.xml`` and
    prints the three comparative counts for that document.

    Args:
        path: root directory of the input corpus (defaults to PATH).
        target_path: output directory (defaults to TARGET_PATH); its
            ``Basedata`` subdirectory is created when missing.
    """
    basedata_dir = os.path.join(target_path, "Basedata")
    if not os.path.isdir(basedata_dir):
        os.makedirs(basedata_dir)

    for root, dirs, files in os.walk(path):
        for f in files:
            _write_mmax_project(target_path, f)
            words_path = os.path.join(basedata_dir, f + ".xml")
            # Context managers close both files even if a row fails.
            with open(os.path.join(root, f), "r") as document, \
                    open(words_path, 'w') as words_out:
                counts = _convert_document(document, words_out)
            # Counters are per document, so they are reported per document.
            print(*counts)


if __name__ == "__main__":
    format()