Commit 8579d04f authored by axtimhaus's avatar axtimhaus
Browse files

Add code to fetch documents (probably) containing comparatives

parent 13be835a
Loading
Loading
Loading
Loading

comparatives/fetch.py

0 → 100644
+74 −0
Original line number Diff line number Diff line
"""
Fetch every OntoNotes (CoNLL-2012) document part that probably contains a
comparative construction and copy it into the target corpus directory.
"""

import os
import sys

# Root of the corpus tree; fetch() walks it recursively looking for
# "*.v4_gold_conll" files.
PATH = "/resources/corpora/multilingual/ontonotes-5.0-conll-2012/conll-2012/v4/data/train/data/english/"

# Output location; fetch() prepends it to every file name it writes.
TARGET_PATH = "/proj/zimmermann/comparatives/corpus/train/"

# Adjectives that fetch() treats as comparative markers when they are tagged
# JJ and immediately followed by a noun (NN / NNS).
COMPARATIVES = [
    "other",
    "similar",
    "comparable",
    "different",
    "additional",
    "extra",
]

def _write_part(target_dir, hits, file_name, part_id, lines):
    """Write one document part to disk if it contains any comparative.

    The output file is named "<hits>_<file_name>_<part_id>" inside
    ``target_dir``. Returns the path written, or None when ``hits`` is zero
    and nothing is written.
    """
    if not hits:
        return None
    target = os.path.join(target_dir, "{}_{}_{}".format(hits, file_name, part_id))
    # The context manager guarantees the handle is flushed and closed.
    with open(target, "w", encoding="utf-8") as out:
        out.write("".join(lines))
    print("Writing to "+target+".")
    return target


def fetch(path=None, target_path=None, comparatives=None):
    """Copy every document part that contains a comparative construction.

    Walks ``path`` recursively, reads each "*.v4_gold_conll" file and splits
    it into parts using the part number in the second column. A part is
    written to ``target_path`` when at least one of these patterns occurs
    (POS tag is read from the fifth column, the word from the fourth):

    * a token tagged JJR;
    * a token tagged RBR directly followed by one tagged JJ;
    * a JJ token whose word is in ``comparatives`` directly followed by a
      token tagged NN or NNS.

    The two-token patterns never span a sentence boundary (blank line) or a
    part boundary, and the lexicon lookup is case-insensitive so that
    sentence-initial forms such as "Other" are found.

    Args:
        path: Corpus root to walk. Defaults to the module-level PATH.
        target_path: Directory receiving the output files. Defaults to the
            module-level TARGET_PATH. It must already exist.
        comparatives: Iterable of comparative-like adjectives. Defaults to
            the module-level COMPARATIVES.

    Returns:
        A tuple ``(jjr_count, rbr_count, jj_count)`` with the number of
        matches per pattern; the same three numbers are printed.

    Raises:
        IndexError, ValueError: If a non-comment, non-blank line has fewer
            than five columns or a non-integer part number.
        OSError: If a file cannot be read or written.
    """
    # Resolve defaults at call time so the module constants stay the single
    # source of truth for the script entry point.
    source_root = PATH if path is None else path
    output_dir = TARGET_PATH if target_path is None else target_path
    lexicon = frozenset(
        word.lower()
        for word in (COMPARATIVES if comparatives is None else comparatives)
    )

    jjr_count = 0
    rbr_count = 0
    jj_count = 0
    for root, _dirs, files in os.walk(source_root):
        for f in files:
            if not f.endswith(".v4_gold_conll"):
                continue
            with open(os.path.join(root, f), "r", encoding="utf-8") as document:
                contains_comparative = 0
                part_lines = []
                current_doc_id = 0
                last_pos = ""
                last_word = ""
                for line in document:
                    if not line.strip():
                        # Blank line = sentence boundary: keep it in the
                        # output but forget the previous token so bigram
                        # patterns cannot straddle two sentences.
                        part_lines.append(line)
                        last_pos = ""
                        last_word = ""
                        continue

                    # "#begin document" / "#end document" markers are not
                    # copied to the output.
                    if line.startswith("#"):
                        continue

                    cols = line.split()

                    if int(cols[1]) == current_doc_id + 1:
                        # A new part starts: flush the finished one.
                        _write_part(output_dir, contains_comparative, f,
                                    current_doc_id, part_lines)
                        contains_comparative = 0
                        part_lines = []
                        current_doc_id += 1
                        last_pos = ""
                        last_word = ""

                    part_lines.append(line)

                    pos = cols[4]
                    if pos == "JJR":
                        contains_comparative += 1
                        jjr_count += 1
                        print("JJR")
                    elif pos == "JJ" and last_pos == "RBR":
                        contains_comparative += 1
                        rbr_count += 1
                        print("RBR-JJ")
                    elif (pos in ("NN", "NNS") and last_pos == "JJ"
                          and last_word.lower() in lexicon):
                        contains_comparative += 1
                        jj_count += 1
                        print("JJ-NN")

                    last_pos = pos
                    last_word = cols[3]

                # The last part of the file has no following part to trigger
                # the flush above.
                _write_part(output_dir, contains_comparative, f,
                            current_doc_id, part_lines)

    print(jjr_count, rbr_count, jj_count)
    return jjr_count, rbr_count, jj_count

# Script entry point: run the extraction only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    fetch()