diff --git a/Cora_Preprocessing/cora/README b/Cora_Preprocessing/cora_data/README similarity index 100% rename from Cora_Preprocessing/cora/README rename to Cora_Preprocessing/cora_data/README diff --git a/Cora_Preprocessing/cora/cora.cites b/Cora_Preprocessing/cora_data/cora.cites similarity index 100% rename from Cora_Preprocessing/cora/cora.cites rename to Cora_Preprocessing/cora_data/cora.cites diff --git a/Cora_Preprocessing/cora/cora.content b/Cora_Preprocessing/cora_data/cora.content similarity index 100% rename from Cora_Preprocessing/cora/cora.content rename to Cora_Preprocessing/cora_data/cora.content diff --git a/Cora_Preprocessing/cora/cora_small.content b/Cora_Preprocessing/cora_data/cora_small.content similarity index 100% rename from Cora_Preprocessing/cora/cora_small.content rename to Cora_Preprocessing/cora_data/cora_small.content diff --git a/Senseval_Prep/senseval_preprocessing.py b/Senseval_Prep/senseval_preprocessing.py index 8a0a5670e36fbd922b4bfd3f81f4908e241468ef..13f45b0621358bc420c12a256209409dad699d3c 100644 --- a/Senseval_Prep/senseval_preprocessing.py +++ b/Senseval_Prep/senseval_preprocessing.py @@ -1,3 +1,18 @@ +#!/usr/bin/env python3 + +""" +@author: Utaemon Toyota +@date: 25.2.2019 +@project: Software Projekt @ Heidelberg University, Institute for Computational Linguistics +@members: Nadia Arslan, Lyuba Dimitrova, Nicolas Weber, Utaemon Toyota +@required data: Senseval english-all-word test data and their penn treebank files in the same directory. +@usage: python3 senseval_preprocessing.py [-s] [-g] [-v] + -s / --stopwords Path to txt-file with stopwords + -g / --gloss Path to txt-file with gloss mappings + -v / --version valid input: 2 or 3 for senseval 2 / 3 +""" + +import argparse import re import pickle as pkl from nltk.stem import WordNetLemmatizer @@ -228,7 +243,7 @@ def get_sats(tokens, info): new_info.append(info[idx]) return [new_tokens, new_info] -def write_pkl(version = 2, stop_path="stopwords.txt", gloss_path = "gloss_mapping.txt"): +def write_pkl(version = 3, stop_path="stopwords.txt", gloss_path = "gloss_mapping.txt"): file_path = "" tree_path = "" if version == 2: @@ -245,4 +260,9 @@ def write_pkl(version = 2, stop_path="stopwords.txt", gloss_path = "gloss_mappin print (key, "Done") if __name__ == "__main__": - write_pkl(version=3) + parser = argparse.ArgumentParser(description="Senseval Preprocessing script.") + parser.add_argument("-s", "--stopwords", default="stopwords.txt", help="path to stopwords-txt-file") + parser.add_argument("-g", "--gloss", default="gloss_mapping.txt", help = "path to gloss mapping txt-file") + parser.add_argument("-v", "--version", default = 3, help="2 or 3 for senseval version") + args = parser.parse_args() + write_pkl(version=int(args.version), stop_path=args.stopwords, gloss_path=args.gloss)