From 1c2c2453f6595c6c0a2f2f18703b6fdc35fb0149 Mon Sep 17 00:00:00 2001 From: Utaemon Toyota <toyota@cl.uni-heidelberg.de> Date: Tue, 26 Feb 2019 00:58:42 +0100 Subject: [PATCH] pack code senseval prep --- Cora_Preprocessing/{cora => cora_data}/README | 0 .../{cora => cora_data}/cora.cites | 0 .../{cora => cora_data}/cora.content | 0 .../{cora => cora_data}/cora_small.content | 0 Senseval_Prep/senseval_preprocessing.py | 24 +++++++++++++++++-- 5 files changed, 22 insertions(+), 2 deletions(-) rename Cora_Preprocessing/{cora => cora_data}/README (100%) rename Cora_Preprocessing/{cora => cora_data}/cora.cites (100%) rename Cora_Preprocessing/{cora => cora_data}/cora.content (100%) rename Cora_Preprocessing/{cora => cora_data}/cora_small.content (100%) diff --git a/Cora_Preprocessing/cora/README b/Cora_Preprocessing/cora_data/README similarity index 100% rename from Cora_Preprocessing/cora/README rename to Cora_Preprocessing/cora_data/README diff --git a/Cora_Preprocessing/cora/cora.cites b/Cora_Preprocessing/cora_data/cora.cites similarity index 100% rename from Cora_Preprocessing/cora/cora.cites rename to Cora_Preprocessing/cora_data/cora.cites diff --git a/Cora_Preprocessing/cora/cora.content b/Cora_Preprocessing/cora_data/cora.content similarity index 100% rename from Cora_Preprocessing/cora/cora.content rename to Cora_Preprocessing/cora_data/cora.content diff --git a/Cora_Preprocessing/cora/cora_small.content b/Cora_Preprocessing/cora_data/cora_small.content similarity index 100% rename from Cora_Preprocessing/cora/cora_small.content rename to Cora_Preprocessing/cora_data/cora_small.content diff --git a/Senseval_Prep/senseval_preprocessing.py b/Senseval_Prep/senseval_preprocessing.py index 8a0a567..13f45b0 100644 --- a/Senseval_Prep/senseval_preprocessing.py +++ b/Senseval_Prep/senseval_preprocessing.py @@ -1,3 +1,18 @@ +#!/usr/bin/env python3 + +""" +@author: Utaemon Toyota +@date: 25.2.2019 +@project: Software Projekt @ Heidelberg University, Institute for 
Computational Linguistics +@members: Nadia Arslan, Lyuba Dimitrova, Nicolas Weber, Utaemon Toyota +@required data: Senseval English all-words test data and their Penn Treebank files in the same directory. +@usage: python3 senseval_preprocessing.py [-s] [-g] [-v] + -s / --stopwords Path to txt-file with stopwords + -g / --gloss Path to txt-file with gloss mappings + -v / --version valid input: 2 or 3 for Senseval 2 / 3 +""" + +import argparse import re import pickle as pkl from nltk.stem import WordNetLemmatizer @@ -228,7 +243,7 @@ def get_sats(tokens, info): new_info.append(info[idx]) return [new_tokens, new_info] -def write_pkl(version = 2, stop_path="stopwords.txt", gloss_path = "gloss_mapping.txt"): +def write_pkl(version = 3, stop_path="stopwords.txt", gloss_path = "gloss_mapping.txt"): file_path = "" tree_path = "" if version == 2: @@ -245,4 +260,9 @@ def write_pkl(version = 2, stop_path="stopwords.txt", gloss_path = "gloss_mappin print (key, "Done") if __name__ == "__main__": - write_pkl(version=3) + parser = argparse.ArgumentParser(description="Senseval Preprocessing script.") + parser.add_argument("-s", "--stopwords", default="stopwords.txt", help="path to stopwords-txt-file") + parser.add_argument("-g", "--gloss", default="gloss_mapping.txt", help = "path to gloss mapping txt-file") + parser.add_argument("-v", "--version", default = 3, help="2 or 3 for senseval version") + args = parser.parse_args() + write_pkl(version=int(args.version), stop_path=args.stopwords, gloss_path=args.gloss) -- GitLab