diff --git a/Senseval_Prep/README.md b/Senseval_Prep/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a650bf425f1261ab94495beafbb41923ca37a5a6 --- /dev/null +++ b/Senseval_Prep/README.md @@ -0,0 +1,44 @@ +# AUTHORS +Lyuba Dimitrova, Nadia Arslan, Nicolas Weber, Utaemon Toyota + +# PROJECT +Softwareprojekt WS2018/19 +Betreuerin: Prof. Dr. Anette Frank +Graph Embedding Propagation + +# Senseval Preprocessing + +This is an implementation to provide preprocessed data for our Word Sense Disambiguation Method 2. The script produces one pkl-file for each document in Senseval2/3, named after the document. +From the provided Senseval English all-words test data and their Penn Treebank annotations, only the useful information is extracted. Lemmas that are not included in the gloss mappings, or that are listed in the stopwords, are deleted. For multiword expressions, only the tag for the head token is saved. Information about their satellites is discarded. +The resulting pickle file contains 2 lists. The first one contains information about each lemma and its tags in a list: [lemma, Penntreebank-tag, wordnet-tag, spacy-tag]. The second one contains the information whether the corresponding entry is a head, a satellite or None: ['head', {'id': ['d000.s000.t001']}]. + +# Provided data +Senseval2 +- Senseval 2 english-all-words test data +- Senseval 2 Penn Treebank data for the test documents (wsj_0089.mrg, wsj_0465.mrg, wsj_1286.mrg) +- Results / Gold mappings for Senseval2 + +Senseval3 +- Senseval 3 english-all-words test data +- Senseval 3 Penn Treebank data for the test documents (cl23.mrg,
wsj_1695.mrg, wsj_1778.mrg) +- Results / Gold mappings for Senseval3 + +gloss_mapping.txt +- Copied from WordNet_Preprocessing + +stopwords.txt +- includes stopwords, which will be filtered out + +Python3 script +- senseval_preprocessing.py + +## Dependencies +re - for regular expression matching +pickle - for saving the resulting lists in a pkl-file +nltk - WordNetLemmatizer from NLTK for lemmatizing + +## Running Instructions +python3 senseval_preprocessing.py [-s] [-g] [-v] + -s / --stopwords Path to txt-file with stopwords + -g / --gloss Path to txt-file with gloss mappings + -v / --version valid input: 2 or 3 for senseval 2 / 3 diff --git a/Senseval_Prep/key_senseval2 b/Senseval_Prep/Senseval2/key_senseval2 similarity index 100% rename from Senseval_Prep/key_senseval2 rename to Senseval_Prep/Senseval2/key_senseval2 diff --git a/Senseval_Prep/EnglishAW.test.key b/Senseval_Prep/Senseval3/EnglishAW.test.key similarity index 100% rename from Senseval_Prep/EnglishAW.test.key rename to Senseval_Prep/Senseval3/EnglishAW.test.key diff --git a/Senseval_Prep/Senseval3/d000.pkl b/Senseval_Prep/Senseval3/d000.pkl new file mode 100644 index 0000000000000000000000000000000000000000..65d1f7d0d9a5e5741261cca940ad2499fefe9607 Binary files /dev/null and b/Senseval_Prep/Senseval3/d000.pkl differ diff --git a/Senseval_Prep/Senseval3/d001.pkl b/Senseval_Prep/Senseval3/d001.pkl new file mode 100644 index 0000000000000000000000000000000000000000..fcea20bb35d1084d6aa8874ebdad1727b8cf3b3b Binary files /dev/null and b/Senseval_Prep/Senseval3/d001.pkl differ diff --git a/Senseval_Prep/Senseval3/d002.pkl b/Senseval_Prep/Senseval3/d002.pkl new file mode 100644 index 0000000000000000000000000000000000000000..36ef571fc617ee8f86d64c9bcf92ae8a212d0f93 Binary files /dev/null and b/Senseval_Prep/Senseval3/d002.pkl differ diff --git a/Senseval_Prep/Senseval3/gloss_mapping.txt b/Senseval_Prep/gloss_mapping.txt similarity index 100% rename from Senseval_Prep/Senseval3/gloss_mapping.txt rename to 
Senseval_Prep/gloss_mapping.txt diff --git a/Senseval_Prep/senseval_preprocessing.py b/Senseval_Prep/senseval_preprocessing.py index 13f45b0621358bc420c12a256209409dad699d3c..4389e11e96c129eb54a22bcbbd0da5a2fc9992c6 100644 --- a/Senseval_Prep/senseval_preprocessing.py +++ b/Senseval_Prep/senseval_preprocessing.py @@ -18,11 +18,11 @@ import pickle as pkl from nltk.stem import WordNetLemmatizer wnl = WordNetLemmatizer() -file_path2 = "eng-all-words_seneval2.test.xml" #senseval2 -file_path3 = "english-all-words.xml" #senseval3 +file_path2 = "Senseval2/eng-all-words_seneval2.test.xml" #senseval2 +file_path3 = "Senseval3/english-all-words.xml" #senseval3 -tree_paths2 = {"d00": "wsj_0089.mrg", "d01": "wsj_0465.mrg", "d02": "wsj_1286.mrg"} #senseval2 -tree_paths3 = {"d000": "cl23.mrg", "d001": "wsj_1695.mrg", "d002":"wsj_1778.mrg"} #senseval3 +tree_paths2 = {"d00": "Senseval2/wsj_0089.mrg", "d01": "Senseval2/wsj_0465.mrg", "d02": "Senseval2/wsj_1286.mrg"} #senseval2 +tree_paths3 = {"d000": "Senseval3/cl23.mrg", "d001": "Senseval3/wsj_1695.mrg", "d002":"Senseval3/wsj_1778.mrg"} #senseval3 def get_stopword_list(stop_path): with open (stop_path, "r") as f: diff --git a/Senseval_Prep/Senseval3/stopwords.txt b/Senseval_Prep/stopwords.txt similarity index 99% rename from Senseval_Prep/Senseval3/stopwords.txt rename to Senseval_Prep/stopwords.txt index 0d3200664e07114109d3c299b33387cee0b7c5cd..4be5c8ca8accd90c89fa467b837be4c0aceb7650 100644 --- a/Senseval_Prep/Senseval3/stopwords.txt +++ b/Senseval_Prep/stopwords.txt @@ -1,5 +1,6 @@ n't 'm +whether i me my