Commit 724bff2b authored by toyota

pack Senseval_Prep and add README

parent 1c2c2453
# AUTHORS
Lyuba Dimitrova, Nadia Arslan, Nicolas Weber, Utaemon Toyota
# PROJECT
Software project, winter term 2018/19
Supervisor: Prof. Dr. Anette Frank
Graph Embedding Propagation
# Senseval Preprocessing
This is an implementation that provides preprocessed data for our Word Sense Disambiguation Method 2. The script produces one pkl file per document in Senseval 2/3, named after the document.
From the provided Senseval English all-words test data and their Penn Treebank annotations, only the useful information is kept. Lemmas that are not included in the gloss mappings, or that are listed in the stopwords, are deleted. For multiword expressions, only the tag of the head token is saved; information about the satellites is discarded.
The resulting pickle file contains two lists. The first one holds, for each lemma, a list of the form [lemma, Penntreebank-tag, wordnet-tag, spacy-tag]. The second one records whether the token is a head, a satellite, or None, e.g. ['head', {'id': ['d000.s000.t001']}].
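For illustration, loading one of the produced pkl files could look like the sketch below. The file name "d000.pkl" is hypothetical (the script names each file after its Senseval document), and we assume both lists were pickled together as one pair.

```python
import pickle as pkl

# Hypothetical output name; the script writes one pkl file per Senseval document.
# Assumption: the two lists described above are stored together as a pair.
with open("d000.pkl", "rb") as f:
    token_info, head_info = pkl.load(f)

# token_info[i] has the form [lemma, Penntreebank-tag, wordnet-tag, spacy-tag]
# head_info[i] is e.g. ['head', {'id': ['d000.s000.t001']}], a satellite entry, or None
print(token_info[0])
print(head_info[0])
```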
# Provided data
Senseval2
- Senseval 2 English all-words test data
- Senseval 2 Penn Treebank data for the test documents (wsj_0089.mrg, wsj_0465.mrg, wsj_1286.mrg)
- Results / gold mappings for Senseval 2
Senseval3
- Senseval 3 English all-words test data
- Senseval 3 Penn Treebank data for the test documents (cl23.mrg, wsj_1695.mrg, wsj_1778.mrg)
- Results / gold mappings for Senseval 3
gloss_mapping.txt
- Copied from WordNet_Preprocessing
stopwords.txt
- Includes the stopwords that will be filtered out
Python3 script
- senseval_preprocessing.py
## Dependencies
- re - for regular expression matching
- pickle - for saving the resulting lists in a pkl file
- nltk - WordNetLemmatizer from NLTK, for lemmatizing
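As an illustration of the nltk dependency, the WordNet lemmatizer is used in the standard way (generic NLTK usage, not project-specific code; the wordnet corpus must be downloaded once via nltk.download("wordnet")):

```python
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
print(wnl.lemmatize("churches", pos="n"))  # -> church
print(wnl.lemmatize("running", pos="v"))   # -> run
```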
## Running Instructions
python3 senseval_preprocessing.py [-s] [-g] [-v]
-s / --stopwords Path to a txt file with stopwords
-g / --gloss Path to a txt file with gloss mappings
-v / --version Valid input: 2 or 3, for Senseval 2 / Senseval 3
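For example, a run over the Senseval 2 data with the files provided in this repository could look like:

python3 senseval_preprocessing.py -s stopwords.txt -g gloss_mapping.txt -v 2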
@@ -18,11 +18,11 @@ import pickle as pkl
 from nltk.stem import WordNetLemmatizer
 wnl = WordNetLemmatizer()
-file_path2 = "eng-all-words_seneval2.test.xml" #senseval2
-file_path3 = "english-all-words.xml" #senseval3
+file_path2 = "Senseval2/eng-all-words_seneval2.test.xml" #senseval2
+file_path3 = "Senseval3/english-all-words.xml" #senseval3
-tree_paths2 = {"d00": "wsj_0089.mrg", "d01": "wsj_0465.mrg", "d02": "wsj_1286.mrg"} #senseval2
-tree_paths3 = {"d000": "cl23.mrg", "d001": "wsj_1695.mrg", "d002":"wsj_1778.mrg"} #senseval3
+tree_paths2 = {"d00": "Senseval2/wsj_0089.mrg", "d01": "Senseval2/wsj_0465.mrg", "d02": "Senseval2/wsj_1286.mrg"} #senseval2
+tree_paths3 = {"d000": "Senseval3/cl23.mrg", "d001": "Senseval3/wsj_1695.mrg", "d002":"Senseval3/wsj_1778.mrg"} #senseval3
 def get_stopword_list(stop_path):
     with open (stop_path, "r") as f:
......
n't
'm
whether
i
me
my
......
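The diff view truncates the body of get_stopword_list. A plausible sketch of what it presumably does, reading one stopword per line as in the stopwords.txt excerpt above, is given below; this is an assumption, not the actual implementation.

```python
def get_stopword_list(stop_path):
    # Assumed behavior: read one stopword per line into a set for fast membership tests.
    with open(stop_path, "r") as f:
        return {line.strip() for line in f if line.strip()}
```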