adapt paths

384423aa · toyota · ba50f26c · 384423aa · 384423aa · 384423aa
Commit 384423aa authored 6 years ago by toyota
--- a/LICENSE
+++ b/LICENSE
+This software is distributed under the MIT License.
+
+MIT License
+
+Copyright (c) 2019 Nadia Arslan, Lyuba Dimitrova, Utaemon Toyota, Nicolas Weber
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
--- a/scripts/node_classification/nc_experiment.py
+++ b/scripts/node_classification/nc_experiment.py
 #!/usr/bin/env python3

 """
-@author: Utaemon Toyota
-@date: 31.1.2019
 @project: Software Projekt @ Heidelberg University, Institute for Computational Linguistics
 @requirements: cora.py and random_nodes_for_node_classification.py as well as the cora data
 @usage: python3 node_classification.py [-g] [-e] [-s] [-i] [-n]
@@ -62,7 +60,7 @@ def get_random_num_nodes(set_elm, num, seed):
    random.seed(seed)
    return set(random.sample(set_elm, num))

-def get_num_random_nodes_for_all_classes_read(path = "graph.pkl", num = 20, seed = 1):
+def get_num_random_nodes_for_all_classes_read(path = "/../../data/cora/graph/cora_graph.pkl", num = 20, seed = 1):
    """get specific number of nodes per class, same number for all classes"""
    cora_dict = dict_of_node_classes_read(path)
    sampled_random_id_set = set()
@@ -73,7 +71,7 @@ def get_num_random_nodes_for_all_classes_read(path = "graph.pkl", num = 20, seed


 #------------------------classification
-def classify(C, seed = 0, num = 20, num_test_instances = 1000, path_graph = "graph.pkl", path_emb = "cora_embeddings_uniform_m20.pkl"):
+def classify(C, seed = 0, num = 20, num_test_instances = 1000, path_graph = "/../../data/cora/graph/cora_graph.pkl", path_emb = "/../../data/cora/embeddings/merged_node_embeddings.pkl"):
    training_nodes = training(path_graph, seed=seed, num = num)
    emb = get_embeddings(path_emb)
    cl = get_class_list(path_graph)
@@ -100,7 +98,7 @@ def classify(C, seed = 0, num = 20, num_test_instances = 1000, path_graph = "gra
    print (score)
    return score

-def classify_func(range_seeds = 10, num = 20, num_test_instances = 1000, path_graph = "graph.pkl", path_emb = "cora_embeddings_uniform_m20.pkl"):
+def classify_func(range_seeds = 10, num = 20, num_test_instances = 1000, path_graph = "/../../data/cora/graph/cora_graph.pkl", path_emb = "/../../data/cora/embeddings/merged_node_embeddings.pkl"):
    C_li = [0.01, 0.1, 0.5, 1.0, 5.0, 10.0]
    for i in range(range_seeds):
        print ("Iteration/Random Seed:", i)
@@ -108,7 +106,7 @@ def classify_func(range_seeds = 10, num = 20, num_test_instances = 1000, path_gr
            classify(C, seed=i, num = num, num_test_instances = num_test_instances, path_graph = path_graph, path_emb = path_emb)

 #------------------------Node Classification
-def node_classification(path_graph = "graph.pkl", path_embeddings = "cora_embeddings_uniform_m20.pkl", num_test_instances = 1000, seed=20, num_per_class = 20, C = 0.1):
+def node_classification(path_graph = "/../../data/cora/graph/cora_graph.pkl", path_embeddings = "/../../data/cora/embeddings/merged_node_embeddings.pkl", num_test_instances = 1000, seed=20, num_per_class = 20, C = 0.1):
    logisticRegr = LogisticRegression(C=C, solver='liblinear', multi_class='ovr')
    training_nodes = training(path_graph, seed=seed, num = num_per_class)
    emb = get_embeddings(path_embeddings)
@@ -140,7 +138,7 @@ def node_classification(path_graph = "graph.pkl", path_embeddings = "cora_embedd
    #print ("Confusion Matrix:\n", conf_matrix)
    return score_macro, conf_matrix

-def node_classification_random_seeds(path_graph = "graph.pkl", path_embeddings = "cora_embeddings_uniform_m20.pkl", num_test_instances = 1000, num_per_class = 20, iterations = 50, C = 0.1):
+def node_classification_random_seeds(path_graph = "/../../data/cora/graph/cora_graph.pkl", path_embeddings = "/../../data/cora/embeddings/merged_node_embeddings.pkl", num_test_instances = 1000, num_per_class = 20, iterations = 50, C = 0.1):
    scores = []
    for i in range (0,iterations):
        scores.append(node_classification(path_graph = path_graph, path_embeddings = path_embeddings, num_test_instances = num_test_instances, seed=i, num_per_class = num_per_class, C=C)[0])
@@ -160,8 +158,8 @@ if __name__ == "__main__":
    # execute only if run as a script

    parser = argparse.ArgumentParser(description="Node Classification script.")
-    parser.add_argument("-g", "--graph", default = "graph.pkl", help="path to graph")
-    parser.add_argument("-e", "--embeddings", default = "cora_embeddings_uniform_m20.pkl", help="path to embeddings")
+    parser.add_argument("-g", "--graph", default = "/../../data/cora/graph/cora_graph.pkl", help="path to graph")
+    parser.add_argument("-e", "--embeddings", default = "/../../data/cora/embeddings/merged_node_embeddings.pkl", help="path to embeddings")
    parser.add_argument("-s", "--seed", type=int, help="random seed for one node classification. If this will be specified, always the function node_classification() will be executed.")
    parser.add_argument("-i", "--iterations", type=int, default = 10, help="number of iterations of node classification. Counter of iteration is random seed.")
    parser.add_argument("-n", "--number", type=int, default = 20, help="number of instances per class for training")

--- a/scripts/preprocessing/cora/cora.py
+++ b/scripts/preprocessing/cora/cora.py
 """
-Getting a networkx graph from Cora. Graph can be saved in txt file. CARE: numpy-arrays are converted to lists due to errors (NumPy array is not JSON serializable).
-Initialize Embeddings for n dimensions with initialize-module.
-Arrays are initialized in normal or uniform random format (default = normal).
-
-
-#Usage
-get_graph(path_nodes="/home/utaemon/SP/cora/cora.content", path_edges="/home/utaemon/SP/cora/cora.cites")
-> return graph with nodes and edges
-To write the graph informations in file:
-def write_graph_to_file(path_nodes="/home/utaemon/SP/cora/cora.content", path_edges="/home/utaemon/SP/cora/cora.cites", path_output_graph = "/home/utaemon/SP/")
-To write the dictionary with initalizing Embeddings in file:
-def write_dict_to_file(rand_type="normal_random", dimension = 128, quantity=1433, path_output_emb = "/home/utaemon/SP/")
+@project: Software Projekt @ Heidelberg University, Institute for Computational Linguistics
+@requirements: cora data, numpy, networkX, pickle
+@info
+Getting a networkx graph from Cora. Graph will be saved in a pickle file.
+
+@usage
+python3 cora.py [-n] [-e] [-o]
+	-n / --nodes	Path to cora file containing nodes
+	-e / --edges	Path to cora file containing edges
+	-o / --output	Path where the graph should be saved
 """

+import argparse
 import networkx as nx
 import numpy as np
 import pickle as pkl
-import os

 def list_of_classes():
    return ["Case_Based", "Genetic_Algorithms", "Neural_Networks", "Probabilistic_Methods", "Reinforcement_Learning", "Rule_Learning", "Theory"]

-def read_file_and_get_nodes(graph_name, path):
+def read_file_and_get_nodes(graph_name, path="/../../data/cora/raw/cora.content"):
    class_list = list_of_classes()
    max_bow_len = 0
    node_mapping = {}
@@ -49,7 +47,7 @@ def read_file_and_get_nodes(graph_name, path):
    graph_name.graph["paper_id"] = {"maxlen": 1, "vocab": (len(graph_name)), "lengths": np.ones(len(graph_name))}
    return node_mapping

-def read_file_and_get_edges(graph_name, node_mapping, path):
+def read_file_and_get_edges(graph_name, node_mapping, path="/../../data/cora/raw/cora.cites"):
    with open(path) as file:
        for line in file.readlines():
            a, b = line.split()
@@ -57,7 +55,6 @@ def read_file_and_get_edges(graph_name, node_mapping, path):

 #---------------------create graph--------------

-
 def get_graph(path_nodes, path_edges):
    Cora_graph = nx.Graph()
    node_mapping = read_file_and_get_nodes(Cora_graph, path_nodes)
@@ -111,26 +108,21 @@ def add_max_values_to_graph(path_nodes, path_edges):                    #update
    Cora_graph.graph["paper_id"]["maxlen_neighbours"] = get_max_neighbours(path_nodes, path_edges)
    return Cora_graph

-# not used, initialization happens in EP
-'''
-def get_init_emb(rand_type="normal_random", dimension = 128, quantity=1433):
-    return initialize.get_embeddings(rand_type=rand_type, dimension = dimension, quantity=quantity)
-'''
-
-def write_pickle_graph_file(path_nodes, path_edges, output_path):
+def write_pickle_graph_file(path_nodes="/../../data/cora/raw/cora.content", path_edges="/../../data/cora/raw/cora.cites", path_output_graph = "/../../data/cora/graph/"):
    g = add_max_values_to_graph(path_nodes, path_edges)
-
-    path = os.path.split(output_path)[0]
-    if not os.path.exists(path):
-        os.mkdir(path)
-
-    with open(output_path, "wb") as output:
+    with open(path_output_graph + "cora_graph.pkl", "wb") as output:
        pkl.dump(g, output)

+def read_pickle_graph(path = "graph.pkl"):				#will be used on node_classification.py for accessing the graph
+    with open(path, 'rb') as f:
+        graph = pkl.load(f)
+    return graph

-'''
 if __name__ == "__main__":
    # execute only if run as a script
-    get_graph(path_nodes="/home/utaemon/SP/cora/cora.content", path_edges="/home/utaemon/SP/cora/cora.cites")
-    # get_init_emb(rand_type="normal_random", dimension = 128, quantity=1433)
-'''
\ No newline at end of file
+    parser = argparse.ArgumentParser(description="Skript for building cora graph.")
+    parser.add_argument("-n", "--nodes", default="/../../data/cora/raw/cora.content", help="path to file containing cora nodes")
+    parser.add_argument("-e", "--edges", default="/../../data/cora/raw/cora.cites", help="path to file containing edges/citations")
+    parser.add_argument("-o", "--output", default="/../../data/cora/graph/", help="path where the graph should be saved")
+    args = parser.parse_args()
+    write_pickle_graph_file(path_nodes=args.nodes, path_edges=args.edges, path_output_graph=args.output)
--- a/scripts/preprocessing/senseval/README.md
+++ b/scripts/preprocessing/senseval/README.md
+# AUTHORS
+Lyuba Dimitrova, Nadia Arslan, Nicolas Weber, Utaemon Toyota
+
+# PROJECT
+Softwareprojekt WS2018/19
+Betreuerin: Prof. Dr. Anette Frank
+Graph Embedding Propagation
+
+# Senseval Preprocessing for Method 1
+
+This implementation provides preprocessed data for our Word Sense Disambiguation Method 1. The script produces JSON files for Senseval-2 and Senseval-3. These files contain lists split by sentence, in which each lemmatized, lowercased word is stored in a tuple together with its corresponding WordNet 3.0 POS tag.
+
+# Senseval Preprocessing for Method 2
+
+This implementation provides preprocessed data for our Word Sense Disambiguation Method 2. The script produces one pkl file for each document in Senseval2/3, named after the document.
+From the provided Senseval english-all-words test data and their Penn Treebank annotations, only the useful information is extracted. Lemmas which are not included in the gloss mappings, or which are listed in the stopwords, are deleted. For multiword expressions, only the tag for the head token is saved; information about their satellites is discarded.
+The resulting pickle file contains two lists. The first one holds each lemma together with its tags in a list: [lemma, Penn Treebank tag, WordNet tag, spaCy tag]. The second one records whether the token is a head, a satellite or None: ['head',  {'id': ['d000.s000.t001']}].
+
+# Provided data
+Senseval2
+- Senseval 2 english-all-words test data
+- Senseval 2 Penn Treebank data for the test documents (wsj_0089.mrg, wsj_0465.mrg, wsj_1286.mrg)
+- Results / Gold mappings for Senseval2
+
+Senseval3
+- Senseval 3 english-all-words test data
+- Senseval 3 Penn Treebank data for the test documents (cl23.mrg, wsj_1695.mrg, wsj_1778.mrg)
+- Results / Gold mappings for Senseval3
+
+gloss_mapping.txt
+- Copied from WordNet_Preprocessing
+
+stopwords.txt
+- includes stopwords, which will be filtered out
+
+Python3 scripts
+- senseval_preprocessing.py
+- preprocess_senseval_method1.py
+
+## Dependencies
+re 	- for regular expression matching
+json	- for saving the results for WSD method 1
+pickle 	- for saving the resulting lists in a pkl-file for WSD method 2
+nltk	- WordNetLemmatizer from NLTK for lemmatizing
+
+## Running Instructions Method 1
+python[3] preprocess_senseval_method1.py
+
+## Running Instructions Method 2
+python[3] senseval_preprocessing.py [-s] [-g] [-v]
+        -s / --stopwords    Path to txt-file with stopwords
+        -g / --gloss        Path to txt-file with gloss mappings
+        -v / --version      valid input: 2 or 3 for senseval 2 / 3