diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..e9a704ee556bf066004e380e3d16532c52d78984
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,20 @@
+This software is distributed under the MIT License.
+
+MIT License
+
+Copyright (c) 2019 Nadia Arslan, Lyuba Dimitrova, Utaemon Toyota, Nicolas Weber
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/scripts/node_classification/nc_experiment.py b/scripts/node_classification/nc_experiment.py
index 88859502c8b5c499ff3ad57eef60e83ccd4bd600..3166771763d5ae42c53016ff2de9f752b1122b45 100644
--- a/scripts/node_classification/nc_experiment.py
+++ b/scripts/node_classification/nc_experiment.py
@@ -1,8 +1,6 @@
 #!/usr/bin/env python3
 """
-@author: Utaemon Toyota
-@date: 31.1.2019
 @project: Software Projekt @ Heidelberg University, Institute for Computational Linguistics
 @requirements: cora.py and random_nodes_for_node_classification.py as well as the cora data
 @usage: python3 node_classification.py [-g] [-e] [-s] [-i] [-n]
 
@@ -62,7 +60,7 @@ def get_random_num_nodes(set_elm, num, seed):
     random.seed(seed)
     return set(random.sample(set_elm, num))
 
-def get_num_random_nodes_for_all_classes_read(path = "graph.pkl", num = 20, seed = 1):
+def get_num_random_nodes_for_all_classes_read(path = "../../data/cora/graph/cora_graph.pkl", num = 20, seed = 1):
     """get specific number of nodes per class, same number for all classes"""
     cora_dict = dict_of_node_classes_read(path)
     sampled_random_id_set = set()
@@ -73,7 +71,7 @@
 
 #------------------------classification
 
-def classify(C, seed = 0, num = 20, num_test_instances = 1000, path_graph = "graph.pkl", path_emb = "cora_embeddings_uniform_m20.pkl"):
+def classify(C, seed = 0, num = 20, num_test_instances = 1000, path_graph = "../../data/cora/graph/cora_graph.pkl", path_emb = "../../data/cora/embeddings/merged_node_embeddings.pkl"):
     training_nodes = training(path_graph, seed=seed, num = num)
     emb = get_embeddings(path_emb)
     cl = get_class_list(path_graph)
@@ -100,7 +98,7 @@
     print (score)
     return score
 
-def classify_func(range_seeds = 10, num = 20, num_test_instances = 1000, path_graph = "graph.pkl", path_emb = "cora_embeddings_uniform_m20.pkl"):
+def classify_func(range_seeds = 10, num = 20, num_test_instances = 1000, path_graph = "../../data/cora/graph/cora_graph.pkl", path_emb = "../../data/cora/embeddings/merged_node_embeddings.pkl"):
     C_li = [0.01, 0.1, 0.5, 1.0, 5.0, 10.0]
     for i in range(range_seeds):
         print ("Iteration/Random Seed:", i)
@@ -108,7 +106,7 @@ def classify_func(range_seeds = 10, num = 20, num_test_instances = 1000, path_gr
         classify(C, seed=i, num = num, num_test_instances = num_test_instances, path_graph = path_graph, path_emb = path_emb)
 
 #------------------------Node Classification
-def node_classification(path_graph = "graph.pkl", path_embeddings = "cora_embeddings_uniform_m20.pkl", num_test_instances = 1000, seed=20, num_per_class = 20, C = 0.1):
+def node_classification(path_graph = "../../data/cora/graph/cora_graph.pkl", path_embeddings = "../../data/cora/embeddings/merged_node_embeddings.pkl", num_test_instances = 1000, seed=20, num_per_class = 20, C = 0.1):
     logisticRegr = LogisticRegression(C=C, solver='liblinear', multi_class='ovr')
     training_nodes = training(path_graph, seed=seed, num = num_per_class)
     emb = get_embeddings(path_embeddings)
@@ -140,7 +138,7 @@ def node_classification(path_graph = "graph.pkl", path_embeddings = "cora_embedd
     #print ("Confusion Matrix:\n", conf_matrix)
     return score_macro, conf_matrix
 
-def node_classification_random_seeds(path_graph = "graph.pkl", path_embeddings = "cora_embeddings_uniform_m20.pkl", num_test_instances = 1000, num_per_class = 20, iterations = 50, C = 0.1):
+def node_classification_random_seeds(path_graph = "../../data/cora/graph/cora_graph.pkl", path_embeddings = "../../data/cora/embeddings/merged_node_embeddings.pkl", num_test_instances = 1000, num_per_class = 20, iterations = 50, C = 0.1):
     scores = []
     for i in range (0,iterations):
         scores.append(node_classification(path_graph = path_graph, path_embeddings = path_embeddings, num_test_instances = num_test_instances, seed=i, num_per_class = num_per_class, C=C)[0])
@@ -160,8 +158,8 @@
 if __name__ == "__main__":
     # execute only if run as a script
     parser = argparse.ArgumentParser(description="Node Classification script.")
-    parser.add_argument("-g", "--graph", default = "graph.pkl", help="path to graph")
-    parser.add_argument("-e", "--embeddings", default = "cora_embeddings_uniform_m20.pkl", help="path to embeddings")
+    parser.add_argument("-g", "--graph", default = "../../data/cora/graph/cora_graph.pkl", help="path to graph")
+    parser.add_argument("-e", "--embeddings", default = "../../data/cora/embeddings/merged_node_embeddings.pkl", help="path to embeddings")
     parser.add_argument("-s", "--seed", type=int, help="random seed for one node classification. If this will be specified, always the function node_classification() will be executed.")
     parser.add_argument("-i", "--iterations", type=int, default = 10, help="number of iterations of node classification. Counter of iteration is random seed.")
     parser.add_argument("-n", "--number", type=int, default = 20, help="number of instances per class for training")
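For context, the evaluation these new default paths feed into reduces to: sample a fixed number of training nodes per class from the pickled graph, look up their embeddings, fit a one-vs-rest logistic regression, and score the held-out nodes. The following is a minimal sketch of that loop, not the project's code: it assumes the pickled graph is a networkx graph carrying a per-node `"class"` attribute and that the embeddings pickle maps node ids to vectors — neither format is confirmed by this diff.

```python
import pickle
import random
from sklearn.linear_model import LogisticRegression

def evaluate(path_graph, path_emb, num_per_class=20, seed=0, C=0.1):
    # Load the pickled graph and the node-id -> vector embedding dict
    # (both on-disk formats are assumptions, see above).
    with open(path_graph, "rb") as f:
        graph = pickle.load(f)
    with open(path_emb, "rb") as f:
        emb = pickle.load(f)

    # Group node ids by class label; the attribute name is an assumption.
    by_class = {}
    for node, data in graph.nodes(data=True):
        by_class.setdefault(data["class"], []).append(node)

    # Draw the same number of training nodes from every class.
    random.seed(seed)
    train = sorted(n for nodes in by_class.values()
                   for n in random.sample(sorted(nodes), num_per_class))
    train_set = set(train)
    test = [n for n in graph.nodes() if n not in train_set]

    # One-vs-rest logistic regression over the node embeddings,
    # mirroring the LogisticRegression settings used in the script.
    clf = LogisticRegression(C=C, solver="liblinear", multi_class="ovr")
    clf.fit([emb[n] for n in train], [graph.nodes[n]["class"] for n in train])
    return clf.score([emb[n] for n in test], [graph.nodes[n]["class"] for n in test])
```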
"/../../data/cora/embeddings/merged_node_embeddings.pkl"): C_li = [0.01, 0.1, 0.5, 1.0, 5.0, 10.0] for i in range(range_seeds): print ("Iteration/Random Seed:", i) @@ -108,7 +106,7 @@ def classify_func(range_seeds = 10, num = 20, num_test_instances = 1000, path_gr classify(C, seed=i, num = num, num_test_instances = num_test_instances, path_graph = path_graph, path_emb = path_emb) #------------------------Node Classification -def node_classification(path_graph = "graph.pkl", path_embeddings = "cora_embeddings_uniform_m20.pkl", num_test_instances = 1000, seed=20, num_per_class = 20, C = 0.1): +def node_classification(path_graph = "/../../data/cora/graph/cora_graph.pkl", path_embeddings = "/../../data/cora/embeddings/merged_node_embeddings.pkl", num_test_instances = 1000, seed=20, num_per_class = 20, C = 0.1): logisticRegr = LogisticRegression(C=C, solver='liblinear', multi_class='ovr') training_nodes = training(path_graph, seed=seed, num = num_per_class) emb = get_embeddings(path_embeddings) @@ -140,7 +138,7 @@ def node_classification(path_graph = "graph.pkl", path_embeddings = "cora_embedd #print ("Confusion Matrix:\n", conf_matrix) return score_macro, conf_matrix -def node_classification_random_seeds(path_graph = "graph.pkl", path_embeddings = "cora_embeddings_uniform_m20.pkl", num_test_instances = 1000, num_per_class = 20, iterations = 50, C = 0.1): +def node_classification_random_seeds(path_graph = "/../../data/cora/graph/cora_graph.pkl", path_embeddings = "/../../data/cora/embeddings/merged_node_embeddings.pkl", num_test_instances = 1000, num_per_class = 20, iterations = 50, C = 0.1): scores = [] for i in range (0,iterations): scores.append(node_classification(path_graph = path_graph, path_embeddings = path_embeddings, num_test_instances = num_test_instances, seed=i, num_per_class = num_per_class, C=C)[0]) @@ -160,8 +158,8 @@ if __name__ == "__main__": # execute only if run as a script parser = argparse.ArgumentParser(description="Node Classification script.") - parser.add_argument("-g", "--graph", default = "graph.pkl", help="path to graph") - parser.add_argument("-e", "--embeddings", default = "cora_embeddings_uniform_m20.pkl", help="path to embeddings") + parser.add_argument("-g", "--graph", default = "/../../data/cora/graph/cora_graph.pkl", help="path to graph") + parser.add_argument("-e", "--embeddings", default = "/../../data/cora/embeddings/merged_node_embeddings.pkl", help="path to embeddings") parser.add_argument("-s", "--seed", type=int, help="random seed for one node classification. If this will be specified, always the function node_classification() will be executed.") parser.add_argument("-i", "--iterations", type=int, default = 10, help="number of iterations of node classification. Counter of iteration is random seed.") parser.add_argument("-n", "--number", type=int, default = 20, help="number of instances per class for training") diff --git a/scripts/preprocessing/cora/cora.py b/scripts/preprocessing/cora/cora.py index 8064e638d45c6bdb040805d287a3ea0cdd3b7f81..ae855e63e239cf3e761c1dabe66d23178755af3b 100644 --- a/scripts/preprocessing/cora/cora.py +++ b/scripts/preprocessing/cora/cora.py @@ -1,27 +1,25 @@ """ -Getting a networkx graph from Cora. Graph can be saved in txt file. CARE: numpy-arrays are converted to lists due to errors (NumPy array is not JSON serializable). -Initialize Embeddings for n dimensions with initialize-module. -Arrays are initialized in normal or uniform random format (default = normal). 
-
-
-#Usage
-get_graph(path_nodes="/home/utaemon/SP/cora/cora.content", path_edges="/home/utaemon/SP/cora/cora.cites") --> return graph with nodes and edges
-To write the graph informations in file:
-def write_graph_to_file(path_nodes="/home/utaemon/SP/cora/cora.content", path_edges="/home/utaemon/SP/cora/cora.cites", path_output_graph = "/home/utaemon/SP/")
-To write the dictionary with initalizing Embeddings in file:
-def write_dict_to_file(rand_type="normal_random", dimension = 128, quantity=1433, path_output_emb = "/home/utaemon/SP/")
+@project: Software Projekt @ Heidelberg University, Institute for Computational Linguistics
+@requirements: cora data, numpy, networkX, pickle
+@info
+Getting a networkx graph from Cora. The graph will be saved in a pickle file.
+
+@usage
+python3 cora.py [-n] [-e] [-o]
+    -n / --nodes    Path to cora file containing nodes
+    -e / --edges    Path to cora file containing edges
+    -o / --output   Path where the graph should be saved
 """
+import argparse
 import networkx as nx
 import numpy as np
 import pickle as pkl
-import os
 
 def list_of_classes():
     return ["Case_Based", "Genetic_Algorithms", "Neural_Networks", "Probabilistic_Methods", "Reinforcement_Learning", "Rule_Learning", "Theory"]
 
-def read_file_and_get_nodes(graph_name, path):
+def read_file_and_get_nodes(graph_name, path="../../data/cora/raw/cora.content"):
     class_list = list_of_classes()
     max_bow_len = 0
     node_mapping = {}
@@ -49,7 +47,7 @@ def read_file_and_get_nodes(graph_name, path):
     graph_name.graph["paper_id"] = {"maxlen": 1, "vocab": (len(graph_name)), "lengths": np.ones(len(graph_name))}
     return node_mapping
 
-def read_file_and_get_edges(graph_name, node_mapping, path):
+def read_file_and_get_edges(graph_name, node_mapping, path="../../data/cora/raw/cora.cites"):
     with open(path) as file:
         for line in file.readlines():
             a, b = line.split()
@@ -57,7 +55,6 @@
 
 #---------------------create graph--------------
 
-
 def get_graph(path_nodes, path_edges):
     Cora_graph = nx.Graph()
     node_mapping = read_file_and_get_nodes(Cora_graph, path_nodes)
@@ -111,26 +108,21 @@ def add_max_values_to_graph(path_nodes, path_edges):
     #update
     Cora_graph.graph["paper_id"]["maxlen_neighbours"] = get_max_neighbours(path_nodes, path_edges)
     return Cora_graph
 
-# not used, initialization happens in EP
-'''
-def get_init_emb(rand_type="normal_random", dimension = 128, quantity=1433):
-    return initialize.get_embeddings(rand_type=rand_type, dimension = dimension, quantity=quantity)
-'''
-
-def write_pickle_graph_file(path_nodes, path_edges, output_path):
+def write_pickle_graph_file(path_nodes="../../data/cora/raw/cora.content", path_edges="../../data/cora/raw/cora.cites", path_output_graph = "../../data/cora/graph/"):
     g = add_max_values_to_graph(path_nodes, path_edges)
-
-    path = os.path.split(output_path)[0]
-    if not os.path.exists(path):
-        os.mkdir(path)
-
-    with open(output_path, "wb") as output:
+    with open(path_output_graph + "cora_graph.pkl", "wb") as output:
         pkl.dump(g, output)
 
+def read_pickle_graph(path = "graph.pkl"): # used by node_classification.py to load the graph
+    with open(path, 'rb') as f:
+        graph = pkl.load(f)
+    return graph
 
-'''
 if __name__ == "__main__":
     # execute only if run as a script
-    get_graph(path_nodes="/home/utaemon/SP/cora/cora.content", path_edges="/home/utaemon/SP/cora/cora.cites")
-    # get_init_emb(rand_type="normal_random", dimension = 128, quantity=1433)
-'''
\ No newline at end of file
+    parser = argparse.ArgumentParser(description="Script for building the Cora graph.")
+    parser.add_argument("-n", "--nodes", default="../../data/cora/raw/cora.content", help="path to file containing cora nodes")
+    parser.add_argument("-e", "--edges", default="../../data/cora/raw/cora.cites", help="path to file containing edges/citations")
+    parser.add_argument("-o", "--output", default="../../data/cora/graph/", help="path where the graph should be saved")
+    args = parser.parse_args()
+    write_pickle_graph_file(path_nodes=args.nodes, path_edges=args.edges, path_output_graph=args.output)
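A quick round-trip check of the new pickle output could look like the sketch below. The explicit paths are illustrative and assume a working directory from which the relative `../../data/cora/` defaults resolve; adjust them to your checkout. As a sanity check, the standard Cora release has 2,708 papers and 5,429 citation links, so the printed counts should be in that neighbourhood.

```python
# Build the graph pickle, then re-load it via the new helper.
from cora import write_pickle_graph_file, read_pickle_graph

write_pickle_graph_file(path_nodes="../../data/cora/raw/cora.content",
                        path_edges="../../data/cora/raw/cora.cites",
                        path_output_graph="../../data/cora/graph/")

g = read_pickle_graph("../../data/cora/graph/cora_graph.pkl")
print(g.number_of_nodes(), g.number_of_edges())
```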
diff --git a/scripts/preprocessing/senseval/README.md b/scripts/preprocessing/senseval/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..31940158a1b32a1f3d1d58fe0f8716c261fead6d
--- /dev/null
+++ b/scripts/preprocessing/senseval/README.md
@@ -0,0 +1,53 @@
+# AUTHORS
+Lyuba Dimitrova, Nadia Arslan, Nicolas Weber, Utaemon Toyota
+
+# PROJECT
+Softwareprojekt WS2018/19
+Supervisor: Prof. Dr. Anette Frank
+Graph Embedding Propagation
+
+# Senseval Preprocessing for Method 1
+
+This implementation provides preprocessed data for our Word Sense Disambiguation Method 1. The script produces JSON files for SensEval-2 and SensEval-3. These files contain sentence-split lists of lemmatized, lowercased words, each paired with its WordNet 3.0 POS tag.
+
+# Senseval Preprocessing for Method 2
+
+This implementation provides preprocessed data for our Word Sense Disambiguation Method 2. The script produces one pkl file per document in Senseval 2/3, named after the document.
+From the provided Senseval English all-words test data and their Penn Treebank annotations, only the useful information is kept. Lemmas that are not included in the gloss mappings, or that are listed in the stopwords, are deleted. For multiword expressions, only the tag of the head token is saved; information about its satellites is discarded.
+The resulting pickle file contains two lists. The first holds each lemma and its tags as a list: [lemma, Penn-Treebank-tag, wordnet-tag, spacy-tag]. The second records whether a token is a head, a satellite or None: ['head', {'id': ['d000.s000.t001']}].
+
+# Provided data
+Senseval2
+- Senseval 2 english-all-words test data
+- Senseval 2 Penn Treebank data for the test documents (wsj_0089.mrg, wsj_0465.mrg, wsj_1286.mrg)
+- Results / gold mappings for Senseval 2
+
+Senseval3
+- Senseval 3 english-all-words test data
+- Senseval 3 Penn Treebank data for the test documents (cl23.mrg, wsj_1695.mrg, wsj_1778.mrg)
+- Results / gold mappings for Senseval 3
+
+gloss_mapping.txt
+- copied from WordNet_Preprocessing
+
+stopwords.txt
+- includes stopwords that will be filtered out
+
+Python 3 scripts
+- senseval_preprocessing.py
+- preprocess_senseval_method1.py
+
+## Dependencies
+re - for regular expression matching
+json - for saving the results for WSD Method 1
+pickle - for saving the resulting lists in a pkl file for WSD Method 2
+nltk - WordNetLemmatizer from NLTK for lemmatizing
+
+## Running Instructions Method 1
+python[3] preprocess_senseval_method1.py
+
+## Running Instructions Method 2
+python[3] senseval_preprocessing.py [-s] [-g] [-v]
+    -s / --stopwords    Path to txt file with stopwords
+    -g / --gloss        Path to txt file with gloss mappings
+    -v / --version      valid input: 2 or 3 for Senseval 2 / 3
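To make the Method 2 output format concrete, a loading sketch is given below. The document file name `d000.pkl` is hypothetical, and the unpacking into exactly two lists follows the README's description; both are assumptions to verify against a real output file.

```python
# Inspect one preprocessed Senseval document (Method 2 output).
import pickle

with open("d000.pkl", "rb") as f:  # hypothetical output file name
    token_list, head_list = pickle.load(f)  # two lists, per the README

# Each token entry: [lemma, Penn-Treebank-tag, wordnet-tag, spacy-tag]
for lemma, ptb_tag, wn_tag, spacy_tag in token_list[:5]:
    print(lemma, ptb_tag, wn_tag, spacy_tag)

# Each head entry marks head/satellite status, e.g. ['head', {'id': ['d000.s000.t001']}]
print(head_list[:5])
```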