From 7ed4414ff5d5e8a526f1eaf1315dd5f02bf217b4 Mon Sep 17 00:00:00 2001
From: Utaemon Toyota <toyota@cl.uni-heidelberg.de>
Date: Wed, 27 Feb 2019 21:21:42 +0100
Subject: [PATCH] change paths

---
 EP/Cora_node_classification/cora.py                    | 12 ++++++------
 .../node_classification.py                             | 14 ++++++--------
 .../random_nodes_for_node_classification.py            |  2 +-
 Senseval_Prep/senseval_preprocessing.py                | 18 ++++++++----------
 4 files changed, 21 insertions(+), 25 deletions(-)

diff --git a/EP/Cora_node_classification/cora.py b/EP/Cora_node_classification/cora.py
index c1a3d95..5f192dd 100644
--- a/EP/Cora_node_classification/cora.py
+++ b/EP/Cora_node_classification/cora.py
@@ -22,7 +22,7 @@ import pickle as pkl
 def list_of_classes():
     return ["Case_Based", "Genetic_Algorithms", "Neural_Networks", "Probabilistic_Methods", "Reinforcement_Learning", "Rule_Learning", "Theory"]

-def read_file_and_get_nodes(graph_name, path="/cora_data/cora.content"):
+def read_file_and_get_nodes(graph_name, path="/../../data/cora/raw/cora.content"):
     class_list = list_of_classes()
     max_bow_len = 0
     node_mapping = {}
@@ -50,7 +50,7 @@ def read_file_and_get_nodes(graph_name, path="/cora_data/cora.content"):
     graph_name.graph["paper_id"] = {"maxlen": 1, "vocab": (len(graph_name)), "lengths": np.ones(len(graph_name))}
     return node_mapping

-def read_file_and_get_edges(graph_name, node_mapping, path="/cora_data/cora.cites"):
+def read_file_and_get_edges(graph_name, node_mapping, path="/../../data/cora/raw/cora.cites"):
     with open(path) as file:
         for line in file.readlines():
             a, b = line.split()
@@ -112,7 +112,7 @@ def add_max_values_to_graph(path_nodes, path_edges):        #update
     Cora_graph.graph["paper_id"]["maxlen_neighbours"] = get_max_neighbours(path_nodes, path_edges)
     return Cora_graph

-def write_pickle_graph_file(path_nodes="/cora_data/cora.content", path_edges="/cora_data/cora.cites", path_output_graph = ""):
+def write_pickle_graph_file(path_nodes="/../../data/cora/raw/cora.content", path_edges="/../../data/cora/raw/cora.cites", path_output_graph = "/../../data/cora/graph/"):
     g = add_max_values_to_graph(path_nodes, path_edges)
     with open(path_output_graph + "graph.pkl", "wb") as output:
         pkl.dump(g, output)
@@ -125,8 +125,8 @@ def read_pickle_graph(path = "graph.pkl"):   #will be used on node_classificati
 if __name__ == "__main__":
     # execute only if run as a script
     parser = argparse.ArgumentParser(description="Skript for building cora graph.")
-    parser.add_argument("-n", "--nodes", default="/cora_data/cora.content", help="path to file containing cora nodes")
-    parser.add_argument("-e", "--edges", default="/home/utaemon/SP/cora/cora.cites", help="path to file containing edges/citations")
-    parser.add_argument("-o", "--output", default="", help="path where the graph should be saved")
+    parser.add_argument("-n", "--nodes", default="/../../data/cora/raw/cora.content", help="path to file containing cora nodes")
+    parser.add_argument("-e", "--edges", default="/../../data/cora/raw/cora.cites", help="path to file containing edges/citations")
+    parser.add_argument("-o", "--output", default="/../../data/cora/graph/", help="path where the graph should be saved")
     args = parser.parse_args()
     write_pickle_graph_file(path_nodes=args.nodes, path_edges=args.edges, path_output_graph=args.output)
diff --git a/EP/Cora_node_classification/node_classification.py b/EP/Cora_node_classification/node_classification.py
index 1ab1315..8588350 100644
--- a/EP/Cora_node_classification/node_classification.py
+++ b/EP/Cora_node_classification/node_classification.py
@@ -1,8 +1,6 @@
 #!/usr/bin/env python3
 """
-@author: Utaemon Toyota
-@date: 31.1.2019
 @project: Software Projekt @ Heidelberg University, Institute for Computational Linguistics
 @requirements: cora.py and random_nodes_for_node_classification.py as well as the cora data
 @usage: python3 node_classification.py [-g] [-e] [-s] [-i] [-n]
@@ -45,7 +43,7 @@ def get_class_list(path):
     return np.array(class_list)

 #------------------------classification
-def classify(C, seed = 0, num = 20, num_test_instances = 1000, path_graph = "graph.pkl", path_emb = "cora_embeddings_uniform_m20.pkl"):
+def classify(C, seed = 0, num = 20, num_test_instances = 1000, path_graph = "/../../data/cora/graph/graph.pkl", path_emb = "/../../data/cora/embeddings/cora_embeddings_uniform_m20.pkl"):
     training_nodes = training(path_graph, seed=seed, num = num)
     emb = get_embeddings(path_emb)
     cl = get_class_list(path_graph)
@@ -72,7 +70,7 @@ def classify(C, seed = 0, num = 20, num_test_instances = 1000, path_graph = "gra
     print (score)
     return score

-def classify_func(range_seeds = 10, num = 20, num_test_instances = 1000, path_graph = "graph.pkl", path_emb = "cora_embeddings_uniform_m20.pkl"):
+def classify_func(range_seeds = 10, num = 20, num_test_instances = 1000, path_graph = "graph.pkl", path_emb = "/../../data/cora/embeddings/cora_embeddings_uniform_m20.pkl"):
     C_li = [0.01, 0.1, 0.5, 1.0, 5.0, 10.0]
     for i in range(range_seeds):
         print ("Iteration/Random Seed:", i)
@@ -80,7 +78,7 @@ def classify_func(range_seeds = 10, num = 20, num_test_instances = 1000, path_gr
         classify(C, seed=i, num = num, num_test_instances = num_test_instances, path_graph = path_graph, path_emb = path_emb)

 #------------------------Node Classification
-def node_classification(path_graph = "graph.pkl", path_embeddings = "cora_embeddings_uniform_m20.pkl", num_test_instances = 1000, seed=20, num_per_class = 20, C = 0.1):
+def node_classification(path_graph = "/../../data/cora/graph/graph.pkl", path_embeddings = "/../../data/cora/embeddings/cora_embeddings_uniform_m20.pkl", num_test_instances = 1000, seed=20, num_per_class = 20, C = 0.1):
     logisticRegr = LogisticRegression(C=C, solver='liblinear', multi_class='ovr')
     training_nodes = training(path_graph, seed=seed, num = num_per_class)
     emb = get_embeddings(path_embeddings)
@@ -113,7 +111,7 @@ def node_classification(path_graph = "graph.pkl", path_embeddings = "cora_embedd
     #print ("Confusion Matrix:\n", conf_matrix)
     return score_macro, conf_matrix

-def node_classification_random_seeds(path_graph = "graph.pkl", path_embeddings = "cora_embeddings_uniform_m20.pkl", num_test_instances = 1000, num_per_class = 20, iterations = 50, C = 0.1):
+def node_classification_random_seeds(path_graph = "/../../data/cora/graph/graph.pkl", path_embeddings = "/../../data/cora/embeddings/cora_embeddings_uniform_m20.pkl", num_test_instances = 1000, num_per_class = 20, iterations = 50, C = 0.1):
     scores = []
     for i in range (0,iterations):
         scores.append(node_classification(path_graph = path_graph, path_embeddings = path_embeddings, num_test_instances = num_test_instances, seed=i, num_per_class = num_per_class, C=C)[0])
@@ -133,8 +131,8 @@
 if __name__ == "__main__":
     # execute only if run as a script
     parser = argparse.ArgumentParser(description="Node Classification script.")
-    parser.add_argument("-g", "--graph", default = "graph.pkl", help="path to graph")
-    parser.add_argument("-e", "--embeddings", default = "cora_embeddings_uniform_m20.pkl", help="path to embeddings")
+    parser.add_argument("-g", "--graph", default = "/../../data/cora/graph/graph.pkl", help="path to graph")
+    parser.add_argument("-e", "--embeddings", default = "/../../data/cora/embeddings/cora_embeddings_uniform_m20.pkl", help="path to embeddings")
     parser.add_argument("-s", "--seed", type=int, help="random seed for one node classification. If this will be specified, always the function node_classification() will be executed.")
     parser.add_argument("-i", "--iterations", type=int, default = 10, help="number of iterations of node classification. Counter of iteration is random seed.")
     parser.add_argument("-n", "--number", type=int, default = 20, help="number of instances per class for training")
diff --git a/EP/Cora_node_classification/random_nodes_for_node_classification.py b/EP/Cora_node_classification/random_nodes_for_node_classification.py
index b4ae51d..9cea5b7 100644
--- a/EP/Cora_node_classification/random_nodes_for_node_classification.py
+++ b/EP/Cora_node_classification/random_nodes_for_node_classification.py
@@ -27,7 +27,7 @@ def get_random_num_nodes(set_elm, num, seed):
     random.seed(seed)
     return set(random.sample(set_elm, num))

-def get_num_random_nodes_for_all_classes_read(path = "graph.pkl", num = 20, seed = 1):
+def get_num_random_nodes_for_all_classes_read(path = "/../../data/cora/graph/graph.pkl", num = 20, seed = 1):
     """get specific number of nodes per class, same number for all classes"""
     cora_dict = dict_of_node_classes_read(path)
     sampled_random_id_set = set()
diff --git a/Senseval_Prep/senseval_preprocessing.py b/Senseval_Prep/senseval_preprocessing.py
index c24bf89..65c9ae9 100644
--- a/Senseval_Prep/senseval_preprocessing.py
+++ b/Senseval_Prep/senseval_preprocessing.py
@@ -1,10 +1,8 @@
 #!/usr/bin/env python3
 """
-@author: Utaemon Toyota
-@date: 25.2.2019
 @project: Software Projekt @ Heidelberg University, Institute for Computational Linguistics
-@members: Nadia Arslan, Lyuba Dimitrova, Nicolas Weber, Utaemon Toyota
+@members: Nadia Arslan, Lyuba Dimitrova, Utaemon Toyota, Nicolas Weber
 @required data: Senseval english-all-word test data and their penn treebank files in the same directory.
 @usage: python3 senseval_preprocessing.py [-s] [-g] [-v]
         -s / --stopwords    Path to txt-file with stopwords
@@ -18,11 +16,11 @@ import pickle as pkl
 from nltk.stem import WordNetLemmatizer
 wnl = WordNetLemmatizer()

-file_path2 = "Senseval2/eng-all-words_seneval2.test.xml"    #senseval2
-file_path3 = "Senseval3/english-all-words.xml"    #senseval3
+file_path2 = "/../../data/senseval2/raw/eng-all-words_seneval2.test.xml"    #senseval2
+file_path3 = "/../../data/senseval3/raw/english-all-words.xml"    #senseval3

-tree_paths2 = {"d00": "Senseval2/wsj_0089.mrg", "d01": "Senseval2/wsj_0465.mrg", "d02": "Senseval2/wsj_1286.mrg"}    #senseval2
-tree_paths3 = {"d000": "Senseval3/cl23.mrg", "d001": "Senseval3/wsj_1695.mrg", "d002":"Senseval3/wsj_1778.mrg"}    #senseval3
+tree_paths2 = {"d00": "/../../data/senseval2/raw/wsj_0089.mrg", "d01": "/../../data/senseval2/raw/wsj_0465.mrg", "d02": "/../../data/senseval2/raw/wsj_1286.mrg"}    #senseval2
+tree_paths3 = {"d000": "/../../data/senseval3/raw/cl23.mrg", "d001": "/../../data/senseval3/raw/wsj_1695.mrg", "d002":"/../../data/senseval3/raw/wsj_1778.mrg"}    #senseval3

 def get_stopword_list(stop_path):
     with open (stop_path, "r") as f:
@@ -246,7 +244,7 @@ def get_sats(tokens, info):
             new_info.append(info[idx])
     return [new_tokens, new_info]

-def write_pkl(version = 3, stop_path="stopwords.txt", gloss_path = "gloss_mapping.txt"):
+def write_pkl(version = 3, stop_path="/../../data/other/stopwords.txt", gloss_path = "/../../data/wordnet/mappings/gloss_mapping.txt"):
     file_path = ""
     tree_path = ""
     if version == 2:
@@ -264,8 +262,8 @@ def write_pkl(version = 3, stop_path="stopwords.txt", gloss_path = "gloss_mappin

 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Senseval Preprocessing script.")
-    parser.add_argument("-s", "--stopwords", default="stopwords.txt", help="path to stopwords-txt-file")
-    parser.add_argument("-g", "--gloss", default="gloss_mapping.txt", help = "path to gloss mapping txt-file")
+    parser.add_argument("-s", "--stopwords", default="/../../data/other/stopwords.txt", help="path to stopwords-txt-file")
+    parser.add_argument("-g", "--gloss", default="/../../data/wordnet/mappings/gloss_mapping.txt", help = "path to gloss mapping txt-file")
     parser.add_argument("-v", "--version", default = 3, help="2 or 3 for senseval version")
     args = parser.parse_args()
     write_pkl(version=int(args.version), stop_path=args.stopwords, gloss_path=args.gloss)
-- 
GitLab
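
Editor's note on the changed defaults (not part of the patch itself): strings such as "/../../data/cora/raw/cora.content" start with a slash, so they are absolute paths; on POSIX systems "/.." at the root collapses back to "/", and every such default therefore resolves to "/data/..." regardless of the working directory. If the intent was "two directories up from the script, then into data/", a script-relative default would express that. The following is a minimal illustrative sketch of both behaviours; the data/cora/raw layout is taken from the patch, while the script-relative variant is a hypothetical alternative, not what this commit implements.

#!/usr/bin/env python3
# Illustrative sketch (not part of the patch): how the new defaults resolve.
import os.path

# Default as written in the patch. The leading "/" makes it absolute, and
# normpath shows that the "/../.." prefix collapses to "/" on POSIX:
patched_default = "/../../data/cora/raw/cora.content"
print(os.path.normpath(patched_default))   # -> /data/cora/raw/cora.content

# Hypothetical alternative: resolve relative to the script's own location
# ("two levels up, then into data/"), e.g. for EP/Cora_node_classification/cora.py:
script_dir = os.path.dirname(os.path.abspath(__file__))
script_relative = os.path.normpath(
    os.path.join(script_dir, "..", "..", "data", "cora", "raw", "cora.content"))
print(script_relative)                     # -> <repo>/data/cora/raw/cora.content

If the scripts are always launched from their own directories, plain relative defaults without the leading slash (e.g. "../../data/cora/raw/cora.content") would also resolve as presumably intended.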