diff --git a/scripts/preprocessing/cora/cora.py b/scripts/preprocessing/cora/cora.py index ae855e63e239cf3e761c1dabe66d23178755af3b..5b931c78a41ed09cddb100c052f61eb9ba352d40 100644 --- a/scripts/preprocessing/cora/cora.py +++ b/scripts/preprocessing/cora/cora.py @@ -1,25 +1,27 @@ """ -@project: Software Projekt @ Heidelberg University, Institute for Computational Linguistics -@requirements: cora data, numpy, networkX, pickle -@info -Getting a networkx graph from Cora. Graph will be saved in a pickle file. - -@usage -python3 cora.py [-n] [-e] [-o] - -n / --nodes Path to cora file containing nodes - -e / --edges Path to cora file containing edges - -o / --output Path where the graph should be saved +Getting a networkx graph from Cora. Graph can be saved in txt file. CARE: numpy-arrays are converted to lists due to errors (NumPy array is not JSON serializable). +Initialize Embeddings for n dimensions with initialize-module. +Arrays are initialized in normal or uniform random format (default = normal). 
+ + +#Usage +get_graph(path_nodes="/home/utaemon/SP/cora/cora.content", path_edges="/home/utaemon/SP/cora/cora.cites") +-> return graph with nodes and edges +To write the graph information in file: +def write_graph_to_file(path_nodes="/home/utaemon/SP/cora/cora.content", path_edges="/home/utaemon/SP/cora/cora.cites", path_output_graph = "/home/utaemon/SP/") +To write the dictionary with initializing Embeddings in file: +def write_dict_to_file(rand_type="normal_random", dimension = 128, quantity=1433, path_output_emb = "/home/utaemon/SP/") """ -import argparse import networkx as nx import numpy as np import pickle as pkl +import os def list_of_classes(): return ["Case_Based", "Genetic_Algorithms", "Neural_Networks", "Probabilistic_Methods", "Reinforcement_Learning", "Rule_Learning", "Theory"] -def read_file_and_get_nodes(graph_name, path="/../../data/cora/raw/cora.content"): +def read_file_and_get_nodes(graph_name, path): class_list = list_of_classes() max_bow_len = 0 node_mapping = {} @@ -47,7 +49,7 @@ def read_file_and_get_nodes(graph_name, path="/../../data/cora/raw/cora.content" graph_name.graph["paper_id"] = {"maxlen": 1, "vocab": (len(graph_name)), "lengths": np.ones(len(graph_name))} return node_mapping -def read_file_and_get_edges(graph_name, node_mapping, path="/../../data/cora/raw/cora.cites"): +def read_file_and_get_edges(graph_name, node_mapping, path): with open(path) as file: for line in file.readlines(): a, b = line.split() @@ -55,6 +57,7 @@ def read_file_and_get_edges(graph_name, node_mapping, path="/../../data/cora/raw #---------------------create graph-------------- + def get_graph(path_nodes, path_edges): Cora_graph = nx.Graph() node_mapping = read_file_and_get_nodes(Cora_graph, path_nodes) @@ -108,21 +111,26 @@ def add_max_values_to_graph(path_nodes, path_edges): #update Cora_graph.graph["paper_id"]["maxlen_neighbours"] = get_max_neighbours(path_nodes, path_edges) return Cora_graph -def 
write_pickle_graph_file(path_nodes="/../../data/cora/raw/cora.content", path_edges="/../../data/cora/raw/cora.cites", path_output_graph = "/../../data/cora/graph/"): +# not used, initialization happens in EP +''' +def get_init_emb(rand_type="normal_random", dimension = 128, quantity=1433): + return initialize.get_embeddings(rand_type=rand_type, dimension = dimension, quantity=quantity) +''' + +def write_pickle_graph_file(path_nodes, path_edges, output_path): g = add_max_values_to_graph(path_nodes, path_edges) - with open(path_output_graph + "cora_graph.pkl", "wb") as output: + + path = os.path.split(output_path)[0] + if not os.path.exists(path): + os.mkdir(path) + + with open(output_path, "wb") as output: pkl.dump(g, output) -def read_pickle_graph(path = "graph.pkl"): #will be used on node_classification.py for accessing the graph - with open(path, 'rb') as f: - graph = pkl.load(f) - return graph +''' if __name__ == "__main__": # execute only if run as a script - parser = argparse.ArgumentParser(description="Skript for building cora graph.") - parser.add_argument("-n", "--nodes", default="/../../data/cora/raw/cora.content", help="path to file containing cora nodes") - parser.add_argument("-e", "--edges", default="/../../data/cora/raw/cora.cites", help="path to file containing edges/citations") - parser.add_argument("-o", "--output", default="/../../data/cora/graph/", help="path where the graph should be saved") - args = parser.parse_args() - write_pickle_graph_file(path_nodes=args.nodes, path_edges=args.edges, path_output_graph=args.output) + get_graph(path_nodes="/home/utaemon/SP/cora/cora.content", path_edges="/home/utaemon/SP/cora/cora.cites") + # get_init_emb(rand_type="normal_random", dimension = 128, quantity=1433) +'''