undo

3dba9dfb · toyota · 7a37af64 · 3dba9dfb
Commit 3dba9dfb authored 6 years ago by toyota
--- a/scripts/preprocessing/cora/cora.py
+++ b/scripts/preprocessing/cora/cora.py
 """
-@project: Software Projekt @ Heidelberg University, Institute for Computational Linguistics
+Build a networkx graph from Cora. The graph can be saved to a txt file. NOTE: NumPy arrays are converted to lists, because a NumPy array is not JSON serializable.
-@requirements: cora data, numpy, networkX, pickle
+Initialize Embeddings for n dimensions with initialize-module.
-@info
+Arrays are initialized in normal or uniform random format (default = normal).
-Getting a networkx graph from Cora. Graph will be saved in a pickle file.
-@usage
+#Usage
-python3 cora.py [-n] [-e] [-o]
+get_graph(path_nodes="/home/utaemon/SP/cora/cora.content", path_edges="/home/utaemon/SP/cora/cora.cites")
-	-n / --nodes	Path to cora file containing nodes
+-> return graph with nodes and edges
-	-e / --edges	Path to cora file containing edges
+To write the graph information to a file:
-	-o / --output	Path where the graph should be saved
+def write_graph_to_file(path_nodes="/home/utaemon/SP/cora/cora.content", path_edges="/home/utaemon/SP/cora/cora.cites", path_output_graph = "/home/utaemon/SP/")
+To write the dictionary with the initialized embeddings to a file:
+def write_dict_to_file(rand_type="normal_random", dimension = 128, quantity=1433, path_output_emb = "/home/utaemon/SP/")
 """
-import argparse
 import networkx as nx
 import numpy as np
 import pickle as pkl
+import os
 def list_of_classes():
    return ["Case_Based", "Genetic_Algorithms", "Neural_Networks", "Probabilistic_Methods", "Reinforcement_Learning", "Rule_Learning", "Theory"]
-def read_file_and_get_nodes(graph_name, path="/../../data/cora/raw/cora.content"):
+def read_file_and_get_nodes(graph_name, path):
    class_list = list_of_classes()
    max_bow_len = 0
    node_mapping = {}
@@ -47,7 +49,7 @@ def read_file_and_get_nodes(graph_name, path="/../../data/cora/raw/cora.content"
    graph_name.graph["paper_id"] = {"maxlen": 1, "vocab": (len(graph_name)), "lengths": np.ones(len(graph_name))}
    return node_mapping
-def read_file_and_get_edges(graph_name, node_mapping, path="/../../data/cora/raw/cora.cites"):
+def read_file_and_get_edges(graph_name, node_mapping, path):
    with open(path) as file:
        for line in file.readlines():
            a, b = line.split()
@@ -55,6 +57,7 @@ def read_file_and_get_edges(graph_name, node_mapping, path="/../../data/cora/raw
 #---------------------create graph--------------
 def get_graph(path_nodes, path_edges):
    Cora_graph = nx.Graph()
    node_mapping = read_file_and_get_nodes(Cora_graph, path_nodes)
@@ -108,21 +111,26 @@ def add_max_values_to_graph(path_nodes, path_edges):                    #update
    Cora_graph.graph["paper_id"]["maxlen_neighbours"] = get_max_neighbours(path_nodes, path_edges)
    return Cora_graph
-def write_pickle_graph_file(path_nodes="/../../data/cora/raw/cora.content", path_edges="/../../data/cora/raw/cora.cites", path_output_graph = "/../../data/cora/graph/"):
+# not used, initialization happens in EP
+'''
+def get_init_emb(rand_type="normal_random", dimension = 128, quantity=1433):
+    return initialize.get_embeddings(rand_type=rand_type, dimension = dimension, quantity=quantity)
+'''
+def write_pickle_graph_file(path_nodes, path_edges, output_path):
    g = add_max_values_to_graph(path_nodes, path_edges)
-    with open(path_output_graph + "cora_graph.pkl", "wb") as output:
+    path = os.path.split(output_path)[0]
+    if not os.path.exists(path):
+        os.mkdir(path)
+    with open(output_path, "wb") as output:
        pkl.dump(g, output)
-def read_pickle_graph(path = "graph.pkl"):				#will be used on node_classification.py for accessing the graph
-    with open(path, 'rb') as f:
-        graph = pkl.load(f)
-    return graph
+'''
 if __name__ == "__main__":
    # execute only if run as a script
-    parser = argparse.ArgumentParser(description="Skript for building cora graph.")
+    get_graph(path_nodes="/home/utaemon/SP/cora/cora.content", path_edges="/home/utaemon/SP/cora/cora.cites")
-    parser.add_argument("-n", "--nodes", default="/../../data/cora/raw/cora.content", help="path to file containing cora nodes")
+    # get_init_emb(rand_type="normal_random", dimension = 128, quantity=1433)
-    parser.add_argument("-e", "--edges", default="/../../data/cora/raw/cora.cites", help="path to file containing edges/citations")
+'''
-    parser.add_argument("-o", "--output", default="/../../data/cora/graph/", help="path where the graph should be saved")
-    args = parser.parse_args()
-    write_pickle_graph_file(path_nodes=args.nodes, path_edges=args.edges, path_output_graph=args.output)