Commit ec9bba65 authored by toyota

bug fix

parent 0b0c3aa7
"""
Getting a networkx graph from Cora. Graph can be saved in pickle file.
Getting a networkx graph from Cora. Graph can be saved in txt file. CARE: numpy-arrays are converted to lists due to errors (NumPy array is not JSON serializable).
Initialize Embeddings for n dimensions with initialize-module.
Arrays are initialized in normal or uniform random format (default = normal).
......@@ -21,39 +21,50 @@ import pickle as pkl
def list_of_classes():
return ["Case_Based", "Genetic_Algorithms", "Neural_Networks", "Probabilistic_Methods", "Reinforcement_Learning", "Rule_Learning", "Theory"]
def read_file_and_get_nodes(graph_name, path="cora/cora.content"):
def read_file_and_get_nodes(graph_name, path="/home/utaemon/SP/cora/cora.content"):
class_list = list_of_classes()
max_bow_len = 0
node_mapping = {}
length_array = np.empty((0))
with open(path) as file:
counter = 0
for line in file.readlines():
split_line = np.array(line.split())
paper_id, split_line = split_line[0], split_line[1:]
node_mapping[paper_id] = counter
paper_id = counter
counter += 1
paper_class, all_bow_of_paper = split_line[-1], split_line[:-1]
paper_bow = np.where(all_bow_of_paper == "1") #get indices which words occur
if len(paper_bow) > max_bow_len:
max_bow_len = len(paper_bow[0]
paper_bow = np.where(all_bow_of_paper == "1") #get indices which words occur
length_array = np.append(length_array, len(paper_bow[0]))
if len(paper_bow[0]) > max_bow_len:
max_bow_len = len(paper_bow[0])
paper_class = class_list.index(paper_class) #get index of class to numeralize
#add infos to Graph
graph_name.add_node(int(paper_id))
graph_name.node[int(paper_id)]["class"] = paper_class
graph_name.node[int(paper_id)]["bow"] = paper_bow
graph_name.graph["max_bow"] = max_bow_len
graph_name.graph["len_bow"] = 1433
graph_name.node[int(paper_id)]["bow"] = paper_bow[0]
graph_name.node[int(paper_id)]["paper_id"] = [paper_id]
graph_name.graph["bow"]={"maxlen": max_bow_len, "vocab": 1433, "lengths": length_array}
graph_name.graph["paper_id"] = {"maxlen": 1, "vocab": (len(graph_name)), "lengths": np.ones(len(graph_name))}
return node_mapping
def read_file_and_get_edges(graph_name, path="cora/cora.cites"):
def read_file_and_get_edges(graph_name, node_mapping, path="/home/utaemon/SP/cora/cora.cites"):
with open(path) as file:
for line in file.readlines():
a, b = line.split()
graph_name.add_edge(int(a),int(b))
graph_name.add_edge(node_mapping[a],node_mapping[b])
#initialize Graph
def get_graph(path_nodes="cora/cora.content", path_edges="cora/cora.cites"):
#---------------------create graph--------------
def get_graph(path_nodes, path_edges):
Cora_graph = nx.Graph()
read_file_and_get_nodes(Cora_graph, path_nodes)
read_file_and_get_edges(Cora_graph, path_edges)
node_mapping = read_file_and_get_nodes(Cora_graph, path_nodes)
read_file_and_get_edges(Cora_graph, node_mapping, path_edges)
return Cora_graph
#for getting maxima
#----------for getting maxima--------
def get_neighbours_dict(path_nodes, path_edges):
dict_neighbours = {}
for edge in get_graph(path_nodes, path_edges).edges:
......@@ -100,9 +111,11 @@ def add_max_values_to_graph(path_nodes, path_edges): #update
Cora_graph.graph["paper_id"]["maxlen_neighbours"] = get_max_neighbours(path_nodes, path_edges)
return Cora_graph
#write graph into pickle file
def get_init_emb(rand_type="normal_random", dimension = 128, quantity=1433):
return initialize.get_embeddings(rand_type=rand_type, dimension = dimension, quantity=quantity)
def write_pickle_graph_file(path_nodes="/home/utaemon/SP/cora/cora.content", path_edges="/home/utaemon/SP/cora/cora.cites", path_output_graph = "/home/utaemon/SP/"):
g = get_graph(path_nodes, path_edges)
g = add_max_values_to_graph(path_nodes, path_edges)
with open(path_output_graph + "graph.pkl", "wb") as output:
pkl.dump(g, output)
......@@ -111,6 +124,8 @@ def read_pickle_graph(path = "/home/utaemon/SP/graph.pkl"):
graph = pkl.load(f)
return graph
if __name__ == "__main__":
# execute only if run as a script
get_graph(path_nodes="cora/cora.content", path_edges="cora/cora.cites")
get_graph(path_nodes="/home/utaemon/SP/cora/cora.content", path_edges="/home/utaemon/SP/cora/cora.cites")
get_init_emb(rand_type="normal_random", dimension = 128, quantity=1433)
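The central fix in the hunk above: np.where on a 1-D array returns a tuple of index arrays, so len(paper_bow) is always 1 (the number of axes), whereas len(paper_bow[0]) is the number of words that actually occur in the paper. A minimal standalone sketch of the difference (not part of the commit):

import numpy as np

row = np.array(["0", "1", "1", "0"])
hits = np.where(row == "1")   # tuple of index arrays: (array([1, 2]),)
print(len(hits))              # 1 -- the old comparison counted axes, not words
print(len(hits[0]))           # 2 -- the fixed comparison counts occurring words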
"""
Getting a networkx graph from Cora. Graph can be saved in txt file. CARE: numpy-arrays are converted to lists due to errors (NumPy array is not JSON serializable).
Initialize Embeddings for n dimensions with initialize-module.
Arrays are initialized in normal or uniform random format (default = normal).
#Usage
get_graph(path_nodes="/home/utaemon/SP/cora/cora.content", path_edges="/home/utaemon/SP/cora/cora.cites")
-> return graph with nodes and edges
To write the graph informations in file:
def write_graph_to_file(path_nodes="/home/utaemon/SP/cora/cora.content", path_edges="/home/utaemon/SP/cora/cora.cites", path_output_graph = "/home/utaemon/SP/")
To write the dictionary with initalizing Embeddings in file:
def write_dict_to_file(rand_type="normal_random", dimension = 128, quantity=1433, path_output_emb = "/home/utaemon/SP/")
"""
import networkx as nx
import numpy as np
import initialize
import pickle as pkl

def list_of_classes():
    return ["Case_Based", "Genetic_Algorithms", "Neural_Networks", "Probabilistic_Methods", "Reinforcement_Learning", "Rule_Learning", "Theory"]

def read_file_and_get_nodes(graph_name, path="/home/utaemon/SP/cora/cora.content"):
    # NOTE: graph_name.node[...] requires networkx < 2.4; newer releases renamed
    # the attribute view to graph_name.nodes[...].
    class_list = list_of_classes()
    max_bow_len = 0
    node_mapping = {}
    length_array = np.empty((0))
    with open(path) as file:
        counter = 0
        for line in file.readlines():
            split_line = np.array(line.split())
            paper_id, split_line = split_line[0], split_line[1:]
            node_mapping[paper_id] = counter    # map the original paper id to a consecutive index
            paper_id = counter
            counter += 1
            paper_class, all_bow_of_paper = split_line[-1], split_line[:-1]
            paper_bow = np.where(all_bow_of_paper == "1")    # indices of the words that occur in this paper
            length_array = np.append(length_array, len(paper_bow[0]))
            if len(paper_bow[0]) > max_bow_len:
                max_bow_len = len(paper_bow[0])
            paper_class = class_list.index(paper_class)    # index of the class, to numericalize it
            # add the information to the graph
            graph_name.add_node(int(paper_id))
            graph_name.node[int(paper_id)]["class"] = paper_class
            graph_name.node[int(paper_id)]["bow"] = paper_bow[0]
            graph_name.node[int(paper_id)]["paper_id"] = [paper_id]
    graph_name.graph["bow"] = {"maxlen": max_bow_len, "vocab": 1433, "lengths": length_array}
    graph_name.graph["paper_id"] = {"maxlen": 1, "vocab": (len(graph_name)), "lengths": np.ones(len(graph_name))}
    return node_mapping

def read_file_and_get_edges(graph_name, node_mapping, path="/home/utaemon/SP/cora/cora.cites"):
    with open(path) as file:
        for line in file.readlines():
            a, b = line.split()
            graph_name.add_edge(node_mapping[a], node_mapping[b])

#---------------------create graph--------------
def get_graph(path_nodes, path_edges):
    Cora_graph = nx.Graph()
    node_mapping = read_file_and_get_nodes(Cora_graph, path_nodes)
    read_file_and_get_edges(Cora_graph, node_mapping, path_edges)
    return Cora_graph

#----------for getting maxima--------
def get_neighbours_dict(path_nodes, path_edges):
    dict_neighbours = {}
    for edge in get_graph(path_nodes, path_edges).edges:
        if edge[0] not in dict_neighbours:
            dict_neighbours[edge[0]] = {edge[1]}
        else:
            dict_neighbours[edge[0]].add(edge[1])
        if edge[1] not in dict_neighbours:
            dict_neighbours[edge[1]] = {edge[0]}
        else:
            dict_neighbours[edge[1]].add(edge[0])
    return dict_neighbours

def get_node_bow_len(path_nodes, path_edges):
    bow_len_dict = {}
    nodes = get_graph(path_nodes, path_edges).nodes(data=True)
    for node in nodes:
        bow_len_dict[node[0]] = len(node[1]["bow"])    # node[0] = paper id, node[1] holds the node attributes
    return bow_len_dict

def get_max_neighbours(path_nodes, path_edges):
    neighbours_dict = get_neighbours_dict(path_nodes, path_edges)
    max_neighbours = 0
    for key in neighbours_dict:
        if len(neighbours_dict[key]) > max_neighbours:
            max_neighbours = len(neighbours_dict[key])
    return max_neighbours

def get_max_bow_neighbours(path_nodes, path_edges):
    # largest total bag-of-words length summed over all neighbours of a node
    node_bow_len_dict = get_node_bow_len(path_nodes, path_edges)
    neighbours_dict = get_neighbours_dict(path_nodes, path_edges)
    max_neighbour_bow_len = 0
    for key in neighbours_dict:
        temp_bow_max_len = 0
        for neighbour in neighbours_dict[key]:
            temp_bow_max_len += node_bow_len_dict[neighbour]
        if temp_bow_max_len > max_neighbour_bow_len:
            max_neighbour_bow_len = temp_bow_max_len
    return max_neighbour_bow_len

def add_max_values_to_graph(path_nodes, path_edges):    # update the graph with the max values
    Cora_graph = get_graph(path_nodes, path_edges)
    Cora_graph.graph["bow"]["maxlen_neighbours"] = get_max_bow_neighbours(path_nodes, path_edges)
    Cora_graph.graph["paper_id"]["maxlen_neighbours"] = get_max_neighbours(path_nodes, path_edges)
    return Cora_graph

def get_init_emb(rand_type="normal_random", dimension=128, quantity=1433):
    return initialize.get_embeddings(rand_type=rand_type, dimension=dimension, quantity=quantity)

def write_pickle_graph_file(path_nodes="/home/utaemon/SP/cora/cora.content", path_edges="/home/utaemon/SP/cora/cora.cites", path_output_graph="/home/utaemon/SP/"):
    g = add_max_values_to_graph(path_nodes, path_edges)
    with open(path_output_graph + "graph.pkl", "wb") as output:
        pkl.dump(g, output)

def read_pickle_graph(path="/home/utaemon/SP/graph.pkl"):
    with open(path, 'rb') as f:
        graph = pkl.load(f)
    return graph

if __name__ == "__main__":
    # execute only if run as a script
    get_graph(path_nodes="/home/utaemon/SP/cora/cora.content", path_edges="/home/utaemon/SP/cora/cora.cites")
    get_init_emb(rand_type="normal_random", dimension=128, quantity=1433)
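A short usage sketch for the module above, assuming it is saved as cora_graph.py (the file name is not shown on this page) and that a local copy of the Cora dataset sits under cora/ (not part of the commit):

from cora_graph import write_pickle_graph_file, read_pickle_graph

# build the graph with the neighbourhood maxima attached, pickle it, read it back
write_pickle_graph_file(path_nodes="cora/cora.content", path_edges="cora/cora.cites", path_output_graph="./")
g = read_pickle_graph(path="./graph.pkl")
print(len(g))                                     # 2708 nodes for the standard Cora release
print(g.graph["bow"]["maxlen"])                   # longest bag of words of a single paper
print(g.graph["paper_id"]["maxlen_neighbours"])   # highest node degree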
File added
File added
"""
Initialize Embeddings for n dimensions.
Arrays are initialized in normal or uniform random format.
The seed has the default value 1.
#Usage
Embeddings.normal_random(dimension = 128, seed = 1)
Embeddings.uniform_random(dimension = 128, seed = 1)
or to get a set of Embeddings:
get_embeddings(rand_type="normal_random", dimension = 128, quantity=1)
"""
import numpy as np
class Embeddings:
def normal_random(dimension = 128, seed = 1):
np.random.seed(seed)
return np.random.normal(size=dimension)
def uniform_random(dimension = 128, seed = 1):
np.random.seed(seed)
return np.random.uniform(size=dimension)
def get_embeddings(rand_type="normal_random", dimension = 128, quantity=1):
emb_dict = {}
if rand_type == "normal_random":
for i in range(0, quantity):
emb_dict[i] = Embeddings.normal_random(dimension = dimension, seed = i)
elif rand_type == "uniform_random":
for i in range(0, quantity):
emb_dict[i] = Embeddings.uniform_random(dimension = dimension, seed = i)
return emb_dict
\ No newline at end of file
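A quick sketch of what get_embeddings returns (not part of the commit): a dict mapping each index 0..quantity-1 to one random vector, and because the index is also the seed, repeated calls reproduce the same vectors.

from initialize import Embeddings, get_embeddings

emb = get_embeddings(rand_type="uniform_random", dimension=4, quantity=2)
print(sorted(emb))   # [0, 1]
print(emb[1])        # four uniform values drawn with seed 1
# reproducible: index i reseeds numpy with i before drawing
assert (emb[1] == Embeddings.uniform_random(dimension=4, seed=1)).all()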
@@ -18,7 +18,7 @@ import pickle as pkl
 import numpy as np
 from sklearn.linear_model import LogisticRegression
 import random
-import random_nodes
+import random_nodes_for_node_classification
 import sys
 import argparse
@@ -121,7 +121,7 @@ if __name__ == "__main__":
     parser.add_argument("-i", "--iterations", type=int, default = 10, help="number of iterations of node classification. Counter of iteration is random seed.")
     parser.add_argument("-n", "--number", type=int, default = 20, help="number of instances per class for training")
     parser.add_argument("-t", "--testset", type=int, default = 1000, help="number of random instances in testset")
-    parser.add_argument("-c", "--regularization", type=int|float, default=0.1, help="Inverse of regularization strength")
+    parser.add_argument("-c", "--regularization", type=float, default=0.1, help="Inverse of regularization strength")
     args = parser.parse_args()
     if "-s" in sys.argv[1:]:
         node_classification(path_graph=args.graph, path_embeddings=args.embeddings, seed= args.seed, num_per_class=args.number, C=args.regularization)
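The second hunk above repairs an argparse crash: type must be a callable that argparse applies to the raw argument string, and int|float is not one (before Python 3.10 the expression itself raises TypeError; from 3.10 on it builds a types.UnionType, which cannot be called). Plain float also accepts integer input, which covers the int-or-float intent. A minimal standalone check (not part of the commit):

import argparse

parser = argparse.ArgumentParser()
# float is applied to the raw string, so both "3" and "0.1" parse cleanly
parser.add_argument("-c", "--regularization", type=float, default=0.1,
                    help="Inverse of regularization strength")
args = parser.parse_args(["-c", "3"])
print(args.regularization)   # 3.0 -- integer input is accepted and widened to float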
File added
File added
File added