Skip to content
Snippets Groups Projects
Commit 11c63b87 authored by weber's avatar weber
Browse files

Merge remote-tracking branch 'origin/master'

parents 244b64c4 95fcf1c4
No related branches found
No related tags found
No related merge requests found
Showing
with 55 additions and 166169 deletions
"""
Getting a networkx graph from Cora. Graph can be saved in txt file. CARE: numpy-arrays are converted to lists due to errors (NumPy array is not JSON serializable).
Initialize Embeddings for n dimensions with initialize-module.
Arrays are initialized in normal or uniform random format (default = normal).
#Usage
get_graph(path_nodes="/home/utaemon/SP/cora/cora.content", path_edges="/home/utaemon/SP/cora/cora.cites")
-> return graph with nodes and edges
To write the graph (including its maximum-length statistics) to a pickle file:
write_pickle_graph_file(path_nodes="/home/utaemon/SP/cora/cora.content", path_edges="/home/utaemon/SP/cora/cora.cites", path_output_graph = "/home/utaemon/SP/")
To write the dictionary of initial embeddings to a file:
def write_dict_to_file(rand_type="normal_random", dimension = 128, quantity=1433, path_output_emb = "/home/utaemon/SP/")
"""
import networkx as nx
import numpy as np
import initialize
import pickle as pkl
def list_of_classes():
    """Return the seven Cora class labels as a new list.

    The order matters: callers turn a label into a number by looking up its
    position in this list.
    """
    labels = (
        "Case_Based",
        "Genetic_Algorithms",
        "Neural_Networks",
        "Probabilistic_Methods",
        "Reinforcement_Learning",
        "Rule_Learning",
        "Theory",
    )
    return list(labels)
def read_file_and_get_nodes(graph_name, path="/home/utaemon/SP/cora/cora.content"):
    """Parse a Cora ``.content`` file and add one node per paper to the graph.

    Every non-empty line is read as ``<paper_id> <bag-of-words flags...> <class>``.
    Papers are renumbered consecutively from 0 in file order, and each node gets
    three attributes:

    * ``"class"``    - index of the class label in ``list_of_classes()``
    * ``"bow"``      - array of the positions whose flag is ``"1"``
    * ``"paper_id"`` - one-element list holding the new integer node id

    Graph-level statistics are stored in ``graph_name.graph["bow"]`` and
    ``graph_name.graph["paper_id"]``.

    Args:
        graph_name: Graph that is filled in place (e.g. a networkx ``Graph``).
        path: Location of the ``.content`` file.

    Returns:
        dict mapping every original paper id (string) to its new integer id.

    Raises:
        ValueError: If a line carries a class label that is not part of
            ``list_of_classes()``.
    """
    class_list = list_of_classes()
    max_bow_len = 0
    node_mapping = {}
    bow_lengths = []  # collected in a list; appending to an array copies it each time
    with open(path) as file:
        counter = 0
        for line in file:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines, e.g. a trailing newline at EOF
            node_mapping[fields[0]] = counter
            paper_class = class_list.index(fields[-1])  # numeralize the class label
            flags = np.array(fields[1:-1], dtype=str)
            paper_bow = np.where(flags == "1")[0]  # indices of the words that occur
            bow_lengths.append(len(paper_bow))
            max_bow_len = max(max_bow_len, len(paper_bow))
            # Attributes are passed through add_node() so that no access to the
            # removed ``Graph.node`` attribute is needed.
            graph_name.add_node(
                counter,
                **{"class": paper_class, "bow": paper_bow, "paper_id": [counter]}
            )
            counter += 1
    graph_name.graph["bow"] = {
        "maxlen": max_bow_len,
        "vocab": 1433,
        "lengths": np.array(bow_lengths, dtype=float),
    }
    graph_name.graph["paper_id"] = {
        "maxlen": 1,
        "vocab": len(graph_name),
        "lengths": np.ones(len(graph_name)),
    }
    return node_mapping
def read_file_and_get_edges(graph_name, node_mapping, path="/home/utaemon/SP/cora/cora.cites"):
    """Read a Cora ``.cites`` file and add one edge per line to the graph.

    Every non-empty line must hold exactly two whitespace-separated paper ids.
    Both ids are translated through ``node_mapping`` before the edge is added.

    Args:
        graph_name: Graph that receives the edges via ``add_edge``.
        node_mapping: dict translating original paper ids (strings) to node ids.
        path: Location of the ``.cites`` file.

    Raises:
        KeyError: If an id in the file is missing from ``node_mapping``.
        ValueError: If a non-empty line does not contain exactly two ids.
    """
    with open(path) as file:
        for line in file:  # stream the file instead of loading every line first
            fields = line.split()
            if not fields:
                continue  # a blank line would otherwise break the unpacking below
            a, b = fields
            graph_name.add_edge(node_mapping[a], node_mapping[b])
#---------------------create graph--------------
def get_graph(path_nodes, path_edges):
    """Build the undirected Cora graph from a node file and an edge file.

    The nodes are read first because the edge reader needs the id mapping
    produced while reading them.
    """
    graph = nx.Graph()
    id_lookup = read_file_and_get_nodes(graph, path_nodes)
    read_file_and_get_edges(graph, id_lookup, path_edges)
    return graph
#----------for getting maxima--------
def get_neighbours_dict(path_nodes, path_edges):
    """Return a dict mapping each edge endpoint to the set of its neighbours.

    Nodes that do not appear in any edge get no entry.
    """
    neighbours = {}
    for left, right in get_graph(path_nodes, path_edges).edges:
        # Record the edge in both directions.
        neighbours.setdefault(left, set()).add(right)
        neighbours.setdefault(right, set()).add(left)
    return neighbours
def get_node_bow_len(path_nodes, path_edges):
    """Return a dict mapping each node id to the length of its ``"bow"`` attribute."""
    graph = get_graph(path_nodes, path_edges)
    return {
        node_id: len(attributes["bow"])
        for node_id, attributes in graph.nodes(data=True)
    }
def get_max_neighbours(path_nodes, path_edges):
    """Return the largest neighbour count of any node (0 if there are no edges)."""
    neighbour_sets = get_neighbours_dict(path_nodes, path_edges).values()
    return max(map(len, neighbour_sets), default=0)
def get_max_bow_neighbours(path_nodes, path_edges):
    """Return the largest summed bag-of-words length over any node's neighbours.

    For every node that has neighbours, the ``"bow"`` lengths of those
    neighbours are added up; the maximum of these sums is returned (0 if the
    graph has no edges).
    """
    bow_len_of = get_node_bow_len(path_nodes, path_edges)
    neighbours_of = get_neighbours_dict(path_nodes, path_edges)
    per_node_totals = (
        sum(bow_len_of[neighbour] for neighbour in group)
        for group in neighbours_of.values()
    )
    return max(per_node_totals, default=0)
def add_max_values_to_graph(path_nodes, path_edges): #update Graph with max values
    """Build the Cora graph and store neighbourhood maxima in its graph attributes.

    Two values are added:

    * ``graph["bow"]["maxlen_neighbours"]`` - the largest sum of ``"bow"``
      lengths over the neighbours of a single node
    * ``graph["paper_id"]["maxlen_neighbours"]`` - the largest number of
      neighbours of a single node

    Both maxima are computed from the graph built here, so the input files are
    parsed only once.

    Args:
        path_nodes: Location of the ``.content`` file.
        path_edges: Location of the ``.cites`` file.

    Returns:
        The graph with the two ``"maxlen_neighbours"`` entries filled in.
    """
    Cora_graph = get_graph(path_nodes, path_edges)
    bow_len = {node: len(data["bow"]) for node, data in Cora_graph.nodes(data=True)}
    # Cora_graph[node] is the neighbour view of ``node``; nodes without edges
    # contribute 0 and therefore never change the maximum.
    Cora_graph.graph["bow"]["maxlen_neighbours"] = max(
        (sum(bow_len[nb] for nb in Cora_graph[node]) for node in Cora_graph),
        default=0,
    )
    Cora_graph.graph["paper_id"]["maxlen_neighbours"] = max(
        (len(Cora_graph[node]) for node in Cora_graph),
        default=0,
    )
    return Cora_graph
def get_init_emb(rand_type="normal_random", dimension = 128, quantity=1433):
    """Return initial embeddings by forwarding all arguments to ``initialize.get_embeddings``."""
    options = {
        "rand_type": rand_type,
        "dimension": dimension,
        "quantity": quantity,
    }
    return initialize.get_embeddings(**options)
def write_pickle_graph_file(path_nodes="/home/utaemon/SP/cora/cora.content", path_edges="/home/utaemon/SP/cora/cora.cites", path_output_graph = "/home/utaemon/SP/"):
    """Build the Cora graph with its maxima and pickle it as ``graph.pkl``.

    ``path_output_graph`` is used as a plain string prefix of the file name, so
    a directory has to end with a path separator.
    """
    graph = add_max_values_to_graph(path_nodes, path_edges)
    target = path_output_graph + "graph.pkl"
    with open(target, "wb") as handle:
        pkl.dump(graph, handle)
def read_pickle_graph(path = "/home/utaemon/SP/graph.pkl"):
    """Load and return the object stored in the pickle file at ``path``."""
    # NOTE: unpickling can run arbitrary code - only load files you trust.
    with open(path, 'rb') as handle:
        return pkl.load(handle)
if __name__ == "__main__":
    # Runs only when the module is executed as a script: build the graph and
    # the initial embeddings once. Both results are discarded.
    get_graph("/home/utaemon/SP/cora/cora.content", "/home/utaemon/SP/cora/cora.cites")
    get_init_emb("normal_random", 128, 1433)
Nodes per Class
0
298
1
418
2
818
3
426
4
217
5
180
6
351
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: Utaemon Toyota
@project: Software Projekt
@date: 25.12.2018
"""
import random
import cora
def get_graph(path_nodes, path_edges):
    """Return the Cora graph built by ``cora.get_graph`` from the two input files."""
    graph = cora.get_graph(path_nodes, path_edges)
    return graph
def dict_of_node_classes(path_nodes, path_edges):
    """Group the node ids of the Cora graph by their ``"class"`` attribute.

    Args:
        path_nodes: Location of the ``.content`` file.
        path_edges: Location of the ``.cites`` file.

    Returns:
        dict mapping each class value to the set of node ids with that class.
    """
    # The local is deliberately not called ``cora`` so the imported module of
    # that name stays reachable inside this function.
    graph = get_graph(path_nodes, path_edges)
    class_dict = {}
    # nodes(data=True) replaces the ``Graph.node`` attribute, which no longer
    # exists in current networkx releases.
    for node, attributes in graph.nodes(data=True):
        class_dict.setdefault(attributes["class"], set()).add(node)
    return class_dict
def get_random_num_nodes(set_elm, num):
    """Return ``num`` distinct random elements of ``set_elm`` as a set.

    Args:
        set_elm: Collection to draw from (typically a set of node ids).
        num: Number of elements to draw.

    Returns:
        A set with ``num`` elements taken from ``set_elm``.

    Raises:
        ValueError: If ``num`` is larger than the number of elements.
    """
    # random.sample() requires a sequence; passing a set raises TypeError on
    # Python 3.11+, so the population is materialised as a tuple first.
    return set(random.sample(tuple(set_elm), num))
def get_num_random_nodes_for_all_classes(num = 3, path_nodes="/home/utaemon/SP/cora/cora.content", path_edges="/home/utaemon/SP/cora/cora.cites"):
    """Sample ``num`` random node ids from every class and return them as one set.

    The same number of nodes is drawn for each class; the result does not
    record which class an id came from.
    """
    nodes_by_class = dict_of_node_classes(path_nodes, path_edges)
    sampled_ids = set()
    for members in nodes_by_class.values():
        sampled_ids |= get_random_num_nodes(members, num)
    # To return a dict of class -> random ids instead, build
    # {key: get_random_num_nodes(nodes_by_class[key], num) for key in nodes_by_class}.
    return sampled_ids
#get random nodes
def get_num_of_random_nodes(num = 3, path_nodes="/home/utaemon/SP/cora/cora.content", path_edges="/home/utaemon/SP/cora/cora.cites"):
    """Return ``num`` distinct random node ids of the Cora graph as a set.

    Args:
        num: Number of node ids to draw.
        path_nodes: Location of the ``.content`` file.
        path_edges: Location of the ``.cites`` file.

    Raises:
        ValueError: If ``num`` exceeds the number of nodes in the graph.
    """
    cora_nodes = set(get_graph(path_nodes, path_edges).nodes)
    # random.sample() needs a sequence; a set raises TypeError on Python 3.11+.
    return set(random.sample(tuple(cora_nodes), num))
#get a dictionary with class: ids
def get_dict_of_nodes(*args):
    """Return a dict of named random node sets.

    *args: tuples of set name and number of nodes in it, for example ("a", 10).
    """
    # NOTE(review): every set is sampled independently from the whole graph, so
    # two sets may share nodes - confirm this is intended by the callers.
    return {spec[0]: get_num_of_random_nodes(spec[1]) for spec in args}
#get features from nodes
def get_features_for_nodes(path_nodes="/home/utaemon/SP/cora/cora.content", path_edges="/home/utaemon/SP/cora/cora.cites"):
    """Return the attribute dicts of a few randomly chosen Cora nodes.

    Args:
        path_nodes: Location of the ``.content`` file.
        path_edges: Location of the ``.cites`` file.

    Returns:
        dict mapping each sampled node id to that node's attribute dict.
    """
    cora_graph = get_graph(path_nodes, path_edges)
    # Sample from the graph described by the given paths. Previously the
    # sampler ran with its own default paths, so the ids could belong to a
    # different graph than the one the features are read from.
    sampled_nodes = get_num_of_random_nodes(path_nodes=path_nodes, path_edges=path_edges)
    # ``nodes[...]`` replaces the ``Graph.node`` attribute, which no longer
    # exists in current networkx releases.
    return {node_id: cora_graph.nodes[node_id] for node_id in sampled_nodes}
#!/usr/bin/env python3
"""
@author: Utaemon Toyota
@project: Software Projekt
@date: 14.01.2018
"""
import cora
import networkx as nx
import pickle as pkl
def get_graph(graph_path):
    """Return the graph unpickled from ``graph_path`` via ``cora.read_pickle_graph``."""
    return cora.read_pickle_graph(graph_path)
def get_start_node(graph):
    """Return the first node yielded when iterating ``graph.nodes``.

    Raises IndexError when the graph has no nodes.
    """
    return [*graph.nodes][0]
def get_neighbours(nb):
    """Return the keys of the mapping-like ``nb`` as a list."""
    return [*dict(nb)]
def get_subgraph(graph_path):
    """Return a small test graph cut out of the pickled Cora graph.

    The kept nodes are the neighbours of the graph's first node together with
    the neighbours of those neighbours. The result is a new ``nx.Graph`` built
    from the corresponding subgraph.
    """
    full_graph = get_graph(graph_path)
    origin = get_start_node(full_graph)
    first_hop = set(get_neighbours(full_graph[origin]))
    second_hop = {
        reached
        for node in first_hop
        for reached in get_neighbours(full_graph[node])
    }
    kept_nodes = list(first_hop.union(second_hop))
    return nx.Graph(full_graph.subgraph(kept_nodes))
def write_test_pickle(output = "test_cora.pkl", graph_path = "/home/utaemon/SP/cora_graph.pkl"):
    """Pickle the test subgraph of the graph stored at ``graph_path``.

    Args:
        output: Path of the pickle file to write.
        graph_path: Path of the pickled full graph to read.
    """
    # Build the subgraph before opening the target: if loading fails, no
    # empty or truncated output file is left behind.
    subgraph = get_subgraph(graph_path)
    # A separate name is used for the file handle so that the ``output`` path
    # argument is not overwritten.
    with open(output, "wb") as handle:
        pkl.dump(subgraph, handle)
#write_test_pickle()
#cora_test = cora.read_pickle_graph("test_cora.pkl")
#print (cora_test.edges)
File deleted
This diff is collapsed.
"""
Initialize Embeddings for n dimensions.
Arrays are initialized in normal or uniform random format.
The seed has the default value 1.
#Usage
Embeddings.normal_random(dimension = 128, seed = 1)
Embeddings.uniform_random(dimension = 128, seed = 1)
or to get a set of Embeddings:
get_embeddings(rand_type="normal_random", dimension = 128, quantity=1)
"""
import numpy as np
class Embeddings:
    """Factories for reproducible random embedding vectors.

    Both factories are static methods, so they can be called on the class
    (``Embeddings.normal_random(...)``) as well as on an instance.
    """

    @staticmethod
    def normal_random(dimension = 128, seed = 1):
        """Return ``dimension`` samples drawn from the standard normal distribution.

        A private ``RandomState`` seeded with ``seed`` is used: the same seed
        always gives the same vector, and NumPy's global random state is not
        reseeded as a side effect.
        """
        return np.random.RandomState(seed).normal(size=dimension)

    @staticmethod
    def uniform_random(dimension = 128, seed = 1):
        """Return ``dimension`` samples drawn uniformly from ``[0, 1)``.

        A private ``RandomState`` seeded with ``seed`` is used: the same seed
        always gives the same vector, and NumPy's global random state is not
        reseeded as a side effect.
        """
        return np.random.RandomState(seed).uniform(size=dimension)
def get_embeddings(rand_type="normal_random", dimension = 128, quantity=1):
    """Return a dict with ``quantity`` random embeddings, keyed ``0 .. quantity - 1``.

    ``rand_type`` selects the generator: ``"normal_random"`` or
    ``"uniform_random"``. Entry ``i`` is generated with seed ``i``. Any other
    ``rand_type`` yields an empty dict.
    """
    if rand_type == "normal_random":
        generate = Embeddings.normal_random
    elif rand_type == "uniform_random":
        generate = Embeddings.uniform_random
    else:
        return {}
    return {
        index: generate(dimension = dimension, seed = index)
        for index in range(quantity)
    }
\ No newline at end of file
This diff is collapsed.
File deleted
This diff is collapsed.
# AUTHORS
Lyuba Dimitrova, Nadia Arslan, Nicolas Weber, Utaemon Toyota
# PROJECT
Softwareprojekt WS2018/19
Betreuerin: Prof. Dr. Anette Frank
Graph Embedding Propagation
# Cora Node Classification
To evaluate the trained graph and its embeddings, a node classification task is carried out. First, the Cora data is imported into a networkX graph, which is saved to a pickle file so that it can be used to train the embeddings with our EP-SP algorithm. Afterwards, the trained embeddings are evaluated with LibLinear L2-regularized logistic regression as provided by sklearn.
# Required Data
- Cora dataset saved in cora_data for building the graph
- Embeddings for node classification
# Dependencies
For cora.py
-networkx for building the graph
-numpy to save one-hot vocabulary vectors
-pickle to save data in a pickle file
For node_classification.py
-cora.py
-random_nodes_for_node_classification.py for getting random node sets for test, training and validation sets
-pickle
-numpy
-sklearn for evaluation
-random for getting random test, training and validation sets
-sys
-argparse
-heapq for getting a heatmap from confusion matrix
-sklearn for confusion matrix and f1 score
# Running instructions
For cora.py
...
For node_classification.py
python3 node_classification.py [-g] [-e] [-s] [-i] [-n] [-c]
-g / --graph Path to pickled networkX-graph
-e / --embeddings Path to pickled embeddings
-s / --seed Seed for randomization. If this argument is called, only a node classification for this specific seed will be executed
-i / --iterations Number of iterations of node classification. Counter of iteration is equal to random seed
-n / --number Number of instances per class for training
-c / --regularization Inverse of regularization strength
"""
@info
Getting a networkx graph from Cora. Graph can be saved in txt file. CARE: numpy-arrays are converted to lists due to errors (NumPy array is not JSON serializable).
Initialize Embeddings for n dimensions with initialize-module.
Arrays are initialized in normal or uniform random format (default = normal).
#Usage
get_graph(path_nodes="/home/utaemon/SP/cora/cora.content", path_edges="/home/utaemon/SP/cora/cora.cites")
@usage
get_graph(path_nodes="/cora_data/cora.content", path_edges="/cora_data/cora.cites")
-> return graph with nodes and edges
To write the graph informations in file:
def write_graph_to_file(path_nodes="/home/utaemon/SP/cora/cora.content", path_edges="/home/utaemon/SP/cora/cora.cites", path_output_graph = "/home/utaemon/SP/")
To write the dictionary with initalizing Embeddings in file:
def write_dict_to_file(rand_type="normal_random", dimension = 128, quantity=1433, path_output_emb = "/home/utaemon/SP/")
def write_graph_to_file(path_nodes="/cora_data/cora.content", path_edges="/cora_data/cora.cites", path_output_graph = "")
"""
import networkx as nx
import numpy as np
import initialize
import pickle as pkl
def list_of_classes():
    """Return the seven Cora class labels as a new list, in their fixed order."""
    labels = (
        "Case_Based",
        "Genetic_Algorithms",
        "Neural_Networks",
        "Probabilistic_Methods",
        "Reinforcement_Learning",
        "Rule_Learning",
        "Theory",
    )
    return list(labels)
def read_file_and_get_nodes(graph_name, path="/home/utaemon/SP/cora/cora.content"):
def read_file_and_get_nodes(graph_name, path="/cora_data/cora.content"):
class_list = list_of_classes()
max_bow_len = 0
node_mapping = {}
......@@ -49,7 +44,7 @@ def read_file_and_get_nodes(graph_name, path="/home/utaemon/SP/cora/cora.content
graph_name.graph["paper_id"] = {"maxlen": 1, "vocab": (len(graph_name)), "lengths": np.ones(len(graph_name))}
return node_mapping
def read_file_and_get_edges(graph_name, node_mapping, path="/home/utaemon/SP/cora/cora.cites"):
def read_file_and_get_edges(graph_name, node_mapping, path="/cora_data/cora.cites"):
with open(path) as file:
for line in file.readlines():
a, b = line.split()
......@@ -111,15 +106,12 @@ def add_max_values_to_graph(path_nodes, path_edges): #update
Cora_graph.graph["paper_id"]["maxlen_neighbours"] = get_max_neighbours(path_nodes, path_edges)
return Cora_graph
def get_init_emb(rand_type="normal_random", dimension = 128, quantity=1433):
    """Return initial embeddings by forwarding all arguments to ``initialize.get_embeddings``."""
    options = {
        "rand_type": rand_type,
        "dimension": dimension,
        "quantity": quantity,
    }
    return initialize.get_embeddings(**options)
def write_pickle_graph_file(path_nodes="/home/utaemon/SP/cora/cora.content", path_edges="/home/utaemon/SP/cora/cora.cites", path_output_graph = "/home/utaemon/SP/"):
def write_pickle_graph_file(path_nodes="/cora_data/cora.content", path_edges="/cora_data/cora.cites", path_output_graph = ""):
g = add_max_values_to_graph(path_nodes, path_edges)
with open(path_output_graph + "graph.pkl", "wb") as output:
pkl.dump(g, output)
def read_pickle_graph(path = "/home/utaemon/SP/graph.pkl"):
def read_pickle_graph(path = "graph.pkl"):
with open(path, 'rb') as f:
graph = pkl.load(f)
return graph
......@@ -127,5 +119,5 @@ def read_pickle_graph(path = "/home/utaemon/SP/graph.pkl"):
if __name__ == "__main__":
# execute only if run as a script
get_graph(path_nodes="/home/utaemon/SP/cora/cora.content", path_edges="/home/utaemon/SP/cora/cora.cites")
get_graph(path_nodes="/cora_data/cora.content", path_edges="/cora_data/cora.cites")
get_init_emb(rand_type="normal_random", dimension = 128, quantity=1433)
"""
Initialize Embeddings for n dimensions.
Arrays are initialized in normal or uniform random format.
The seed has the default value 1.
#Usage
Embeddings.normal_random(dimension = 128, seed = 1)
Embeddings.uniform_random(dimension = 128, seed = 1)
or to get a set of Embeddings:
get_embeddings(rand_type="normal_random", dimension = 128, quantity=1)
"""
import numpy as np
class Embeddings:
    """Factories for reproducible random embedding vectors.

    Both factories are static methods, so they can be called on the class
    (``Embeddings.normal_random(...)``) as well as on an instance.
    """

    @staticmethod
    def normal_random(dimension = 128, seed = 1):
        """Return ``dimension`` samples drawn from the standard normal distribution.

        A private ``RandomState`` seeded with ``seed`` is used: the same seed
        always gives the same vector, and NumPy's global random state is not
        reseeded as a side effect.
        """
        return np.random.RandomState(seed).normal(size=dimension)

    @staticmethod
    def uniform_random(dimension = 128, seed = 1):
        """Return ``dimension`` samples drawn uniformly from ``[0, 1)``.

        A private ``RandomState`` seeded with ``seed`` is used: the same seed
        always gives the same vector, and NumPy's global random state is not
        reseeded as a side effect.
        """
        return np.random.RandomState(seed).uniform(size=dimension)
def get_embeddings(rand_type="normal_random", dimension = 128, quantity=1):
    """Return a dict with ``quantity`` random embeddings, keyed ``0 .. quantity - 1``.

    ``rand_type`` selects the generator: ``"normal_random"`` or
    ``"uniform_random"``. Entry ``i`` is generated with seed ``i``. Any other
    ``rand_type`` yields an empty dict.
    """
    if rand_type == "normal_random":
        generate = Embeddings.normal_random
    elif rand_type == "uniform_random":
        generate = Embeddings.uniform_random
    else:
        return {}
    return {
        index: generate(dimension = dimension, seed = index)
        for index in range(quantity)
    }
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
......@@ -4,6 +4,7 @@
@author: Utaemon Toyota
@date: 31.1.2019
@project: Software Projekt @ Heidelberg University, Institute for Computational Linguistics
@requirements: cora.py and random_nodes_for_node_classification.py as well as the cora data
@usage: python3 node_classification.py [-g] [-e] [-s] [-i] [-n]
-g / --graph Path to pickled networkX-graph
-e / --embeddings Path to pickled embeddings
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment