Commit 7ed4414f authored by toyota

change paths

parent 13518fdc
cora.py

@@ -22,7 +22,7 @@ import pickle as pkl
 def list_of_classes():
     return ["Case_Based", "Genetic_Algorithms", "Neural_Networks", "Probabilistic_Methods", "Reinforcement_Learning", "Rule_Learning", "Theory"]

-def read_file_and_get_nodes(graph_name, path="/cora_data/cora.content"):
+def read_file_and_get_nodes(graph_name, path="/../../data/cora/raw/cora.content"):
     class_list = list_of_classes()
     max_bow_len = 0
     node_mapping = {}
@@ -50,7 +50,7 @@ def read_file_and_get_nodes(graph_name, path="/cora_data/cora.content"):
     graph_name.graph["paper_id"] = {"maxlen": 1, "vocab": (len(graph_name)), "lengths": np.ones(len(graph_name))}
     return node_mapping

-def read_file_and_get_edges(graph_name, node_mapping, path="/cora_data/cora.cites"):
+def read_file_and_get_edges(graph_name, node_mapping, path="/../../data/cora/raw/cora.cites"):
     with open(path) as file:
         for line in file.readlines():
             a, b = line.split()
@@ -112,7 +112,7 @@ def add_max_values_to_graph(path_nodes, path_edges): #update
     Cora_graph.graph["paper_id"]["maxlen_neighbours"] = get_max_neighbours(path_nodes, path_edges)
     return Cora_graph

-def write_pickle_graph_file(path_nodes="/cora_data/cora.content", path_edges="/cora_data/cora.cites", path_output_graph = ""):
+def write_pickle_graph_file(path_nodes="/../../data/cora/raw/cora.content", path_edges="/../../data/cora/raw/cora.cites", path_output_graph = "/../../data/cora/graph/"):
     g = add_max_values_to_graph(path_nodes, path_edges)
     with open(path_output_graph + "graph.pkl", "wb") as output:
         pkl.dump(g, output)
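An aside on the hunk above: write_pickle_graph_file serializes the assembled graph with pickle, and read_pickle_graph (next hunk) loads it back for the downstream scripts. A minimal, self-contained sketch of that round trip, assuming the graph is a NetworkX object (suggested by the .graph attribute usage above); the toy graph and filename are illustrative only:

import pickle as pkl
import networkx as nx

# Stand-in graph; the script builds the real one from cora.content/cora.cites.
g = nx.Graph()
g.add_edge("paper_a", "paper_b")

# Dump it the same way write_pickle_graph_file does ...
with open("graph.pkl", "wb") as output:
    pkl.dump(g, output)

# ... and load it back, as read_pickle_graph later will.
with open("graph.pkl", "rb") as f:
    g2 = pkl.load(f)
print(list(g2.edges()))  # [('paper_a', 'paper_b')]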
@@ -125,8 +125,8 @@ def read_pickle_graph(path = "graph.pkl"): #will be used on node_classificati
 if __name__ == "__main__":
     # execute only if run as a script
     parser = argparse.ArgumentParser(description="Skript for building cora graph.")
-    parser.add_argument("-n", "--nodes", default="/cora_data/cora.content", help="path to file containing cora nodes")
+    parser.add_argument("-n", "--nodes", default="/../../data/cora/raw/cora.content", help="path to file containing cora nodes")
-    parser.add_argument("-e", "--edges", default="/home/utaemon/SP/cora/cora.cites", help="path to file containing edges/citations")
+    parser.add_argument("-e", "--edges", default="/../../data/cora/raw/cora.cites", help="path to file containing edges/citations")
-    parser.add_argument("-o", "--output", default="", help="path where the graph should be saved")
+    parser.add_argument("-o", "--output", default="/../../data/cora/graph/", help="path where the graph should be saved")
     args = parser.parse_args()
     write_pickle_graph_file(path_nodes=args.nodes, path_edges=args.edges, path_output_graph=args.output)
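One caveat with the new defaults throughout this commit: a path that begins with "/" is absolute, so the leading slash makes the "../.." segments collapse against the filesystem root instead of climbing out of the script's directory. A minimal sketch of how they actually resolve; the pathlib alternative is a hypothetical suggestion, not part of this commit:

import os
from pathlib import Path

# ".." at the root stays at the root, so the new default resolves
# to "/data/cora/raw/cora.content".
print(os.path.normpath("/../../data/cora/raw/cora.content"))

# If the intent was "two levels above this script", anchoring at __file__
# would express that (hypothetical helper, assumes script context):
DATA = Path(__file__).resolve().parents[2] / "data"
print(DATA / "cora" / "raw" / "cora.content")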
node_classification.py

@@ -1,7 +1,5 @@
 #!/usr/bin/env python3
 """
-@author: Utaemon Toyota
-@date: 31.1.2019
 @project: Software Projekt @ Heidelberg University, Institute for Computational Linguistics
 @requirements: cora.py and random_nodes_for_node_classification.py as well as the cora data
 @usage: python3 node_classification.py [-g] [-e] [-s] [-i] [-n]
@@ -45,7 +43,7 @@ def get_class_list(path):
     return np.array(class_list)

 #------------------------classification
-def classify(C, seed = 0, num = 20, num_test_instances = 1000, path_graph = "graph.pkl", path_emb = "cora_embeddings_uniform_m20.pkl"):
+def classify(C, seed = 0, num = 20, num_test_instances = 1000, path_graph = "/../../data/cora/graph/graph.pkl", path_emb = "/../../data/cora/embeddings/cora_embeddings_uniform_m20.pkl"):
     training_nodes = training(path_graph, seed=seed, num = num)
     emb = get_embeddings(path_emb)
     cl = get_class_list(path_graph)
@@ -72,7 +70,7 @@ def classify(C, seed = 0, num = 20, num_test_instances = 1000, path_graph = "gra
     print (score)
     return score

-def classify_func(range_seeds = 10, num = 20, num_test_instances = 1000, path_graph = "graph.pkl", path_emb = "cora_embeddings_uniform_m20.pkl"):
+def classify_func(range_seeds = 10, num = 20, num_test_instances = 1000, path_graph = "graph.pkl", path_emb = "/../../data/cora/embeddings/cora_embeddings_uniform_m20.pkl"):
     C_li = [0.01, 0.1, 0.5, 1.0, 5.0, 10.0]
     for i in range(range_seeds):
         print ("Iteration/Random Seed:", i)
@@ -80,7 +78,7 @@ def classify_func(range_seeds = 10, num = 20, num_test_instances = 1000, path_gr
         classify(C, seed=i, num = num, num_test_instances = num_test_instances, path_graph = path_graph, path_emb = path_emb)

 #------------------------Node Classification
-def node_classification(path_graph = "graph.pkl", path_embeddings = "cora_embeddings_uniform_m20.pkl", num_test_instances = 1000, seed=20, num_per_class = 20, C = 0.1):
+def node_classification(path_graph = "/../../data/cora/graph/graph.pkl", path_embeddings = "/../../data/cora/embeddings/cora_embeddings_uniform_m20.pkl", num_test_instances = 1000, seed=20, num_per_class = 20, C = 0.1):
     logisticRegr = LogisticRegression(C=C, solver='liblinear', multi_class='ovr')
     training_nodes = training(path_graph, seed=seed, num = num_per_class)
     emb = get_embeddings(path_embeddings)
@@ -113,7 +111,7 @@ def node_classification(path_graph = "graph.pkl", path_embeddings = "cora_embedd
     #print ("Confusion Matrix:\n", conf_matrix)
     return score_macro, conf_matrix

-def node_classification_random_seeds(path_graph = "graph.pkl", path_embeddings = "cora_embeddings_uniform_m20.pkl", num_test_instances = 1000, num_per_class = 20, iterations = 50, C = 0.1):
+def node_classification_random_seeds(path_graph = "/../../data/cora/graph/graph.pkl", path_embeddings = "/../../data/cora/embeddings/cora_embeddings_uniform_m20.pkl", num_test_instances = 1000, num_per_class = 20, iterations = 50, C = 0.1):
     scores = []
     for i in range (0,iterations):
         scores.append(node_classification(path_graph = path_graph, path_embeddings = path_embeddings, num_test_instances = num_test_instances, seed=i, num_per_class = num_per_class, C=C)[0])
@@ -133,8 +131,8 @@ if __name__ == "__main__":
     # execute only if run as a script
     parser = argparse.ArgumentParser(description="Node Classification script.")
-    parser.add_argument("-g", "--graph", default = "graph.pkl", help="path to graph")
+    parser.add_argument("-g", "--graph", default = "/../../data/cora/graph/graph.pkl", help="path to graph")
-    parser.add_argument("-e", "--embeddings", default = "cora_embeddings_uniform_m20.pkl", help="path to embeddings")
+    parser.add_argument("-e", "--embeddings", default = "/../../data/cora/embeddings/cora_embeddings_uniform_m20.pkl", help="path to embeddings")
     parser.add_argument("-s", "--seed", type=int, help="random seed for one node classification. If this will be specified, always the function node_classification() will be executed.")
     parser.add_argument("-i", "--iterations", type=int, default = 10, help="number of iterations of node classification. Counter of iteration is random seed.")
     parser.add_argument("-n", "--number", type=int, default = 20, help="number of instances per class for training")
random_nodes_for_node_classification.py

@@ -27,7 +27,7 @@ def get_random_num_nodes(set_elm, num, seed):
     random.seed(seed)
     return set(random.sample(set_elm, num))

-def get_num_random_nodes_for_all_classes_read(path = "graph.pkl", num = 20, seed = 1):
+def get_num_random_nodes_for_all_classes_read(path = "/../../data/cora/graph/graph.pkl", num = 20, seed = 1):
     """get specific number of nodes per class, same number for all classes"""
     cora_dict = dict_of_node_classes_read(path)
     sampled_random_id_set = set()
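The function above draws a fixed-size sample per class after seeding the RNG. A minimal sketch of that behavior, where class_to_nodes is a hypothetical stand-in for what dict_of_node_classes_read returns; note that random.sample stopped accepting sets in Python 3.11, hence the sorted() call here:

import random

def sample_per_class(class_to_nodes, num=20, seed=1):
    # Seed once so the same nodes are drawn for a given seed.
    random.seed(seed)
    sampled = set()
    for nodes in class_to_nodes.values():
        # sorted() gives random.sample a sequence (sets are rejected in 3.11+).
        sampled |= set(random.sample(sorted(nodes), num))
    return sampled

buckets = {"Theory": set(range(100)), "Neural_Networks": set(range(100, 200))}
print(len(sample_per_class(buckets, num=20, seed=1)))  # 40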
senseval_preprocessing.py

@@ -1,9 +1,7 @@
 #!/usr/bin/env python3
 """
-@author: Utaemon Toyota
-@date: 25.2.2019
 @project: Software Projekt @ Heidelberg University, Institute for Computational Linguistics
-@members: Nadia Arslan, Lyuba Dimitrova, Nicolas Weber, Utaemon Toyota
+@members: Nadia Arslan, Lyuba Dimitrova, Utaemon Toyota, Nicolas Weber
 @required data: Senseval english-all-word test data and their penn treebank files in the same directory.
 @usage: python3 senseval_preprocessing.py [-s] [-g] [-v]
 -s / --stopwords   Path to txt-file with stopwords
@@ -18,11 +16,11 @@ import pickle as pkl
 from nltk.stem import WordNetLemmatizer
 wnl = WordNetLemmatizer()

-file_path2 = "Senseval2/eng-all-words_seneval2.test.xml" #senseval2
+file_path2 = "/../../data/senseval2/raw/eng-all-words_seneval2.test.xml" #senseval2
-file_path3 = "Senseval3/english-all-words.xml" #senseval3
+file_path3 = "/../../data/senseval3/raw/english-all-words.xml" #senseval3

-tree_paths2 = {"d00": "Senseval2/wsj_0089.mrg", "d01": "Senseval2/wsj_0465.mrg", "d02": "Senseval2/wsj_1286.mrg"} #senseval2
+tree_paths2 = {"d00": "/../../data/senseval2/raw/wsj_0089.mrg", "d01": "/../../data/senseval2/raw/wsj_0465.mrg", "d02": "/../../data/senseval2/raw/wsj_1286.mrg"} #senseval2
-tree_paths3 = {"d000": "Senseval3/cl23.mrg", "d001": "Senseval3/wsj_1695.mrg", "d002":"Senseval3/wsj_1778.mrg"} #senseval3
+tree_paths3 = {"d000": "/../../data/senseval3/raw/cl23.mrg", "d001": "/../../data/senseval3/raw/wsj_1695.mrg", "d002":"/../../data/senseval3/raw/wsj_1778.mrg"} #senseval3

 def get_stopword_list(stop_path):
     with open (stop_path, "r") as f:
@@ -246,7 +244,7 @@ def get_sats(tokens, info):
         new_info.append(info[idx])
     return [new_tokens, new_info]

-def write_pkl(version = 3, stop_path="stopwords.txt", gloss_path = "gloss_mapping.txt"):
+def write_pkl(version = 3, stop_path="/../../data/other/stopwords.txt", gloss_path = "/../../data/wordnet/mappings/gloss_mapping.txt"):
     file_path = ""
     tree_path = ""
     if version == 2:
@@ -264,8 +262,8 @@ def write_pkl(version = 3, stop_path="stopwords.txt", gloss_path = "gloss_mappin
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Senseval Preprocessing script.")
-    parser.add_argument("-s", "--stopwords", default="stopwords.txt", help="path to stopwords-txt-file")
+    parser.add_argument("-s", "--stopwords", default="/../../data/other/stopwords.txt", help="path to stopwords-txt-file")
-    parser.add_argument("-g", "--gloss", default="gloss_mapping.txt", help = "path to gloss mapping txt-file")
+    parser.add_argument("-g", "--gloss", default="/../../data/wordnet/mappings/gloss_mapping.txt", help = "path to gloss mapping txt-file")
     parser.add_argument("-v", "--version", default = 3, help="2 or 3 for senseval version")
     args = parser.parse_args()
     write_pkl(version=int(args.version), stop_path=args.stopwords, gloss_path=args.gloss)
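The module-level WordNetLemmatizer is the core of this preprocessing. A minimal sketch of the lemmatize-and-filter step it supports, with an inline stopword set standing in for the contents of stopwords.txt; requires the NLTK wordnet data:

from nltk.stem import WordNetLemmatizer  # requires: nltk.download("wordnet")

wnl = WordNetLemmatizer()
stopwords = {"the", "of", "is"}  # stand-in for the stopwords.txt contents

tokens = ["The", "art", "of", "change-ringing", "is", "peculiar"]
lemmas = [wnl.lemmatize(t.lower()) for t in tokens if t.lower() not in stopwords]
print(lemmas)  # ['art', 'change-ringing', 'peculiar']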