Skip to content
Snippets Groups Projects
Commit 384423aa authored by toyota's avatar toyota
Browse files

adapt paths

parent ba50f26c
No related branches found
No related tags found
No related merge requests found
LICENSE 0 → 100644
This software is distributed under the MIT License.
MIT License
Copyright (c) 2019 Nadia Arslan, Lyuba Dimitrova, Utaemon Toyota, Nicolas Weber
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
\ No newline at end of file
#!/usr/bin/env python3
"""
@author: Utaemon Toyota
@date: 31.1.2019
@project: Software Projekt @ Heidelberg University, Institute for Computational Linguistics
@requirements: cora.py and random_nodes_for_node_classification.py as well as the cora data
@usage: python3 node_classification.py [-g] [-e] [-s] [-i] [-n]
......@@ -62,7 +60,7 @@ def get_random_num_nodes(set_elm, num, seed):
random.seed(seed)
return set(random.sample(set_elm, num))
def get_num_random_nodes_for_all_classes_read(path = "graph.pkl", num = 20, seed = 1):
def get_num_random_nodes_for_all_classes_read(path = "/../../data/cora/graph/cora_graph.pkl", num = 20, seed = 1):
"""get specific number of nodes per class, same number for all classes"""
cora_dict = dict_of_node_classes_read(path)
sampled_random_id_set = set()
......@@ -73,7 +71,7 @@ def get_num_random_nodes_for_all_classes_read(path = "graph.pkl", num = 20, seed
#------------------------classification
def classify(C, seed = 0, num = 20, num_test_instances = 1000, path_graph = "graph.pkl", path_emb = "cora_embeddings_uniform_m20.pkl"):
def classify(C, seed = 0, num = 20, num_test_instances = 1000, path_graph = "/../../data/cora/graph/cora_graph.pkl", path_emb = "/../../data/cora/embeddings/merged_node_embeddings.pkl"):
training_nodes = training(path_graph, seed=seed, num = num)
emb = get_embeddings(path_emb)
cl = get_class_list(path_graph)
......@@ -100,7 +98,7 @@ def classify(C, seed = 0, num = 20, num_test_instances = 1000, path_graph = "gra
print (score)
return score
def classify_func(range_seeds = 10, num = 20, num_test_instances = 1000, path_graph = "graph.pkl", path_emb = "cora_embeddings_uniform_m20.pkl"):
def classify_func(range_seeds = 10, num = 20, num_test_instances = 1000, path_graph = "/../../data/cora/graph/cora_graph.pkl", path_emb = "/../../data/cora/embeddings/merged_node_embeddings.pkl"):
C_li = [0.01, 0.1, 0.5, 1.0, 5.0, 10.0]
for i in range(range_seeds):
print ("Iteration/Random Seed:", i)
......@@ -108,7 +106,7 @@ def classify_func(range_seeds = 10, num = 20, num_test_instances = 1000, path_gr
classify(C, seed=i, num = num, num_test_instances = num_test_instances, path_graph = path_graph, path_emb = path_emb)
#------------------------Node Classification
def node_classification(path_graph = "graph.pkl", path_embeddings = "cora_embeddings_uniform_m20.pkl", num_test_instances = 1000, seed=20, num_per_class = 20, C = 0.1):
def node_classification(path_graph = "/../../data/cora/graph/cora_graph.pkl", path_embeddings = "/../../data/cora/embeddings/merged_node_embeddings.pkl", num_test_instances = 1000, seed=20, num_per_class = 20, C = 0.1):
logisticRegr = LogisticRegression(C=C, solver='liblinear', multi_class='ovr')
training_nodes = training(path_graph, seed=seed, num = num_per_class)
emb = get_embeddings(path_embeddings)
......@@ -140,7 +138,7 @@ def node_classification(path_graph = "graph.pkl", path_embeddings = "cora_embedd
#print ("Confusion Matrix:\n", conf_matrix)
return score_macro, conf_matrix
def node_classification_random_seeds(path_graph = "graph.pkl", path_embeddings = "cora_embeddings_uniform_m20.pkl", num_test_instances = 1000, num_per_class = 20, iterations = 50, C = 0.1):
def node_classification_random_seeds(path_graph = "/../../data/cora/graph/cora_graph.pkl", path_embeddings = "/../../data/cora/embeddings/merged_node_embeddings.pkl", num_test_instances = 1000, num_per_class = 20, iterations = 50, C = 0.1):
scores = []
for i in range (0,iterations):
scores.append(node_classification(path_graph = path_graph, path_embeddings = path_embeddings, num_test_instances = num_test_instances, seed=i, num_per_class = num_per_class, C=C)[0])
......@@ -160,8 +158,8 @@ if __name__ == "__main__":
# execute only if run as a script
parser = argparse.ArgumentParser(description="Node Classification script.")
parser.add_argument("-g", "--graph", default = "graph.pkl", help="path to graph")
parser.add_argument("-e", "--embeddings", default = "cora_embeddings_uniform_m20.pkl", help="path to embeddings")
parser.add_argument("-g", "--graph", default = "/../../data/cora/graph/cora_graph.pkl", help="path to graph")
parser.add_argument("-e", "--embeddings", default = "/../../data/cora/embeddings/merged_node_embeddings.pkl", help="path to embeddings")
parser.add_argument("-s", "--seed", type=int, help="random seed for one node classification. If this will be specified, always the function node_classification() will be executed.")
parser.add_argument("-i", "--iterations", type=int, default = 10, help="number of iterations of node classification. Counter of iteration is random seed.")
parser.add_argument("-n", "--number", type=int, default = 20, help="number of instances per class for training")
......
"""
Getting a networkx graph from Cora. Graph can be saved in txt file. CARE: numpy-arrays are converted to lists due to errors (NumPy array is not JSON serializable).
Initialize Embeddings for n dimensions with initialize-module.
Arrays are initialized in normal or uniform random format (default = normal).
#Usage
get_graph(path_nodes="/home/utaemon/SP/cora/cora.content", path_edges="/home/utaemon/SP/cora/cora.cites")
-> return graph with nodes and edges
To write the graph informations in file:
def write_graph_to_file(path_nodes="/home/utaemon/SP/cora/cora.content", path_edges="/home/utaemon/SP/cora/cora.cites", path_output_graph = "/home/utaemon/SP/")
To write the dictionary with initalizing Embeddings in file:
def write_dict_to_file(rand_type="normal_random", dimension = 128, quantity=1433, path_output_emb = "/home/utaemon/SP/")
@project: Software Projekt @ Heidelberg University, Institute for Computational Linguistics
@requirements: cora data, numpy, networkX, pickle
@info
Getting a networkx graph from Cora. Graph will be saved in a pickle file.
@usage
python3 cora.py [-n] [-e] [-o]
-n / --nodes Path to cora file containing nodes
-e / --edges Path to cora file containing edges
-o / --output Path where the graph should be saved
"""
import argparse
import networkx as nx
import numpy as np
import pickle as pkl
import os
def list_of_classes():
    """Return the class label strings known to this module.

    A new list with the same seven labels, in the same fixed order, is
    built on every call, so callers may mutate the result freely.
    """
    return [
        "Case_Based",
        "Genetic_Algorithms",
        "Neural_Networks",
        "Probabilistic_Methods",
        "Reinforcement_Learning",
        "Rule_Learning",
        "Theory",
    ]
def read_file_and_get_nodes(graph_name, path):
def read_file_and_get_nodes(graph_name, path="/../../data/cora/raw/cora.content"):
class_list = list_of_classes()
max_bow_len = 0
node_mapping = {}
......@@ -49,7 +47,7 @@ def read_file_and_get_nodes(graph_name, path):
graph_name.graph["paper_id"] = {"maxlen": 1, "vocab": (len(graph_name)), "lengths": np.ones(len(graph_name))}
return node_mapping
def read_file_and_get_edges(graph_name, node_mapping, path):
def read_file_and_get_edges(graph_name, node_mapping, path="/../../data/cora/raw/cora.cites"):
with open(path) as file:
for line in file.readlines():
a, b = line.split()
......@@ -57,7 +55,6 @@ def read_file_and_get_edges(graph_name, node_mapping, path):
#---------------------create graph--------------
def get_graph(path_nodes, path_edges):
Cora_graph = nx.Graph()
node_mapping = read_file_and_get_nodes(Cora_graph, path_nodes)
......@@ -111,26 +108,21 @@ def add_max_values_to_graph(path_nodes, path_edges): #update
Cora_graph.graph["paper_id"]["maxlen_neighbours"] = get_max_neighbours(path_nodes, path_edges)
return Cora_graph
# not used, initialization happens in EP
'''
def get_init_emb(rand_type="normal_random", dimension = 128, quantity=1433):
return initialize.get_embeddings(rand_type=rand_type, dimension = dimension, quantity=quantity)
'''
def write_pickle_graph_file(path_nodes, path_edges, output_path):
def write_pickle_graph_file(path_nodes="/../../data/cora/raw/cora.content", path_edges="/../../data/cora/raw/cora.cites", path_output_graph = "/../../data/cora/graph/"):
g = add_max_values_to_graph(path_nodes, path_edges)
path = os.path.split(output_path)[0]
if not os.path.exists(path):
os.mkdir(path)
with open(output_path, "wb") as output:
with open(path_output_graph + "cora_graph.pkl", "wb") as output:
pkl.dump(g, output)
def read_pickle_graph(path = "graph.pkl"):
    """Load and return the object stored in the pickle file at *path*.

    Per the original inline note, this is meant to be used from
    node_classification.py for accessing the graph.

    path -- location of the pickle file (default "graph.pkl").

    Raises whatever ``open`` or ``pickle.load`` raise, e.g.
    FileNotFoundError when *path* does not exist.
    """
    # NOTE(security): unpickling can execute arbitrary code. Only call this
    # on files produced by a trusted source.
    with open(path, 'rb') as f:
        graph = pkl.load(f)
    return graph
'''
if __name__ == "__main__":
# execute only if run as a script
get_graph(path_nodes="/home/utaemon/SP/cora/cora.content", path_edges="/home/utaemon/SP/cora/cora.cites")
# get_init_emb(rand_type="normal_random", dimension = 128, quantity=1433)
'''
\ No newline at end of file
parser = argparse.ArgumentParser(description="Skript for building cora graph.")
parser.add_argument("-n", "--nodes", default="/../../data/cora/raw/cora.content", help="path to file containing cora nodes")
parser.add_argument("-e", "--edges", default="/../../data/cora/raw/cora.cites", help="path to file containing edges/citations")
parser.add_argument("-o", "--output", default="/../../data/cora/graph/", help="path where the graph should be saved")
args = parser.parse_args()
write_pickle_graph_file(path_nodes=args.nodes, path_edges=args.edges, path_output_graph=args.output)
# AUTHORS
Lyuba Dimitrova, Nadia Arslan, Nicolas Weber, Utaemon Toyota
# PROJECT
Softwareprojekt WS2018/19
Betreuerin: Prof. Dr. Anette Frank
Graph Embedding Propagation
# Senseval Preprocessing for Method 1
This is an implementation that provides preprocessed data for our Word Sense Disambiguation Method 1. The script produces JSON files for Senseval-2 and Senseval-3. These files contain sentence-split lists in which each lemmatized, lowercased word is stored in a tuple together with its corresponding WordNet 3.0 POS tag.
# Senseval Preprocessing for Method 2
This is an implementation that provides preprocessed data for our Word Sense Disambiguation Method 2. The script produces one pkl file for each document in Senseval-2/3, named after the document.
From the provided Senseval English all-words test data and their Penn Treebank annotations, only the useful information is extracted. Lemmas that are not included in the gloss mappings, or that are listed in the stopwords, are deleted. For multiword expressions, only the tag for the head token is saved; information about their satellites is discarded.
The resulting pickle file contains two lists. The first one contains the lemma and its tags as a list: [lemma, Penntreebank-tag, wordnet-tag, spacy-tag]. The second one records whether the entry is a head, a satellite or None: ['head', {'id': ['d000.s000.t001']}].
# Provided data
Senseval2
- Senseval 2 english-all-words test data
- Senseval 2 Penn Treebank data for the test documents (wsj_0089.mrg, wsj_0465.mrg, wsj_1286.mrg)
- Results / Gold mappings for Senseval2
Senseval3
- Senseval 3 english-all-words test data
- Senseval 3 Penn Treebank data for the test documents (cl23.mrg, wsj_1695.mrg, wsj_1778.mrg)
- Results / Gold mappings for Senseval3
gloss_mapping.txt
- Copied from WordNet_Preprocessing
stopwords.txt
- includes stopwords, which will be filtered out
Python3 scripts
- senseval_preprocessing.py
- preprocess_senseval_method1.py
## Dependencies
re - for regular expression matching
json - for saving the results for WSD method 1
pickle - for saving the resulting lists in a pkl-file for WSD method 2
nltk - WordNetLemmatizer from NLTK for lemmatizing
## Running Instructions Method 1
python[3] preprocess_senseval_method1.py
## Running Instructions Method 2
python[3] senseval_preprocessing.py [-s] [-g] [-v]
-s / --stopwords Path to txt-file with stopwords
-g / --gloss Path to txt-file with gloss mappings
-v / --version valid input: 2 or 3 for senseval 2 / 3
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment