Commit 121f9d68 authored by Victor Zimmermann

Saves graphs now.

parent ffce1462
@@ -7,7 +7,7 @@ matches a list of contexts to each. The method to achieve this is a modified
reimplementation of Véronis' Hyperlex (2004).
Example:
The function can be called with the following command.:
The function can be called with the following command:
$ python3 absinth.py
@@ -23,9 +23,15 @@ Modifiers:
"""
##########################
# Dependencies #
##########################
import sys
print('[a] Loading ' + sys.argv[0] + '.\n')
import config
import json
import networkx as nx # for visualisation
import numpy as np
import os # for reading files
@@ -42,6 +48,9 @@ from scipy import stats
nlp = spacy.load('en') # standard english nlp
##########################
# Preprocessing #
##########################
def read_dataset(data_path: str) -> (dict, dict):
"""Collects topics.txt and results.txt.
@@ -85,6 +94,102 @@ def read_dataset(data_path: str) -> (dict, dict):
return results, topics
##########################
# Induction #
##########################
def induce(topic_name: str, result_list: list) -> (nx.Graph, list, dict):
"""Induces word senses for a given topic from corpus.
Counts frequencies from corpus and search result list, builds graph from
these counts (with some filters). Root hubs (senses) are collected from
this graph.
Args:
topic_name: Target string.
result_list: List of search result (context) strings.
Returns:
A cooccurrence graph,
a list of root hub strings (senses),
and a dictionary of various statistics.
"""
stat_dict = dict()
stat_dict['target'] = topic_name
# In topics of more than two words, a leading 'the' can generally be removed without changing the sense.
if topic_name[:4] == 'the_' and topic_name.count('_') > 1:
target_string = topic_name[4:]
else:
target_string = topic_name
print('[a]', 'Counting nodes and edges.\t('+topic_name+')')
# Check whether frequencies were already counted in an earlier run.
node_dict_name = topic_name+'_node.json'
edge_dict_name = topic_name+'_edge.json'
graph_in_existence = False
for graph_name in os.listdir(config.graph):
if topic_name in graph_name:
graph_in_existence = True
with open(node_dict_name, 'r') as node_file, open(edge_dict_name, 'r') as edge_file:
node_freq_dict = json.load(node_file)
# JSON object keys are strings, so the (word, word) tuple keys are restored here.
edge_freq_dict = {tuple(key.split('|')): value for key, value in json.load(edge_file).items()}
break
if not graph_in_existence:
node_freq_dict, edge_freq_dict = frequencies(target_string, result_list)
with open(node_dict_name, 'w') as node_file, open(edge_dict_name, 'w') as edge_file:
node_file.write(json.dumps(node_freq_dict))
# json.dumps cannot serialise tuple keys; '|' is assumed never to occur inside tokens.
edge_file.write(json.dumps({'|'.join(key): value for key, value in edge_freq_dict.items()}))
# Builds graph from these dictionaries; also applies multiple filters.
print('[a]', 'Building graph.\t('+topic_name+')')
graph = build_graph(node_freq_dict, edge_freq_dict)
for string in topic_name.split('_'):
if string in graph.nodes:
graph.remove_node(string)
stat_dict['nodes'] = len(graph.nodes)
stat_dict['edges'] = len(graph.edges)
# Finds root hubs (senses) within the graph and applies further filters to them.
print('[a]', 'Collecting root hubs.\t('+topic_name+')')
root_hub_list = root_hubs(graph, edge_freq_dict)
# Adds the sense inventory to the statistics, with some common neighbors for context.
stat_dict['hubs'] = dict()
for root_hub in root_hub_list:
by_frequency = lambda node: edge_freq_dict[root_hub,node] \
if root_hub < node \
else edge_freq_dict[node, root_hub]
most_frequent_neighbor_list = sorted(graph.adj[root_hub],
key=by_frequency, reverse=True)
stat_dict['hubs'][root_hub] = most_frequent_neighbor_list[:6]
return graph, root_hub_list, stat_dict
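The load-or-compute logic above could also be factored into a small reusable helper. A minimal sketch under the same assumptions (tuple-keyed edge counts, '|' never occurring inside tokens); cached_frequencies and its parameters are hypothetical, not part of absinth.py:

import json
import os

def cached_frequencies(cache_dir, topic_name, compute):
    """Loads node/edge frequency dicts from JSON if cached, else computes and saves them.

    compute is a zero-argument callable returning (node_freq, edge_freq),
    where edge_freq uses (word, word) tuple keys.
    """
    node_path = os.path.join(cache_dir, topic_name + '_node.json')
    edge_path = os.path.join(cache_dir, topic_name + '_edge.json')
    if os.path.exists(node_path) and os.path.exists(edge_path):
        with open(node_path) as node_file, open(edge_path) as edge_file:
            node_freq = json.load(node_file)
            # Restore the tuple keys that were flattened for JSON.
            edge_freq = {tuple(key.split('|')): value
                         for key, value in json.load(edge_file).items()}
        return node_freq, edge_freq
    node_freq, edge_freq = compute()
    with open(node_path, 'w') as node_file, open(edge_path, 'w') as edge_file:
        json.dump(node_freq, node_file)
        # Flatten tuple keys, since JSON only allows string keys.
        json.dump({'|'.join(key): value for key, value in edge_freq.items()}, edge_file)
    return node_freq, edge_freq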
def frequencies(target_string: str, search_result_list: list) -> (dict, dict):
"""Counts occurrences of nodes and cooccurrences.
@@ -408,151 +513,16 @@ def root_hubs(graph: nx.Graph, edge_freq_dict: dict) -> list:
return hub_list
def components(graph: nx.Graph, root_hub_list: list, target_string: str) -> nx.Graph:
"""Builds minimum spanning tree from graph and removes singletons.
Applies components algorithm from Véronis (2004) and removes singletons.
Args:
graph: Undirected weighted graph.
root_hub_list: List of strings of root hubs of graph.
target_string: Root of minimum spanning tree.
Returns:
Minimum spanning tree with target as root and root hubs as direct
children. Singletons removed.
"""
graph_copy = deepcopy(graph)
graph_copy.add_node(target_string)
for root_hub in root_hub_list:
graph_copy.add_edge(target_string,root_hub,weight=0)
minimum_spanning_tree = nx.minimum_spanning_tree(graph_copy)
return minimum_spanning_tree
def score(graph: nx.Graph, component: str, root_hub_list: list) -> np.array:
"""Calculate score for a given component in a minimum spanning tree.
First the correct root for the component is chosen. If no root hub is
suitable, an empty array is returned. A score is calculated for the distance
of the component and its root and returned as part of an array filled with
zeroes.
Args:
graph: Minimum spanning tree.
component: Node (string) from which the distances are to be calculated.
root_hub_list: List of strings of root hubs (senses) of original graph.
Returns:
Array with one score for the correct root hub and filled with zeroes.
"""
root_hub_count = len(root_hub_list)
#Initialise score array.
score_array = np.zeros(root_hub_count)
# Find root of component.
distance_list = list()
for root_hub in root_hub_list:
if nx.has_path(graph, component, root_hub):
distance_list.append(1/(1+len(nx.shortest_path(graph, component, root_hub))))
else:
distance_list.append(0)
if sum(distance_list) == 0:
return score_array
root_idx = np.argmax(distance_list)
root = root_hub_list[root_idx]
shortest_path = nx.shortest_path(graph, component, root, 'weight')
total_weight = 0
# Add weights of every sub-path.
for i in range(1, len(shortest_path)):
sub_from, sub_to = shortest_path[i-1], shortest_path[i]
total_weight += graph[sub_from][sub_to]['weight']
score_array = np.zeros(root_hub_count)
score_array[root_idx] = 1/(1+total_weight)
return score_array
def induce(topic_name: str, result_list: list) -> (nx.Graph, list, dict):
"""Induces word senses for a given topic from corpus.
Counts frequencies from corpus and search result list, builds graph from
these counts (with some filters). Root hubs (senses) are collected from
this graph.
Args:
topic_name: Target string.
result_list: List of search result (context) strings.
Returns:
A cooccurrence graph,
a list of root hub strings (senses)
and dictionary of various statistics.
"""
stat_dict = dict()
stat_dict['target'] = topic_name
#in topics longer than two words, the leading 'the' can generally be removed without changing the sense
if topic_name[:4] == 'the_' and topic_name.count('_') > 1:
target_string = topic_name[4:]
else:
target_string = topic_name
print('[a]', 'Counting nodes and edges.\t('+topic_name+')')
node_freq_dict, edge_freq_dict = frequencies(target_string, result_list)
#builds graph from these dictionaries, also applies multiple filters
print('[a]', 'Building graph.\t('+topic_name+')')
graph = build_graph(node_freq_dict, edge_freq_dict)
for string in topic_name.split('_'):
if string in graph.nodes:
graph.remove_node(string)
stat_dict['nodes'] = len(graph.nodes)
stat_dict['edges'] = len(graph.edges)
#finds root hubs (senses) within the graph + more filters for these
print('[a]', 'Collecting root hubs.\t('+topic_name+')')
root_hub_list = root_hubs(graph, edge_freq_dict)
#adds sense inventory to buffer with some common neighbors for context
stat_dict['hubs'] = dict()
for root_hub in root_hub_list:
by_frequency = lambda node: edge_freq_dict[root_hub,node] \
if root_hub < node \
else edge_freq_dict[node, root_hub]
most_frequent_neighbor_list = sorted(graph.adj[root_hub],
key=by_frequency, reverse=True)
stat_dict['hubs'][root_hub] = most_frequent_neighbor_list[:6]
return graph, root_hub_list, stat_dict
##############################
# Propagation Disambiguation #
##############################
def colour_graph(graph: nx.Graph, root_hub_list: list) -> nx.Graph:
"""Colours graph accoring to root hubs.
def label_graph(graph: nx.Graph, root_hub_list: list) -> nx.Graph:
"""propagations graph accoring to root hubs.
Evolving network that colours neighboring nodes iterative. See sentiment
Evolving network that iteratively labels neighboring nodes. See sentiment
propagation.
Args:
@@ -560,7 +530,7 @@ def colour_graph(graph: nx.Graph, root_hub_list: list) -> nx.Graph:
root_hub_list: List of senses.
Returns:
Coloured graph.
Labelled graph.
"""
@@ -579,7 +549,7 @@ def colour_graph(graph: nx.Graph, root_hub_list: list) -> nx.Graph:
graph.node[node]['sense'] = None
max_iteration_count = config.max_colour_iteration_count
max_iteration_count = config.max_propagation_iteration_count
iteration_count = 0
stable = False
@@ -607,12 +577,12 @@ def colour_graph(graph: nx.Graph, root_hub_list: list) -> nx.Graph:
graph.node[node]['dist'].append(neighbor_weight_list)
old_colour = graph_copy.node[node]['sense']
new_colour = np.argmax(np.mean(graph.node[node]['dist'], axis=0))
old_propagation = graph_copy.node[node]['sense']
new_propagation = np.argmax(np.mean(graph.node[node]['dist'], axis=0))
if old_colour != new_colour:
if old_propagation != new_propagation:
stable = False
graph.node[node]['sense'] = new_colour
graph.node[node]['sense'] = new_propagation
else:
pass
@@ -626,12 +596,12 @@ def colour_graph(graph: nx.Graph, root_hub_list: list) -> nx.Graph:
graph.node[node]['dist'] = np.mean(graph.node[node]['dist'], axis=0)
return graph
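For intuition, here is a self-contained toy version of this style of propagation, using a plain majority vote over neighbour labels instead of the weighted distance distributions used in label_graph above; all names are hypothetical:

import networkx as nx

def propagate_labels(graph, seed_labels, max_iterations=100):
    """Iteratively gives each node the most common label among its neighbours.

    seed_labels maps some nodes (the root hubs) to fixed integer labels.
    """
    labels = dict(seed_labels)
    for _ in range(max_iterations):
        stable = True
        for node in graph.nodes:
            if node in seed_labels:
                continue  # root hubs keep their own label
            neighbour_labels = [labels[n] for n in graph.adj[node] if n in labels]
            if not neighbour_labels:
                continue  # no labelled neighbour yet
            new_label = max(set(neighbour_labels), key=neighbour_labels.count)
            if labels.get(node) != new_label:
                labels[node] = new_label
                stable = False
        if stable:
            break
    return labels

# Two seeds at the ends of a chain; the labels meet in the middle.
print(propagate_labels(nx.path_graph(5), {0: 0, 4: 1}))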
def disambiguate_propagation(graph: nx.Graph, root_hub_list: list, context_list: list) -> dict:
"""Clusters senses to root hubs using a labelled graph.
def disambiguate_colour(graph: nx.Graph, root_hub_list: list, context_list: list) -> dict:
"""Clusters senses to root hubs using a coloured graph.
This algorithm colours the graph using evolutionary graph theory
This algorithm propagates labels through the graph using evolutionary graph theory
and calculates scores for each root hub given a context based on this graph.
Args:
@@ -643,7 +613,7 @@ def disambiguate_colour(graph: nx.Graph, root_hub_list: list, context_list: list
A dictionary with root hub IDs as keys and context indices as values.
"""
coloured_graph = colour_graph(graph, root_hub_list)
labelled_graph = label_graph(graph, root_hub_list)
mapping_dict = {i:list() for i in range(1,len(root_hub_list)+1)}
@@ -667,11 +637,11 @@ def disambiguate_colour(graph: nx.Graph, root_hub_list: list, context_list: list
else:
text = token.text
if text in coloured_graph.nodes:
if text in labelled_graph.nodes:
text_colour_dist = coloured_graph.node[text]['dist']
text_propagation_dist = labelled_graph.node[text]['dist']
if not any(text_colour_dist):
if not any(text_propagation_dist):
pass
@@ -681,9 +651,9 @@ def disambiguate_colour(graph: nx.Graph, root_hub_list: list, context_list: list
root_hub_idx = root_hub_list.index(root_hub)
if nx.has_path(coloured_graph , text, root_hub):
if nx.has_path(labelled_graph, text, root_hub):
shortest_path = nx.shortest_path(coloured_graph ,
shortest_path = nx.shortest_path(labelled_graph,
text,
root_hub,
'weight')
@@ -693,10 +663,10 @@ def disambiguate_colour(graph: nx.Graph, root_hub_list: list, context_list: list
for i in range(1, len(shortest_path)):
sub_from, sub_to = shortest_path[i-1], shortest_path[i]
total_weight += \
coloured_graph[sub_from][sub_to]['weight']
labelled_graph[sub_from][sub_to]['weight']
score[root_hub_idx] += (1/(1+total_weight)) \
* coloured_graph.node[text]['dist'][root_hub_idx]
* labelled_graph.node[text]['dist'][root_hub_idx]
else:
@@ -717,6 +687,88 @@ def disambiguate_colour(graph: nx.Graph, root_hub_list: list, context_list: list
return mapping_dict
##############################
# MST Disambiguation #
##############################
def components(graph: nx.Graph, root_hub_list: list, target_string: str) -> nx.Graph:
"""Builds minimum spanning tree from graph and removes singletons.
Applies the components algorithm from Véronis (2004) and removes singletons.
Args:
graph: Undirected weighted graph.
root_hub_list: List of strings of root hubs of graph.
target_string: Root of minimum spanning tree.
Returns:
Minimum spanning tree with target as root and root hubs as direct
children. Singletons removed.
"""
graph_copy = deepcopy(graph)
graph_copy.add_node(target_string)
for root_hub in root_hub_list:
graph_copy.add_edge(target_string,root_hub,weight=0)
minimum_spanning_tree = nx.minimum_spanning_tree(graph_copy)
return minimum_spanning_tree
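A usage sketch for the function above, with invented toy nodes and weights; the zero-weight edges guarantee that every root hub attaches directly to the artificial target root:

import networkx as nx

# Toy cooccurrence graph; all values are invented for illustration.
toy_graph = nx.Graph()
toy_graph.add_edge('money', 'loan', weight=0.1)
toy_graph.add_edge('bank', 'money', weight=0.2)
toy_graph.add_edge('bank', 'river', weight=0.3)
toy_graph.add_edge('river', 'water', weight=0.4)

mst = components(toy_graph, ['money', 'river'], 'bank_target')
# Both hubs hang directly below 'bank_target' through their zero-weight edges.
print(sorted(mst.edges(data='weight')))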
def score(graph: nx.Graph, component: str, root_hub_list: list) -> np.array:
"""Calculate score for a given component in a minimum spanning tree.
First the correct root for the component is chosen. If no root hub is
suitable, an empty array is returned. A score is calculated for the distance
of the component and its root and returned as part of an array filled with
zeroes.
Args:
graph: Minimum spanning tree.
component: Node (string) from which the distances are to be calculated.
root_hub_list: List of strings of root hubs (senses) of original graph.
Returns:
Array with one score at the index of the chosen root hub, zero elsewhere.
"""
root_hub_count = len(root_hub_list)
#Initialise score array.
score_array = np.zeros(root_hub_count)
# Find root of component.
distance_list = list()
for root_hub in root_hub_list:
if nx.has_path(graph, component, root_hub):
distance_list.append(1/(1+len(nx.shortest_path(graph, component, root_hub))))
else:
distance_list.append(0)
if sum(distance_list) == 0:
return score_array
root_idx = np.argmax(distance_list)
root = root_hub_list[root_idx]
shortest_path = nx.shortest_path(graph, component, root, 'weight')
total_weight = 0
# Add weights of every sub-path.
for i in range(1, len(shortest_path)):
sub_from, sub_to = shortest_path[i-1], shortest_path[i]
total_weight += graph[sub_from][sub_to]['weight']
score_array = np.zeros(root_hub_count)
score_array[root_idx] = 1/(1+total_weight)
return score_array
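Continuing the toy example from components above: 'water' reaches the 'river' hub over a single edge of weight 0.4, so its score of 1/(1+0.4) lands at the index of 'river':

scores = score(mst, 'water', ['money', 'river'])
print(scores)  # approximately [0. 0.714]; the 'money' entry stays zero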
def disambiguate_mst(graph: nx.Graph, root_hub_list: list,
context_list: list, topic_name: str) -> dict:
"""Matches contexts to senses.
@@ -804,53 +856,11 @@ def disambiguate_mst(graph: nx.Graph, root_hub_list: list,
return mapping_dict
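The body of disambiguate_mst is collapsed in this view; presumably each context ends up with one score array over the root hubs and is assigned to the argmax, mirroring the mapping_dict convention above (keys 1..n). A minimal sketch of that final matching step, with hypothetical names:

import numpy as np

def match_contexts(score_arrays, root_hub_count):
    """Assigns each context index to the root hub with the highest score."""
    mapping_dict = {hub_id: list() for hub_id in range(1, root_hub_count + 1)}
    for context_idx, score_array in enumerate(score_arrays):
        if any(score_array):  # skip contexts that reach no root hub
            mapping_dict[int(np.argmax(score_array)) + 1].append(context_idx)
    return mapping_dict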
def print_stats(stat_dict: dict) -> None:
"""Prints various statistics and logs them to file.
Args:
stat_dict: Dictionary with various statistics.
"""
stat_string = []
ts = time.gmtime()
key_list= ['target','nodes','edges','L','C','L_rand','C_rand','clusters','a_mean_size','h_mean_size','pipe_gain']
stat_string.append('Topic: {}.'.format(stat_dict['target']))
stat_string.append('Processed {} at {}.'.format(time.strftime("%Y-%m-%d", ts),time.strftime("%H:%M:%S", ts)))
stat_string.append('Nodes: {}\tEdges: {}.'.format(stat_dict['nodes'],stat_dict['edges']))
stat_string.append('Characteristic path length: {}.'.format(stat_dict['L']))
stat_string.append('Global clustering coefficient: {}.'.format(stat_dict['C']))
stat_string.append('Mean cluster length (arithmetic): {}.'.format(stat_dict['a_mean_size']))
stat_string.append('Mean cluster length (harmonic): {}.'.format(stat_dict['h_mean_size']))
stat_string.append('Number of clusters: {}.'.format(stat_dict['clusters']))
stat_string.append('Tuples gained through merging: {}.'.format(stat_dict['pipe_gain']))
stat_string.append('Sense inventory:')
for hub in stat_dict['hubs'].keys():
stat_string.append(' -> {}: {}.'.format(hub, ", ".join(stat_dict['hubs'][hub])))
print('\n[A] '+'\n[A] '.join(stat_string)+'\n')
with open('statistics.txt', 'a') as stat_file:
stat_file.write('\n '.join(stat_string)+'\n\n')
write_header = not os.path.exists('.statistics.tsv')
with open('.statistics.tsv', 'a') as stat_file:
if write_header:
stat_file.write('\t'.join(key_list)+'\n')
stat_file.write('\t'.join([str(stat_dict[key]) for key in key_list])+'\n')
##############################
# Statistics #
##############################
def global_clustering_coefficient(graph: nx.Graph) -> float:
"""Calculates global clustering coefficient from graph.
@@ -918,6 +928,56 @@ def characteristic_path_length(graph: nx.Graph) -> float:
return np.mean(path_length_list)
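The statistics bodies are largely collapsed in this view; for cross-checking, networkx ships reference implementations of both measures. Note that "global clustering coefficient" can mean either transitivity or the mean local coefficient, and the collapsed code could implement either:

import networkx as nx

toy = nx.karate_club_graph()  # small built-in test graph

# Ratio of closed triplets to all triplets (transitivity).
print(nx.transitivity(toy))
# Mean of the local clustering coefficients.
print(nx.average_clustering(toy))

# Characteristic path length: mean shortest-path length over all node pairs,
# defined only for a connected graph.
if nx.is_connected(toy):
    print(nx.average_shortest_path_length(toy))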
def print_stats(stat_dict: dict) -> None:
"""Prints various statistics and logs them to file.
Args:
stat_dict: Dictionary with various statistics.
"""
stat_string = []
ts = time.gmtime()
key_list = ['target', 'nodes', 'edges', 'L', 'C', 'L_rand', 'C_rand', 'clusters', 'a_mean_size', 'h_mean_size', 'pipe_gain']
stat_string.append('Topic: {}.'.format(stat_dict['target']))
stat_string.append('Processed {} at {}.'.format(time.strftime("%Y-%m-%d", ts),time.strftime("%H:%M:%S", ts)))
stat_string.append('Nodes: {}\tEdges: {}.'.format(stat_dict['nodes'],stat_dict['edges']))
stat_string.append('Characteristic path length: {}.'.format(stat_dict['L']))
stat_string.append('Global clustering coefficient: {}.'.format(stat_dict['C']))
stat_string.append('Mean cluster size (arithmetic): {}.'.format(stat_dict['a_mean_size']))
stat_string.append('Mean cluster size (harmonic): {}.'.format(stat_dict['h_mean_size']))
stat_string.append('Number of clusters: {}.'.format(stat_dict['clusters']))
stat_string.append('Tuples gained through merging: {}.'.format(stat_dict['pipe_gain']))
stat_string.append('Sense inventory:')
for hub in stat_dict['hubs'].keys():
stat_string.append(' -> {}: {}.'.format(hub, ", ".join(stat_dict['hubs'][hub])))
print('\n[A] '+'\n[A] '.join(stat_string)+'\n')
with open('statistics.txt', 'a') as stat_file:
stat_file.write('\n '.join(stat_string)+'\n\n')
write_header = not os.path.exists('.statistics.tsv')
with open('.statistics.tsv', 'a') as stat_file:
if write_header:
stat_file.write('\t'.join(key_list)+'\n')
stat_file.write('\t'.join([str(stat_dict[key]) for key in key_list])+'\n')
##############################
# main #
##############################
def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
"""Calls induction and disambiguation functions, performs main task.
@@ -955,7 +1015,7 @@ def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
stat_dict['C_rand'] = 2 * mean_degree/node_count
colour_rank = config.colour_rank
propagation_rank = config.colour_rank
mst_rank = config.mst_rank
# Merges mappings according to the pipeline.
@@ -963,10 +1023,10 @@
# Matches senses to clusters.
print('[a]', 'Disambiguating results.\t('+topic_name+')')
if colour_rank != 0:
if propagation_rank != 0:
print('[a]', 'Colouring graph.\t('+topic_name+')')
mapping_dict[colour_rank] = disambiguate_colour(graph, root_hub_list,
print('[a]', 'Propagating through graph.\t('+topic_name+')')
mapping_dict[propagation_rank] = disambiguate_propagation(graph, root_hub_list,
result_dict[topic_id])
if mst_rank != 0: