diff --git a/src/absinth.py b/src/absinth.py
index 69b4dc56d3a0d0b2c07c04f45e049a3f01795d9d..f682c7022e4f580c7c2ab8e31affee35568c124e 100644
--- a/src/absinth.py
+++ b/src/absinth.py
@@ -7,7 +7,7 @@ matches a list of contexts to each. The method to achieve this is a modified
 reimplementation of Véronis' Hyperlex (2004).
 
 Example:
-    The function can be called with the following command.:
+    The function can be called with the following command:
 
         $ python3 absinth.py
 
@@ -23,9 +23,15 @@ Modifiers:
 
 """
 
+
+##########################
+#      Dependencies      #
+##########################
+
 import sys
 print('[a] Loading ' + sys.argv[0] + '.\n')
 import config
+import json
 import networkx as nx # for visualisation
 import numpy as np
 import os # for reading files
@@ -42,6 +48,9 @@ from scipy import stats
 
 nlp = spacy.load('en') # standard english nlp
 
+##########################
+#     Preprocessing      #
+##########################
 
 def read_dataset(data_path: str) -> (dict, dict):
     """Collects topics.txt and results.txt.
@@ -85,6 +94,102 @@ def read_dataset(data_path: str) -> (dict, dict):
     return results, topics
 
 
+##########################
+#       Induction        #
+##########################
+
+
+def induce(topic_name: str, result_list: list) -> (nx.Graph, list, dict):
+    """Induces word senses for a given topic from corpus.
+
+    Counts frequencies from corpus and search result list, builds graph from
+    these counts (with some filters). Root hubs (senses) are collected from
+    this graph.
+
+    Args:
+        topic_name: Target string.
+        result_list: List of search result (context) strings.
+
+    Returns:
+        A cooccurrence graph,
+        a list of root hub strings (senses)
+        and a dictionary of various statistics.
+    """
+
+    stat_dict = dict()
+
+    stat_dict['target'] = topic_name
+
+    #In topics longer than two words, the leading 'the' can generally be removed without changing the sense.
+    if topic_name[:4] == 'the_' and topic_name.count('_') > 1:
+
+        target_string = topic_name[4:]
+
+    else:
+
+        target_string = topic_name
+
+    print('[a]', 'Counting nodes and edges.\t('+topic_name+')')
+
+    #Check if frequencies were already counted before.
+
+    node_dict_name = topic_name+'_node.json'
+    edge_dict_name = topic_name+'_edge.json'
+
+    graph_in_existence = False
+    for graph_name in os.listdir(config.graph):
+
+        if topic_name in graph_name:
+
+            graph_in_existence = True
+
+            with open(node_dict_name, 'r') as node_file, open(edge_dict_name, 'r') as edge_file:
+
+                node_freq_dict = json.load(node_file)
+                edge_freq_dict = {tuple(key): count for key, count in json.load(edge_file)}
+
+            break
+
+    if not graph_in_existence:
+
+        node_freq_dict, edge_freq_dict = frequencies(target_string, result_list)
+
+        with open(node_dict_name, 'w') as node_file, open(edge_dict_name, 'w') as edge_file:
+            json.dump(node_freq_dict, node_file)
+            json.dump(list(edge_freq_dict.items()), edge_file) #JSON forbids tuple keys
+
+    #builds graph from these dictionaries, also applies multiple filters
+    print('[a]', 'Building graph.\t('+topic_name+')')
+    graph = build_graph(node_freq_dict, edge_freq_dict)
+
+    for string in topic_name.split('_'):
+        if string in graph.nodes:
+            graph.remove_node(string)
+
+    stat_dict['nodes'] = len(graph.nodes)
+    stat_dict['edges'] = len(graph.edges)
+
+    #finds root hubs (senses) within the graph + more filters for these
+    print('[a]', 'Collecting root hubs.\t('+topic_name+')')
+    root_hub_list = root_hubs(graph, edge_freq_dict)
+
+    #adds sense inventory to buffer with some common neighbors for context
+    stat_dict['hubs'] = dict()
+
+    for root_hub in root_hub_list:
+
+        by_frequency = lambda node: edge_freq_dict[root_hub, node] \
+                                    if root_hub < node \
+                                    else edge_freq_dict[node, root_hub]
+
+        most_frequent_neighbor_list = sorted(graph.adj[root_hub],
+                                             key=by_frequency, reverse=True)
+
+        stat_dict['hubs'][root_hub] = most_frequent_neighbor_list[:6]
+
+    return graph, root_hub_list, stat_dict
+
+
 def frequencies(target_string: str, search_result_list: list) -> (dict, dict):
     """Counts occurrences of nodes and cooccurrences.
@@ -408,151 +513,16 @@ def root_hubs(graph: nx.Graph, edge_freq_dict: dict) -> list:
     return hub_list
 
 
-def components(graph: nx.Graph, root_hub_list: list, target_string: str) -> nx.Graph:
-    """Builds minimum spanning tree from graph and removes singletons.
-
-    Applies components algorithm from Véronis (2004) and removes singletons.
-
-    Args:
-        graph: Undirected weighted graph.
-        root_hub_list: List of strings of root hubs of graph.
-        target_string: Root of minimum spanning tree.
-
-    Returns:
-        Minimum spanning tree with target as root and root hubs as direct
-        children. Singletons removed.
-    """
-
-    graph_copy = deepcopy(graph)
-
-    graph_copy.add_node(target_string)
-    for root_hub in root_hub_list:
-        graph_copy.add_edge(target_string,root_hub,weight=0)
-
-    minimum_spanning_tree = nx.minimum_spanning_tree(graph_copy)
-
-    return minimum_spanning_tree
-
-
-def score(graph: nx.Graph, component: str, root_hub_list: list) -> np.array:
-    """Calculate score for a given component in a minimum spanning tree.
-
-    First the correct root for the component is chosen. If no root hub is
-    suitable, an empty array is returned. A score is calculated for the distance
-    of the component and its root and returned as part of an array filled with
-    zeroes.
-
-    Args:
-        graph: Minimum spanning tree.
-        component: Node (string) from which the distances are to be calculated.
-        root_hub_list: List of strings of root hubs (senses) of original graph.
-
-    Returns:
-        Array with one score for the correct root hub and filled with zeroes.
-    """
-
-    root_hub_count = len(root_hub_list)
-
-    #Initialise score array.
-    score_array = np.zeros(root_hub_count)
-
-    # Find root of component.
-    distance_list = list()
-    for root_hub in root_hub_list:
-        if nx.has_path(graph, component, root_hub):
-            distance_list.append(1/(1+len(nx.shortest_path(graph, component, root_hub))))
-        else:
-            distance_list.append(0)
-
-    if sum(distance_list) == 0:
-        return score_array
-
-    root_idx = np.argmax(distance_list)
-    root = root_hub_list[root_idx]
-
-    shortest_path = nx.shortest_path(graph, component, root, 'weight')
-    total_weight = 0
-
-    # Add weights of every sub-path.
-    for i in range(1, len(shortest_path)):
-        sub_from, sub_to = shortest_path[i-1], shortest_path[i]
-        total_weight += graph[sub_from][sub_to]['weight']
-
-    score_array = np.zeros(root_hub_count)
-    score_array[root_idx] = 1/(1+total_weight)
-
-    return score_array
-
-
-def induce(topic_name: str, result_list: list) -> (nx.Graph, list, dict):
-    """Induces word senses for a given topic from corpus.
-
-    Counts frequencies from corpus and search result list, builds graph from
-    these counts (with some filters). Root hubs (senses) are collected from
-    this graph.
-
-    Args:
-        topic_name: Target string.
-        result_list: List of search result (context) strings.
-
-    Returns:
-        A cooccurrence graph,
-        a list of root hub strings (senses)
-        and dictionary of various statistics.
-    """
-
-    stat_dict = dict()
-
-    stat_dict['target'] = topic_name
-
-    #in topics longer than two words, the leading 'the' can generally be removed without changing the sense
-    if topic_name[:4] == 'the_' and topic_name.count('_') > 1:
-
-        target_string = topic_name[4:]
-
-    else:
-
-        target_string = topic_name
-
-    print('[a]', 'Counting nodes and edges.\t('+topic_name+')')
-    node_freq_dict, edge_freq_dict = frequencies(target_string, result_list)
-
-    #builds graph from these dictionaries, also applies multiple filters
-    print('[a]', 'Building graph.\t('+topic_name+')')
-    graph = build_graph(node_freq_dict, edge_freq_dict)
-
-    for string in topic_name.split('_'):
-        if string in graph.nodes:
-            graph.remove_node(string)
-
-    stat_dict['nodes'] = len(graph.nodes)
-    stat_dict['edges'] = len(graph.edges)
-
-    #finds root hubs (senses) within the graph + more filters for these
-    print('[a]', 'Collecting root hubs.\t('+topic_name+')')
-    root_hub_list = root_hubs(graph, edge_freq_dict)
-
-    #adds sense inventory to buffer with some common neighbors for context
-    stat_dict['hubs'] = dict()
-
-    for root_hub in root_hub_list:
-
-        by_frequency = lambda node: edge_freq_dict[root_hub,node] \
-                                    if root_hub < node \
-                                    else edge_freq_dict[node, root_hub]
-
-        most_frequent_neighbor_list = sorted(graph.adj[root_hub],
-                                             key=by_frequency, reverse=True)
-
-        stat_dict['hubs'][root_hub] = most_frequent_neighbor_list[:6]
-
-    return graph, root_hub_list, stat_dict
+
+
+##############################
+# Propagation Disambiguation #
+##############################
 
 
-def colour_graph(graph: nx.Graph, root_hub_list: list) -> nx.Graph:
-    """Colours graph accoring to root hubs.
+def label_graph(graph: nx.Graph, root_hub_list: list) -> nx.Graph:
+    """Labels graph according to root hubs.
 
-    Evolving network that colours neighboring nodes iterative. See sentiment
+    Evolving network that iteratively labels neighboring nodes. See sentiment
     propagation.
 
     Args:
@@ -560,7 +530,7 @@ def colour_graph(graph: nx.Graph, root_hub_list: list) -> nx.Graph:
         root_hub_list: List of senses.
 
     Returns:
-        Coloured graph.
+        Labelled graph.
""" @@ -579,7 +549,7 @@ def colour_graph(graph: nx.Graph, root_hub_list: list) -> nx.Graph: graph.node[node]['sense'] = None - max_iteration_count = config.max_colour_iteration_count + max_iteration_count = config.max_propagation_iteration_count iteration_count = 0 stable = False @@ -607,12 +577,12 @@ def colour_graph(graph: nx.Graph, root_hub_list: list) -> nx.Graph: graph.node[node]['dist'].append(neighbor_weight_list) - old_colour = graph_copy.node[node]['sense'] - new_colour = np.argmax(np.mean(graph.node[node]['dist'], axis=0)) + old_propagation = graph_copy.node[node]['sense'] + new_propagation = np.argmax(np.mean(graph.node[node]['dist'], axis=0)) - if old_colour != new_colour: + if old_propagation != new_propagation: stable = False - graph.node[node]['sense'] = new_colour + graph.node[node]['sense'] = new_propagation else: pass @@ -626,12 +596,12 @@ def colour_graph(graph: nx.Graph, root_hub_list: list) -> nx.Graph: graph.node[node]['dist'] = np.mean(graph.node[node]['dist'], axis=0) return graph + + +def disambiguate_propagation(graph: nx.Graph, root_hub_list: list, context_list: list) -> dict: + """Clusters senses to root hubs using a labelled graph. - -def disambiguate_colour(graph: nx.Graph, root_hub_list: list, context_list: list) -> dict: - """Clusters senses to root hubs using a coloured graph. - - This algorithm colours the graph using evolutionary graph theory + This algorithm propagations the graph using evolutionary graph theory and calculates scores for each root hub given a context based on this graph. Args: @@ -643,7 +613,7 @@ def disambiguate_colour(graph: nx.Graph, root_hub_list: list, context_list: list A dictionary with root hub IDs as keys and context indices as values. """ - coloured_graph = colour_graph(graph, root_hub_list) + labelled_graph = label_graph(graph, root_hub_list) mapping_dict = {i:list() for i in range(1,len(root_hub_list)+1)} @@ -667,11 +637,11 @@ def disambiguate_colour(graph: nx.Graph, root_hub_list: list, context_list: list else: text = token.text - if text in coloured_graph.nodes: + if text in labelled_graph.nodes: - text_colour_dist = coloured_graph.node[text]['dist'] + text_propagation_dist = labelled_graph.node[text]['dist'] - if not any(text_colour_dist): + if not any(text_propagation_dist): pass @@ -681,9 +651,9 @@ def disambiguate_colour(graph: nx.Graph, root_hub_list: list, context_list: list root_hub_idx = root_hub_list.index(root_hub) - if nx.has_path(coloured_graph , text, root_hub): + if nx.has_path(labelled_graph , text, root_hub): - shortest_path = nx.shortest_path(coloured_graph , + shortest_path = nx.shortest_path(labelled_graph , text, root_hub, 'weight') @@ -693,10 +663,10 @@ def disambiguate_colour(graph: nx.Graph, root_hub_list: list, context_list: list for i in range(1, len(shortest_path)): sub_from, sub_to = shortest_path[i-1], shortest_path[i] total_weight += \ - coloured_graph[sub_from][sub_to]['weight'] + labelled_graph[sub_from][sub_to]['weight'] score[root_hub_idx] += (1/(1+total_weight)) \ - * coloured_graph.node[text]['dist'][root_hub_idx] + * labelled_graph.node[text]['dist'][root_hub_idx] else: @@ -717,6 +687,88 @@ def disambiguate_colour(graph: nx.Graph, root_hub_list: list, context_list: list return mapping_dict + +############################## +# MST Disambiguation # +############################## + + +def components(graph: nx.Graph, root_hub_list: list, target_string: str) -> nx.Graph: + """Builds minimum spanning tree from graph and removes singletons. 
+
+    Applies components algorithm from Véronis (2004) and removes singletons.
+
+    Args:
+        graph: Undirected weighted graph.
+        root_hub_list: List of strings of root hubs of graph.
+        target_string: Root of minimum spanning tree.
+
+    Returns:
+        Minimum spanning tree with target as root and root hubs as direct
+        children. Singletons removed.
+    """
+
+    graph_copy = deepcopy(graph)
+
+    graph_copy.add_node(target_string)
+    for root_hub in root_hub_list:
+        graph_copy.add_edge(target_string, root_hub, weight=0)
+
+    minimum_spanning_tree = nx.minimum_spanning_tree(graph_copy)
+
+    return minimum_spanning_tree
+
+
+def score(graph: nx.Graph, component: str, root_hub_list: list) -> np.array:
+    """Calculate score for a given component in a minimum spanning tree.
+
+    First the correct root for the component is chosen. If no root hub is
+    reachable, an array of zeroes is returned. A score is calculated for the
+    distance of the component and its root and returned as part of an array
+    filled with zeroes.
+
+    Args:
+        graph: Minimum spanning tree.
+        component: Node (string) from which the distances are to be calculated.
+        root_hub_list: List of strings of root hubs (senses) of original graph.
+
+    Returns:
+        Array with one score for the correct root hub and filled with zeroes.
+    """
+
+    root_hub_count = len(root_hub_list)
+
+    #Initialise score array.
+    score_array = np.zeros(root_hub_count)
+
+    # Find root of component.
+    distance_list = list()
+    for root_hub in root_hub_list:
+        if nx.has_path(graph, component, root_hub):
+            distance_list.append(1/(1+len(nx.shortest_path(graph, component, root_hub))))
+        else:
+            distance_list.append(0)
+
+    if sum(distance_list) == 0:
+        return score_array
+
+    root_idx = np.argmax(distance_list)
+    root = root_hub_list[root_idx]
+
+    shortest_path = nx.shortest_path(graph, component, root, 'weight')
+    total_weight = 0
+
+    # Add weights of every sub-path.
+    for i in range(1, len(shortest_path)):
+        sub_from, sub_to = shortest_path[i-1], shortest_path[i]
+        total_weight += graph[sub_from][sub_to]['weight']
+
+    score_array = np.zeros(root_hub_count)
+    score_array[root_idx] = 1/(1+total_weight)
+
+    return score_array
+
+
 def disambiguate_mst(graph: nx.Graph, root_hub_list: list,
                      context_list: list, topic_name: str) -> dict:
     """Matches contexts to senses.
@@ -804,53 +856,11 @@ def disambiguate_mst(graph: nx.Graph, root_hub_list: list,
     return mapping_dict
 
 
-def print_stats(stat_dict: dict) -> None:
-    """Prints various statistics and logs them to file.
-
-    Args:
-        stat_dict: Dictionary with various statistics.
-
-    """
-
-    stat_string = []
-
-    ts = time.gmtime()
-
-    key_list= ['target','nodes','edges','L','C','L_rand','C_rand','clusters','a_mean_size','h_mean_size','pipe_gain']
-
-    stat_string.append('Topic: {}.'.format(stat_dict['target']))
-    stat_string.append('Processed {} at {}.'.format(time.strftime("%Y-%m-%d", ts),time.strftime("%H:%M:%S", ts)))
-    stat_string.append('Nodes: {}\tEdges: {}.'.format(stat_dict['nodes'],stat_dict['edges']))
-    stat_string.append('Characteristic path length: {}.'.format(stat_dict['L']))
-    stat_string.append('Global clustering coefficient: {}.'.format(stat_dict['C']))
-    stat_string.append('Mean cluster length (arithmetic): {}.'.format(stat_dict['a_mean_size']))
-    stat_string.append('Mean cluster length (harmonic): {}.'.format(stat_dict['h_mean_size']))
-    stat_string.append('Number of clusters: {}.'.format(stat_dict['clusters']))
-    stat_string.append('Tuples gained through merging: {}.'.format(stat_dict['pipe_gain']))
-    stat_string.append('Sense inventory:')
-    for hub in stat_dict['hubs'].keys():
-        stat_string.append('   -> {}: {}.'.format(hub, ", ".join(stat_dict['hubs'][hub])))
-
-    print('\n[A] '+'\n[A] '.join(stat_string)+'\n')
-
-    with open('statistics.txt', 'a') as stat_file:
-
-        stat_file.write('\n '.join(stat_string)+'\n\n')
-
-    write_header = not os.path.exists('.statistics.tsv')
-
-    with open('.statistics.tsv', 'a') as stat_file:
-
-        if write_header:
-
-            stat_file.write('\t'.join(key_list)+'\n')
-
-        stat_file.write('\t'.join([str(stat_dict[key]) for key in key_list])+'\n')
-
-
-
-
-
+
+##############################
+#         Statistics         #
+##############################
+
 
 def global_clustering_coefficient(graph: nx.Graph) -> float:
     """Calculates global clustering coefficient from graph.
@@ -918,6 +928,56 @@ def characteristic_path_length(graph: nx.Graph) -> float:
     return np.mean(path_length_list)
 
 
+def print_stats(stat_dict: dict) -> None:
+    """Prints various statistics and logs them to file.
+
+    Args:
+        stat_dict: Dictionary with various statistics.
+
+    """
+
+    stat_string = []
+
+    ts = time.gmtime()
+
+    key_list = ['target','nodes','edges','L','C','L_rand','C_rand','clusters','a_mean_size','h_mean_size','pipe_gain']
+
+    stat_string.append('Topic: {}.'.format(stat_dict['target']))
+    stat_string.append('Processed {} at {}.'.format(time.strftime("%Y-%m-%d", ts),time.strftime("%H:%M:%S", ts)))
+    stat_string.append('Nodes: {}\tEdges: {}.'.format(stat_dict['nodes'],stat_dict['edges']))
+    stat_string.append('Characteristic path length: {}.'.format(stat_dict['L']))
+    stat_string.append('Global clustering coefficient: {}.'.format(stat_dict['C']))
+    stat_string.append('Mean cluster length (arithmetic): {}.'.format(stat_dict['a_mean_size']))
+    stat_string.append('Mean cluster length (harmonic): {}.'.format(stat_dict['h_mean_size']))
+    stat_string.append('Number of clusters: {}.'.format(stat_dict['clusters']))
+    stat_string.append('Tuples gained through merging: {}.'.format(stat_dict['pipe_gain']))
+    stat_string.append('Sense inventory:')
+    for hub in stat_dict['hubs'].keys():
+        stat_string.append('   -> {}: {}.'.format(hub, ", ".join(stat_dict['hubs'][hub])))
+
+    print('\n[A] '+'\n[A] '.join(stat_string)+'\n')
+
+    with open('statistics.txt', 'a') as stat_file:
+
+        stat_file.write('\n '.join(stat_string)+'\n\n')
+
+    write_header = not os.path.exists('.statistics.tsv')
+
+    with open('.statistics.tsv', 'a') as stat_file:
+
+        if write_header:
+
+            stat_file.write('\t'.join(key_list)+'\n')
+
+        stat_file.write('\t'.join([str(stat_dict[key]) for key in key_list])+'\n')
+
+
+
+##############################
+#            main            #
+##############################
+
+
 def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
     """Calls induction and disambiguation functions, performs main task.
 
@@ -955,7 +1015,7 @@ def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
 
     stat_dict['C_rand'] = 2 * mean_degree/node_count
 
-    colour_rank = config.colour_rank
+    propagation_rank = config.propagation_rank
     mst_rank = config.mst_rank
 
     #Merges Mappings according to pipeline
@@ -963,10 +1023,10 @@ def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
 
     #matches senses to clusters
     print('[a]', 'Disambiguating results.\t('+topic_name+')')
-    if colour_rank != 0:
+    if propagation_rank != 0:
 
-        print('[a]', 'Colouring graph.\t('+topic_name+')')
-        mapping_dict[colour_rank] = disambiguate_colour(graph, root_hub_list,
+        print('[a]', 'Propagating through graph.\t('+topic_name+')')
+        mapping_dict[propagation_rank] = disambiguate_propagation(graph, root_hub_list,
                                                   result_dict[topic_id])
 
     if mst_rank != 0:
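
The caching block added to induce() round-trips the two frequency dictionaries through JSON. One constraint to keep in mind: JSON objects only allow string keys, while edge_freq_dict is keyed by node-pair tuples (see the by_frequency lambda), so edge counts have to be stored as [key, count] pairs and the tuple keys rebuilt on load. A minimal self-contained sketch of the pattern; cached_frequencies, compute_fn and cache_dir are illustrative names, not code from absinth.py:

import json
import os

def cached_frequencies(topic_name, compute_fn, cache_dir='.'):
    """Load node/edge frequency dicts from JSON if cached, else compute and save."""
    node_path = os.path.join(cache_dir, topic_name + '_node.json')
    edge_path = os.path.join(cache_dir, topic_name + '_edge.json')

    if os.path.exists(node_path) and os.path.exists(edge_path):
        with open(node_path) as node_file, open(edge_path) as edge_file:
            node_freq_dict = json.load(node_file)
            # Edge counts were dumped as [[node_a, node_b], count] pairs
            # because JSON cannot represent tuple keys.
            edge_freq_dict = {tuple(key): count
                              for key, count in json.load(edge_file)}
        return node_freq_dict, edge_freq_dict

    node_freq_dict, edge_freq_dict = compute_fn()
    with open(node_path, 'w') as node_file, open(edge_path, 'w') as edge_file:
        json.dump(node_freq_dict, node_file)
        json.dump(list(edge_freq_dict.items()), edge_file)
    return node_freq_dict, edge_freq_dict

Testing for the JSON files themselves, as above, also sidesteps the mismatch in the diff, where the existence check scans config.graph while the JSON files are read from and written to the working directory.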
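The by_frequency lambda in induce() relies on each undirected edge being counted exactly once, keyed with the lexicographically smaller node first. A sketch of that convention; edge_key is a hypothetical helper, not part of absinth.py:

def edge_key(node_a, node_b):
    """Canonical key for an undirected edge: smaller node name first."""
    return (node_a, node_b) if node_a < node_b else (node_b, node_a)

edge_freq_dict = {}
for node_a, node_b in [('ant', 'bee'), ('bee', 'ant'), ('ant', 'cat')]:
    key = edge_key(node_a, node_b)
    edge_freq_dict[key] = edge_freq_dict.get(key, 0) + 1

assert edge_freq_dict[('ant', 'bee')] == 2  # both orientations map to one key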
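label_graph() pins a one-hot sense distribution on each root hub and repeatedly folds neighbor distributions into every other node until no argmax changes or config.max_propagation_iteration_count is reached. Below is a compact sketch of that fixed-point loop on a toy graph; the exact neighbor weighting in absinth.py is more involved, so the (1 - weight) factor is an assumption based on Hyperlex-style edge weights, where a small weight means a strong association:

import networkx as nx
import numpy as np

def propagate_senses(graph, root_hub_list, max_iteration_count=50):
    """Spread one-hot root-hub labels over the graph by weighted averaging."""
    hub_count = len(root_hub_list)
    dist = {node: np.zeros(hub_count) for node in graph.nodes}
    for index, root_hub in enumerate(root_hub_list):
        dist[root_hub][index] = 1.0

    for _ in range(max_iteration_count):
        stable = True
        new_dist = {}
        for node in graph.nodes:
            if node in root_hub_list or not graph.adj[node]:
                new_dist[node] = dist[node]  # root hubs keep their fixed label
                continue
            # Strong (low-weight) edges contribute more of their neighbor's label.
            updated = np.mean([dist[neighbor] * (1 - graph[node][neighbor]['weight'])
                               for neighbor in graph.adj[node]], axis=0)
            if np.argmax(updated) != np.argmax(dist[node]):
                stable = False
            new_dist[node] = updated
        dist = new_dist
        if stable:
            break

    return dist

toy = nx.Graph()
toy.add_edge('bank', 'money', weight=0.2)
toy.add_edge('bank', 'river', weight=0.3)
toy.add_edge('money', 'loan', weight=0.1)
print(propagate_senses(toy, ['money', 'river']))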
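On the MST side, components() wires the target above every root hub with zero-weight edges before taking the minimum spanning tree, which forces the hubs to sit directly under the target; score() then credits a node to its nearest hub with 1/(1 + total path weight). A sketch of that arithmetic on a tiny tree; mst_score mirrors score() in spirit, and the node names and weights are illustrative only:

import networkx as nx
import numpy as np

tree = nx.Graph()
tree.add_edge('target', 'hub_a', weight=0)  # zero-weight edges as in components()
tree.add_edge('target', 'hub_b', weight=0)
tree.add_edge('hub_a', 'teller', weight=0.4)
tree.add_edge('hub_b', 'shore', weight=0.7)

def mst_score(tree, node, root_hub_list):
    """One-hot array: 1/(1 + path weight) at the index of the nearest hub."""
    score_array = np.zeros(len(root_hub_list))
    # Hop count stands in for closeness when picking the hub, as in score().
    closeness = [1 / (1 + len(nx.shortest_path(tree, node, hub)))
                 if nx.has_path(tree, node, hub) else 0
                 for hub in root_hub_list]
    if sum(closeness) == 0:
        return score_array
    root_idx = int(np.argmax(closeness))
    total_weight = nx.shortest_path_length(tree, node, root_hub_list[root_idx],
                                           weight='weight')
    score_array[root_idx] = 1 / (1 + total_weight)
    return score_array

print(mst_score(tree, 'teller', ['hub_a', 'hub_b']))  # ~[0.714, 0.]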
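The Statistics section groups the two Watts-Strogatz small-world indicators, global clustering coefficient C and characteristic path length L, which main() compares against random-graph baselines (L_rand, C_rand). networkx ships reference implementations that can double as a cross-check on the hand-rolled versions; the karate club graph here is only a stand-in for a cooccurrence graph:

import networkx as nx

graph = nx.karate_club_graph()

# Global clustering coefficient: 3 * triangles / connected triples.
print('C =', nx.transitivity(graph))

# Characteristic path length: mean shortest-path length over node pairs.
# (Defined on connected graphs; a cooccurrence graph may need to be
# restricted to its largest component first.)
print('L =', nx.average_shortest_path_length(graph))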