diff --git a/src/absinth.py b/src/absinth.py
index 06b74ade9f17363718f20cf8cd368bad5dde7411..294e55c4bc47be478f21cc7ff596ef1f52b525b7 100644
--- a/src/absinth.py
+++ b/src/absinth.py
@@ -24,13 +24,14 @@ Modifiers:
 """
 
 import sys
-print('[A] Loading ' + sys.argv[0] + '.\n')
+print('[a] Loading ' + sys.argv[0] + '.\n')
 
 import config
 import networkx as nx # for visualisation
 import numpy as np
 import os # for reading files
 import pprint
 import re
+import scipy.special
 import spacy # for nlp
 import time
@@ -326,6 +327,11 @@ def build_graph(node_freq_dict: dict, edge_freq_dict: dict) -> nx.Graph:
 
             pass
 
+    # Remove singletons; deepcopy lets us delete nodes while iterating.
+    for node in deepcopy(cooccurrence_graph).nodes:
+        if len(cooccurrence_graph.adj[node]) == 0:
+            cooccurrence_graph.remove_node(node)
+
     return cooccurrence_graph
 
 
@@ -424,11 +430,6 @@ def components(graph: nx.Graph, root_hub_list: list, target_string: str) -> nx.Graph:
 
     minimum_spanning_tree = nx.minimum_spanning_tree(graph_copy)
 
-    # Remove singletons, deepcopy for iteration while being altered.
-    for node in deepcopy(minimum_spanning_tree).nodes:
-        if len(minimum_spanning_tree.adj[node]) == 0:
-            minimum_spanning_tree.remove_node(node)
-
     return minimum_spanning_tree
 
 
@@ -519,8 +520,8 @@ def induce(topic_name: str, result_list: list) -> (nx.Graph, list, dict):
 
     print('[a]', 'Building graph.\t('+topic_name+')')
    graph = build_graph(node_freq_dict, edge_freq_dict)
-    stat_dict['node count'] = len(graph.nodes)
-    stat_dict['edge count'] = len(graph.edges)
+    stat_dict['nodes'] = len(graph.nodes)
+    stat_dict['edges'] = len(graph.edges)
 
     #finds root hubs (senses) within the graph + more filters for these
     print('[a]', 'Collecting root hubs.\t('+topic_name+')')
@@ -807,22 +808,101 @@ def print_stats(stat_dict: dict) -> None:
 
     ts = time.gmtime()
 
-    stat_string.append('[A] Topic:\t{}.'.format(stat_dict['target']))
-    stat_string.append('[A] Processed {} at {}.'.format(time.strftime("%Y-%m-%d", ts),time.strftime("%H:%M:%S", ts)))
-    stat_string.append('[A] Nodes: {}\tEdges: {}.'.format(stat_dict['node count'],stat_dict['edge count']))
-    stat_string.append('[A] Mean cluster length (harmonic):\t{}.'.format(stat_dict['hmean_cluster_length']))
-    stat_string.append('[A] Mean cluster length (arithmetic):\t{}.'.format(stat_dict['mean_cluster_length']))
-    stat_string.append('[A] Number of clusters: {}.'.format(stat_dict['cluster_count']))
-    stat_string.append('[A] Tuples gained through merging: {}.'.format(stat_dict['merge_gain']))
-    stat_string.append('[A] Sense inventory:')
+    key_list = ['target','nodes','edges','L','C','L_rand','C_rand','clusters','a_mean_size','h_mean_size','pipe_gain']
+
+    stat_string.append('Topic: {}.'.format(stat_dict['target']))
+    stat_string.append('Processed {} at {}.'.format(time.strftime("%Y-%m-%d", ts),time.strftime("%H:%M:%S", ts)))
+    stat_string.append('Nodes: {}\tEdges: {}.'.format(stat_dict['nodes'],stat_dict['edges']))
+    stat_string.append('Characteristic path length: {}.'.format(stat_dict['L']))
+    stat_string.append('Global clustering coefficient: {}.'.format(stat_dict['C']))
+    stat_string.append('Mean cluster length (arithmetic): {}.'.format(stat_dict['a_mean_size']))
+    stat_string.append('Mean cluster length (harmonic): {}.'.format(stat_dict['h_mean_size']))
+    stat_string.append('Number of clusters: {}.'.format(stat_dict['clusters']))
+    stat_string.append('Tuples gained through merging: {}.'.format(stat_dict['pipe_gain']))
+    stat_string.append('Sense inventory:')
     for hub in stat_dict['hubs'].keys():
-        stat_string.append('[A] {}:\t{}.'.format(hub, ", ".join(stat_dict['hubs'][hub])))
+        stat_string.append(' -> {}: {}.'.format(hub, ", ".join(stat_dict['hubs'][hub])))
 
-    with open('stats.txt', 'a') as stat_file:
-        stat_file.write('\n'.join(stat_string)+'\n\n')
-    print('\n'+'\n'.join(stat_string)+'\n')
+    print('\n[A] '+'\n[A] '.join(stat_string)+'\n')
+
+    write_header = not os.path.exists('.statistics.tsv')
+
+    with open('.statistics.tsv', 'a') as stat_file:
+
+        if write_header:
+
+            stat_file.write('\t'.join(key_list)+'\n')
+
+        stat_file.write('\t'.join([str(stat_dict[key]) for key in key_list])+'\n')
+
+
+def global_clustering_coefficient(graph: nx.Graph) -> float:
+    """Calculates global clustering coefficient from graph.
+
+    Iterates over every node and calculates the global coefficient as the
+    mean of all local clustering coefficients.
+
+    Args:
+        graph: Undirected graph.
+
+    Returns:
+        Global clustering coefficient.
+    """
+
+    local_coefficient_list = list()
+
+    for node in graph.nodes:
+
+        neighbor_list = graph.adj[node]
+
+        # Every possible edge between neighbours of the node.
+        neighbor_edge_list = [(x,y) for x in neighbor_list
+                              for y in neighbor_list if x<y]
+
+        if len(neighbor_edge_list) == 0:
+
+            local_coefficient_list.append(0)
+
+        else:
+
+            edge_count = 0
+            for x,y in neighbor_edge_list:
+                if graph.has_edge(x,y):
+                    edge_count += 1
+
+            local_coefficient_list.append(edge_count/len(neighbor_edge_list))
+
+    return np.mean(local_coefficient_list)
+
+
+def characteristic_path_length(graph: nx.Graph) -> float:
+    """Calculates characteristic path length from graph.
+
+    Iterates over every node pair and calculates the shortest path between
+    them. The average path length is returned; pairs without a path are ignored.
+
+    Args:
+        graph: Undirected graph.
+
+    Returns:
+        Characteristic path length.
+    """
+
+    path_length_list = list()
+
+    node_pair_list = [(x,y) for x in graph.nodes for y in graph.nodes if x<y]
+
+    for node_pair in node_pair_list:
+
+        if nx.has_path(graph,*node_pair):
+
+            shortest_path = nx.shortest_path(graph,*node_pair)
+
+            path_length_list.append(len(shortest_path) - 1) #length in edges
+
+    return np.mean(path_length_list)
+
+
 def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
     """Calls induction and disambiguation functions, performs main task.
@@ -849,6 +929,17 @@ def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
 
     graph, root_hub_list, stat_dict = induce(topic_name, result_dict[topic_id])
 
+    stat_dict['L'] = characteristic_path_length(graph)
+    stat_dict['C'] = global_clustering_coefficient(graph)
+
+    edge_count = len(graph.edges)
+    node_count = len(graph.nodes)
+    mean_degree = 2 * edge_count / node_count #each edge adds 2 to the degree sum
+
+    #expected values for a random graph of equal size (Watts & Strogatz 1998)
+    stat_dict['L_rand'] = np.log(node_count)/np.log(mean_degree)
+    stat_dict['C_rand'] = mean_degree/node_count
+
     colour_rank = config.colour_rank
     mst_rank = config.mst_rank
@@ -897,7 +988,7 @@ def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
         else:
             merged_mapping_dict[topic] = [result]
 
-    stat_dict['merge_gain'] = merged_entry_count
+    stat_dict['pipe_gain'] = merged_entry_count
 
     #collect statistics from result.
     cluster_count = 0
@@ -912,9 +1003,9 @@ def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
             cluster_count += 1
             cluster_length_list.append(cluster_length)
 
-    stat_dict['hmean_cluster_length'] = stats.hmean(cluster_length_list)
-    stat_dict['mean_cluster_length'] = np.mean(cluster_length_list)
-    stat_dict['cluster_count'] = cluster_count
+    stat_dict['h_mean_size'] = stats.hmean(cluster_length_list)
+    stat_dict['a_mean_size'] = np.mean(cluster_length_list)
+    stat_dict['clusters'] = cluster_count
 
     print('[a]', 'Writing to file.\t('+topic_name+')')
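
The two statistics helpers added above reimplement textbook definitions, so they can be cross-checked against the networkx built-ins. A minimal smoke-test sketch, assuming the patched functions are importable in isolation (absinth.py prints and imports config at import time, so in practice one might copy the functions into a test module first):

```python
import networkx as nx
import numpy as np

from absinth import (characteristic_path_length,
                     global_clustering_coefficient)

graph = nx.karate_club_graph()  # small, connected test graph

# The mean of local clustering coefficients should equal
# nx.average_clustering (degree-0/1 nodes count as 0 in both).
assert np.isclose(global_clustering_coefficient(graph),
                  nx.average_clustering(graph))

# On a connected graph every pair contributes, so the result should
# equal networkx's average shortest path length, counted in edges.
assert np.isclose(characteristic_path_length(graph),
                  nx.average_shortest_path_length(graph))
```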
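
The L_rand/C_rand baselines make it possible to test each induced graph for small-world structure: L should stay close to L_rand while C greatly exceeds C_rand. A hypothetical consumer of the new `.statistics.tsv`, using the sigma index of Humphries & Gurney (sigma > 1 suggests a small world); the column names match `key_list` above:

```python
import csv

with open('.statistics.tsv', newline='') as stat_file:
    for row in csv.DictReader(stat_file, delimiter='\t'):
        # sigma = (C / C_rand) / (L / L_rand)
        sigma = ((float(row['C']) / float(row['C_rand']))
                 / (float(row['L']) / float(row['L_rand'])))
        print('{}\tsigma = {:.2f}'.format(row['target'], sigma))
```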