From a8caa669e195e29eedfc20b32accba16e247dc54 Mon Sep 17 00:00:00 2001
From: Victor Zimmermann <zimmermann@cl.uni-heidelberg.de>
Date: Wed, 21 Mar 2018 16:54:36 +0100
Subject: [PATCH] absinth.py: add graph statistics and TSV stats output

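Move singleton removal from components() into build_graph() so that
isolated nodes are dropped as soon as the co-occurrence graph is built.
Add two graph statistics, the characteristic path length L and the
clustering coefficient C, together with the usual small-world baselines
for a random graph of the same size, L_rand = ln(N)/ln(<k>) and
C_rand = <k>/N (Watts & Strogatz, 1998).

Statistics are no longer appended to stats.txt as free text; each run
now appends one tab-separated row to .statistics.tsv, writing the
header row on first use:

    target  nodes  edges  L  C  L_rand  C_rand  clusters  a_mean_size  h_mean_size  pipe_gain
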
---
 src/absinth.py | 139 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 115 insertions(+), 24 deletions(-)

diff --git a/src/absinth.py b/src/absinth.py
index 06b74ad..294e55c 100644
--- a/src/absinth.py
+++ b/src/absinth.py
@@ -24,13 +24,14 @@ Modifiers:
 """
 
 import sys
-print('[A] Loading ' + sys.argv[0] + '.\n')
+print('[a] Loading ' + sys.argv[0] + '.\n')
 import config
 import networkx as nx # for visualisation
 import numpy as np
 import os # for reading files
 import pprint
 import re
+import scipy.special
 import spacy # for nlp
 import time
 
@@ -326,6 +327,11 @@ def build_graph(node_freq_dict: dict, edge_freq_dict: dict) -> nx.Graph:
                 
                 pass
     
+    # Remove singletons; iterate over a deep copy so that nodes can be
+    # removed from the graph while iterating.
+    for node in deepcopy(cooccurrence_graph).nodes:
+        if len(cooccurrence_graph.adj[node]) == 0:
+            cooccurrence_graph.remove_node(node)
+    
     return cooccurrence_graph
 
 
@@ -424,11 +430,6 @@ def components(graph: nx.Graph, root_hub_list: list, target_string: str) -> nx.G
         
     minimum_spanning_tree = nx.minimum_spanning_tree(graph_copy)
     
-    # Remove singletons, deepcopy for iteration while being altered.
-    for node in deepcopy(minimum_spanning_tree).nodes:
-        if len(minimum_spanning_tree.adj[node]) == 0:
-            minimum_spanning_tree.remove_node(node)
-    
     return minimum_spanning_tree
 
 
@@ -519,8 +520,8 @@ def induce(topic_name: str, result_list: list) -> (nx.Graph, list, dict):
     print('[a]', 'Building graph.\t('+topic_name+')')
     graph = build_graph(node_freq_dict, edge_freq_dict)
     
-    stat_dict['node count'] = len(graph.nodes)
-    stat_dict['edge count'] = len(graph.edges)
+    stat_dict['nodes'] = len(graph.nodes)
+    stat_dict['edges'] = len(graph.edges)
 
     #finds root hubs (senses) within the graph + more filters for these
     print('[a]', 'Collecting root hubs.\t('+topic_name+')')
@@ -807,22 +808,101 @@ def print_stats(stat_dict: dict) -> None:
     
     ts = time.gmtime()
     
-    stat_string.append('[A] Topic:\t{}.'.format(stat_dict['target']))
-    stat_string.append('[A] Processed {} at {}.'.format(time.strftime("%Y-%m-%d", ts),time.strftime("%H:%M:%S", ts)))
-    stat_string.append('[A] Nodes: {}\tEdges: {}.'.format(stat_dict['node count'],stat_dict['edge count']))
-    stat_string.append('[A] Mean cluster length (harmonic):\t{}.'.format(stat_dict['hmean_cluster_length']))
-    stat_string.append('[A] Mean cluster length (arithmetic):\t{}.'.format(stat_dict['mean_cluster_length']))
-    stat_string.append('[A] Number of clusters: {}.'.format(stat_dict['cluster_count']))
-    stat_string.append('[A] Tuples gained through merging: {}.'.format(stat_dict['merge_gain']))
-    stat_string.append('[A] Sense inventory:')
+    key_list = ['target','nodes','edges','L','C','L_rand','C_rand','clusters','a_mean_size','h_mean_size','pipe_gain']
+    
+    stat_string.append('Topic: {}.'.format(stat_dict['target']))
+    stat_string.append('Processed {} at {}.'.format(time.strftime("%Y-%m-%d", ts),time.strftime("%H:%M:%S", ts)))
+    stat_string.append('Nodes: {}\tEdges: {}.'.format(stat_dict['nodes'],stat_dict['edges']))
+    stat_string.append('Characteristic path length: {}.'.format(stat_dict['L']))
+    stat_string.append('Global clustering coefficient: {}.'.format(stat_dict['C']))
+    stat_string.append('Mean cluster size (arithmetic): {}.'.format(stat_dict['a_mean_size']))
+    stat_string.append('Mean cluster size (harmonic): {}.'.format(stat_dict['h_mean_size']))
+    stat_string.append('Number of clusters: {}.'.format(stat_dict['clusters']))
+    stat_string.append('Tuples gained through merging: {}.'.format(stat_dict['pipe_gain']))
+    stat_string.append('Sense inventory:')
     for hub in stat_dict['hubs'].keys():
-        stat_string.append('[A] {}:\t{}.'.format(hub, ", ".join(stat_dict['hubs'][hub])))
+        stat_string.append(' -> {}: {}.'.format(hub, ", ".join(stat_dict['hubs'][hub])))
     
-    with open('stats.txt', 'a') as stat_file:
-        stat_file.write('\n'.join(stat_string)+'\n\n')
-        print('\n'+'\n'.join(stat_string)+'\n')
+    print('\n[A] '+'\n[A] '.join(stat_string)+'\n')
+    
+    write_header = not os.path.exists('.statistics.tsv')
+    
+    with open('.statistics.tsv', 'a') as stat_file:
+        
+        if write_header:
+            
+            stat_file.write('\t'.join(key_list)+'\n')
+            
+        stat_file.write('\t'.join([str(stat_dict[key]) for key in key_list])+'\n')
+
         
 
+def global_clustering_coefficient(graph: nx.Graph) -> float:
+    """Calculates the clustering coefficient of the graph.
+    
+    Iterates over every node and returns the arithmetic mean of the local
+    clustering coefficients, i.e. each node's share of realised edges
+    among its neighbours.
+    
+    Args:
+        graph: Undirected graph.
+        
+    Returns:
+        Mean local clustering coefficient.
+    """
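+    # Note: despite the function name, this is the mean of the local
+    # clustering coefficients (Watts-Strogatz average clustering), which in
+    # general differs from the triangle-based global transitivity. It should
+    # agree with nx.average_clustering on simple undirected graphs.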
+    
+    local_coefficient_list = list()
+    
+    for node in graph.nodes:
+        
+        neighbor_list = graph.adj[node]
+        
+        # Every possible edge between two distinct neighbours of this node.
+        neighbor_edge_list = [(x,y) for x in neighbor_list 
+                              for y in neighbor_list if x<y]
+        
+        if len(neighbor_edge_list) == 0:
+            
+            # Nodes with fewer than two neighbours contribute 0 by convention.
+            local_coefficient_list.append(0)
+        
+        else:
+            
+            # Local coefficient: fraction of possible neighbour pairs that
+            # are actually connected.
+            edge_count = 0
+            for x,y in neighbor_edge_list:
+                if graph.has_edge(x,y):
+                    edge_count += 1
+            
+            local_coefficient_list.append(edge_count/len(neighbor_edge_list))
+        
+    return np.mean(local_coefficient_list)
+
+
+def characteristic_path_length(graph: nx.Graph) -> float:
+    """Calculates the characteristic path length of the graph.
+    
+    Iterates over every pair of nodes and calculates the shortest path
+    between them. The mean shortest-path length is returned; pairs with
+    no connecting path are ignored.
+    
+    Args:
+        graph: Undirected graph.
+        
+    Returns:
+        Mean shortest-path length over all connected node pairs.
+    """
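+    # Unlike nx.average_shortest_path_length, which raises an exception on
+    # disconnected graphs, unreachable node pairs are simply skipped here.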
+    
+    path_length_list = list()
+    
+    # All unordered pairs of distinct nodes.
+    node_pair_list = [(x,y) for x in graph.nodes for y in graph.nodes if x<y]
+    
+    for node_pair in node_pair_list:
+        
+        if nx.has_path(graph,*node_pair):
+            
+            # Path length counted in edges; len(nx.shortest_path(...)) counts
+            # nodes and would overshoot by one.
+            path_length_list.append(nx.shortest_path_length(graph,*node_pair))
+        
+    return np.mean(path_length_list)
+
+
 def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
     """Calls induction and disambiguation functions, performs main task.
 
@@ -849,6 +929,17 @@ def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
         
         graph, root_hub_list, stat_dict = induce(topic_name, result_dict[topic_id])
         
+        stat_dict['L'] = characteristic_path_length(graph)
+        stat_dict['C'] = global_clustering_coefficient(graph)
+        
+        edge_count = len(graph.edges)
+        node_count = len(graph.nodes)
+        # Mean degree of an undirected graph is 2E/N (each edge has two ends).
+        mean_degree = 2 * edge_count/node_count
+        
+        # Random-graph baselines (Watts & Strogatz, 1998):
+        # L_rand ~ ln(N)/ln(<k>) and C_rand ~ <k>/N.
+        stat_dict['L_rand'] = np.log(node_count)/np.log(mean_degree)
+        stat_dict['C_rand'] = mean_degree/node_count
+        
         colour_rank = config.colour_rank
         mst_rank = config.mst_rank
         
@@ -897,7 +988,7 @@ def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
                     else:
                         merged_mapping_dict[topic] = [result]
                         
-        stat_dict['merge_gain'] = merged_entry_count
+        stat_dict['pipe_gain'] = merged_entry_count
         
         #collect statistics from result.
         cluster_count = 0
@@ -912,9 +1003,9 @@ def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
                 cluster_count += 1
                 cluster_length_list.append(cluster_length)
                 
-        stat_dict['hmean_cluster_length'] = stats.hmean(cluster_length_list)
-        stat_dict['mean_cluster_length'] = np.mean(cluster_length_list)
-        stat_dict['cluster_count'] = cluster_count
+        stat_dict['h_mean_size'] = stats.hmean(cluster_length_list)
+        stat_dict['a_mean_size'] = np.mean(cluster_length_list)
+        stat_dict['clusters'] = cluster_count
 
         print('[a]', 'Writing to file.\t('+topic_name+')')
         
-- 
GitLab