Commit 121f9d68 authored by Victor Zimmermann

Saves graphs now.

parent ffce1462
@@ -7,7 +7,7 @@ matches a list of contexts to each. The method to achieve this is a modified
reimplementation of Véronis' Hyperlex (2004).
Example:
The function can be called with the following command.:
The function can be called with the following command:
$ python3 absinth.py
@@ -23,9 +23,15 @@ Modifiers:
"""
##########################
# Dependencies #
##########################
import sys
print('[a] Loading ' + sys.argv[0] + '.\n')
import config
import json
import networkx as nx # for visualisation
import numpy as np
import os # for reading files
@@ -42,6 +48,9 @@ from scipy import stats
nlp = spacy.load('en') # standard english nlp
##########################
# Preprocessing #
##########################
def read_dataset(data_path: str) -> (dict, dict):
"""Collects topics.txt and results.txt.
@@ -85,6 +94,102 @@ def read_dataset(data_path: str) -> (dict, dict):
return results, topics
##########################
# Induction #
##########################
def induce(topic_name: str, result_list: list) -> (nx.Graph, list, dict):
"""Induces word senses for a given topic from corpus.
Counts frequencies from corpus and search result list, builds graph from
these counts (with some filters). Root hubs (senses) are collected from
this graph.
Args:
topic_name: Target string.
result_list: List of search result (context) strings.
Returns:
A cooccurrence graph,
a list of root hub strings (senses),
and a dictionary of various statistics.
"""
stat_dict = dict()
stat_dict['target'] = topic_name
# In topics of more than two words, a leading 'the' can generally be removed without changing the sense.
if topic_name[:4] == 'the_' and topic_name.count('_') > 1:
target_string = topic_name[4:]
else:
target_string = topic_name
print('[a]', 'Counting nodes and edges.\t('+topic_name+')')
# Check whether frequencies were already counted in an earlier run.
node_dict_name = topic_name+'_node.json'
edge_dict_name = topic_name+'_edge.json'
graph_in_existence = False
for graph_name in os.listdir(config.graph):
if topic_name in graph_name:
graph_in_existence = True
with open(node_dict_name, 'r') as node_file, open(edge_dict_name, 'r') as edge_file:
node_freq_dict = json.load(node_file)
# JSON object keys are strings, so the (word, word) tuple keys are restored here.
edge_freq_dict = {tuple(key.split('|')): value for key, value in json.load(edge_file).items()}
break
if not graph_in_existence:
node_freq_dict, edge_freq_dict = frequencies(target_string, result_list)
with open(node_dict_name, 'w') as node_file, open(edge_dict_name, 'w') as edge_file:
node_file.write(json.dumps(node_freq_dict))
# json.dumps cannot serialise tuple keys; '|' is assumed never to occur inside tokens.
edge_file.write(json.dumps({'|'.join(key): value for key, value in edge_freq_dict.items()}))
# Builds graph from these dictionaries; also applies multiple filters.
print('[a]', 'Building graph.\t('+topic_name+')')
graph = build_graph(node_freq_dict, edge_freq_dict)
for string in topic_name.split('_'):
if string in graph.nodes:
graph.remove_node(string)
stat_dict['nodes'] = len(graph.nodes)
stat_dict['edges'] = len(graph.edges)
# Finds root hubs (senses) within the graph and applies further filters to them.
print('[a]', 'Collecting root hubs.\t('+topic_name+')')
root_hub_list = root_hubs(graph, edge_freq_dict)
# Adds the sense inventory to the statistics, with some common neighbors for context.
stat_dict['hubs'] = dict()
for root_hub in root_hub_list:
by_frequency = lambda node: edge_freq_dict[root_hub,node] \
if root_hub < node \
else edge_freq_dict[node, root_hub]
most_frequent_neighbor_list = sorted(graph.adj[root_hub],
key=by_frequency, reverse=True)
stat_dict['hubs'][root_hub] = most_frequent_neighbor_list[:6]
return graph, root_hub_list, stat_dict
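The load-or-compute logic above could also be factored into a small reusable helper. A minimal sketch under the same assumptions (tuple-keyed edge counts, '|' never occurring inside tokens); cached_frequencies and its parameters are hypothetical, not part of absinth.py:

import json
import os

def cached_frequencies(cache_dir, topic_name, compute):
    """Loads node/edge frequency dicts from JSON if cached, else computes and saves them.

    compute is a zero-argument callable returning (node_freq, edge_freq),
    where edge_freq uses (word, word) tuple keys.
    """
    node_path = os.path.join(cache_dir, topic_name + '_node.json')
    edge_path = os.path.join(cache_dir, topic_name + '_edge.json')
    if os.path.exists(node_path) and os.path.exists(edge_path):
        with open(node_path) as node_file, open(edge_path) as edge_file:
            node_freq = json.load(node_file)
            # Restore the tuple keys that were flattened for JSON.
            edge_freq = {tuple(key.split('|')): value
                         for key, value in json.load(edge_file).items()}
        return node_freq, edge_freq
    node_freq, edge_freq = compute()
    with open(node_path, 'w') as node_file, open(edge_path, 'w') as edge_file:
        json.dump(node_freq, node_file)
        # Flatten tuple keys, since JSON only allows string keys.
        json.dump({'|'.join(key): value for key, value in edge_freq.items()}, edge_file)
    return node_freq, edge_freq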
def frequencies(target_string: str, search_result_list: list) -> (dict, dict):
"""Counts occurrences of nodes and cooccurrences.
@@ -408,151 +513,16 @@ def root_hubs(graph: nx.Graph, edge_freq_dict: dict) -> list:
return hub_list
def components(graph: nx.Graph, root_hub_list: list, target_string: str) -> nx.Graph:
"""Builds minimum spanning tree from graph and removes singletons.
Applies components algorithm from Véronis (2004) and removes singletons.
Args:
graph: Undirected weighted graph.
root_hub_list: List of strings of root hubs of graph.
target_string: Root of minimum spanning tree.
Returns:
Minimum spanning tree with target as root and root hubs as direct
children. Singletons removed.
"""
graph_copy = deepcopy(graph)
graph_copy.add_node(target_string)
for root_hub in root_hub_list:
graph_copy.add_edge(target_string,root_hub,weight=0)
minimum_spanning_tree = nx.minimum_spanning_tree(graph_copy)
return minimum_spanning_tree
def score(graph: nx.Graph, component: str, root_hub_list: list) -> np.array:
"""Calculate score for a given component in a minimum spanning tree.
First the correct root for the component is chosen. If no root hub is
suitable, an empty array is returned. A score is calculated for the distance
of the component and its root and returned as part of an array filled with
zeroes.
Args:
graph: Minimum spanning tree.
component: Node (string) from which the distances are to be calculated.
root_hub_list: List of strings of root hubs (senses) of original graph.
Returns:
Array with one score for the correct root hub and filled with zeroes.
"""
root_hub_count = len(root_hub_list)
#Initialise score array.
score_array = np.zeros(root_hub_count)
# Find root of component.
distance_list = list()
for root_hub in root_hub_list:
if nx.has_path(graph, component, root_hub):
distance_list.append(1/(1+len(nx.shortest_path(graph, component, root_hub))))
else:
distance_list.append(0)
if sum(distance_list) == 0:
return score_array
root_idx = np.argmax(distance_list)
root = root_hub_list[root_idx]
shortest_path = nx.shortest_path(graph, component, root, 'weight')
total_weight = 0
# Add weights of every sub-path.
for i in range(1, len(shortest_path)):
sub_from, sub_to = shortest_path[i-1], shortest_path[i]
total_weight += graph[sub_from][sub_to]['weight']
score_array = np.zeros(root_hub_count)
score_array[root_idx] = 1/(1+total_weight)
return score_array
def induce(topic_name: str, result_list: list) -> (nx.Graph, list, dict):
"""Induces word senses for a given topic from corpus.
Counts frequencies from corpus and search result list, builds graph from
these counts (with some filters). Root hubs (senses) are collected from
this graph.
Args:
topic_name: Target string.
result_list: List of search result (context) strings.
Returns:
A cooccurrence graph,
a list of root hub strings (senses)
and dictionary of various statistics.
"""
stat_dict = dict()
stat_dict['target'] = topic_name
#in topics longer than two words, the leading 'the' can generally be removed without changing the sense
if topic_name[:4] == 'the_' and topic_name.count('_') > 1:
target_string = topic_name[4:]
else:
target_string = topic_name
print('[a]', 'Counting nodes and edges.\t('+topic_name+')')
node_freq_dict, edge_freq_dict = frequencies(target_string, result_list)
#builds graph from these dictionaries, also applies multiple filters
print('[a]', 'Building graph.\t('+topic_name+')')
graph = build_graph(node_freq_dict, edge_freq_dict)
for string in topic_name.split('_'):
if string in graph.nodes:
graph.remove_node(string)
stat_dict['nodes'] = len(graph.nodes)
stat_dict['edges'] = len(graph.edges)
#finds root hubs (senses) within the graph + more filters for these
print('[a]', 'Collecting root hubs.\t('+topic_name+')')
root_hub_list = root_hubs(graph, edge_freq_dict)
#adds sense inventory to buffer with some common neighbors for context
stat_dict['hubs'] = dict()
for root_hub in root_hub_list:
by_frequency = lambda node: edge_freq_dict[root_hub,node] \
if root_hub < node \
else edge_freq_dict[node, root_hub]
most_frequent_neighbor_list = sorted(graph.adj[root_hub],
key=by_frequency, reverse=True)
stat_dict['hubs'][root_hub] = most_frequent_neighbor_list[:6]
return graph, root_hub_list, stat_dict
##############################
# Propagation Disambiguation #
##############################
def colour_graph(graph: nx.Graph, root_hub_list: list) -> nx.Graph:
"""Colours graph accoring to root hubs.
def label_graph(graph: nx.Graph, root_hub_list: list) -> nx.Graph:
"""propagations graph accoring to root hubs.
Evolving network that colours neighboring nodes iterative. See sentiment
Evolving network that iteratively labels neighboring nodes. See sentiment
propagation.
Args:
@@ -560,7 +530,7 @@ def colour_graph(graph: nx.Graph, root_hub_list: list) -> nx.Graph:
root_hub_list: List of senses.
Returns:
Coloured graph.
Labelled graph.
"""
@@ -579,7 +549,7 @@ def colour_graph(graph: nx.Graph, root_hub_list: list) -> nx.Graph:
graph.node[node]['sense'] = None
max_iteration_count = config.max_colour_iteration_count
max_iteration_count = config.max_propagation_iteration_count
iteration_count = 0
stable = False
@@ -607,12 +577,12 @@ def colour_graph(graph: nx.Graph, root_hub_list: list) -> nx.Graph:
graph.node[node]['dist'].append(neighbor_weight_list)
old_colour = graph_copy.node[node]['sense']
new_colour = np.argmax(np.mean(graph.node[node]['dist'], axis=0))
old_propagation = graph_copy.node[node]['sense']
new_propagation = np.argmax(np.mean(graph.node[node]['dist'], axis=0))
if old_colour != new_colour:
if old_propagation != new_propagation:
stable = False
graph.node[node]['sense'] = new_colour
graph.node[node]['sense'] = new_propagation
else:
pass
@@ -626,12 +596,12 @@ def colour_graph(graph: nx.Graph, root_hub_list: list) -> nx.Graph:
graph.node[node]['dist'] = np.mean(graph.node[node]['dist'], axis=0)
return graph
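For intuition, here is a self-contained toy version of this style of propagation, using a plain majority vote over neighbour labels instead of the weighted distance distributions used in label_graph above; all names are hypothetical:

import networkx as nx

def propagate_labels(graph, seed_labels, max_iterations=100):
    """Iteratively gives each node the most common label among its neighbours.

    seed_labels maps some nodes (the root hubs) to fixed integer labels.
    """
    labels = dict(seed_labels)
    for _ in range(max_iterations):
        stable = True
        for node in graph.nodes:
            if node in seed_labels:
                continue  # root hubs keep their own label
            neighbour_labels = [labels[n] for n in graph.adj[node] if n in labels]
            if not neighbour_labels:
                continue  # no labelled neighbour yet
            new_label = max(set(neighbour_labels), key=neighbour_labels.count)
            if labels.get(node) != new_label:
                labels[node] = new_label
                stable = False
        if stable:
            break
    return labels

# Two seeds at the ends of a chain; the labels meet in the middle.
print(propagate_labels(nx.path_graph(5), {0: 0, 4: 1}))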
def disambiguate_propagation(graph: nx.Graph, root_hub_list: list, context_list: list) -> dict:
"""Clusters senses to root hubs using a labelled graph.
def disambiguate_colour(graph: nx.Graph, root_hub_list: list, context_list: list) -> dict:
"""Clusters senses to root hubs using a coloured graph.
This algorithm colours the graph using evolutionary graph theory
This algorithm propagates labels through the graph using evolutionary graph theory
and calculates scores for each root hub given a context based on this graph.
Args:
@@ -643,7 +613,7 @@ def disambiguate_colour(graph: nx.Graph, root_hub_list: list, context_list: list
A dictionary with root hub IDs as keys and context indices as values.
"""
coloured_graph = colour_graph(graph, root_hub_list)
labelled_graph = label_graph(graph, root_hub_list)
mapping_dict = {i:list() for i in range(1,len(root_hub_list)+1)}
@@ -667,11 +637,11 @@ def disambiguate_colour(graph: nx.Graph, root_hub_list: list, context_list: list
else:
text = token.text
if text in coloured_graph.nodes:
if text in labelled_graph.nodes:
text_colour_dist = coloured_graph.node[text]['dist']
text_propagation_dist = labelled_graph.node[text]['dist']
if not any(text_colour_dist):
if not any(text_propagation_dist):
pass
@@ -681,9 +651,9 @@ def disambiguate_colour(graph: nx.Graph, root_hub_list: list, context_list: list
root_hub_idx = root_hub_list.index(root_hub)
if nx.has_path(coloured_graph , text, root_hub):
if nx.has_path(labelled_graph, text, root_hub):
shortest_path = nx.shortest_path(coloured_graph ,
shortest_path = nx.shortest_path(labelled_graph,
text,
root_hub,
'weight')
@@ -693,10 +663,10 @@ def disambiguate_colour(graph: nx.Graph, root_hub_list: list, context_list: list
for i in range(1, len(shortest_path)):
sub_from, sub_to = shortest_path[i-1], shortest_path[i]
total_weight += \
coloured_graph[sub_from][sub_to]['weight']
labelled_graph[sub_from][sub_to]['weight']
score[root_hub_idx] += (1/(1+total_weight)) \
* coloured_graph.node[text]['dist'][root_hub_idx]
* labelled_graph.node[text]['dist'][root_hub_idx]
else:
@@ -717,6 +687,88 @@ def disambiguate_colour(graph: nx.Graph, root_hub_list: list, context_list: list
return mapping_dict
##############################
# MST Disambiguation #
##############################
def components(graph: nx.Graph, root_hub_list: list, target_string: str) -> nx.Graph:
"""Builds minimum spanning tree from graph and removes singletons.
Applies the components algorithm from Véronis (2004) and removes singletons.
Args:
graph: Undirected weighted graph.
root_hub_list: List of strings of root hubs of graph.
target_string: Root of minimum spanning tree.
Returns:
Minimum spanning tree with target as root and root hubs as direct
children. Singletons removed.
"""
graph_copy = deepcopy(graph)
graph_copy.add_node(target_string)
for root_hub in root_hub_list:
graph_copy.add_edge(target_string,root_hub,weight=0)
minimum_spanning_tree = nx.minimum_spanning_tree(graph_copy)
return minimum_spanning_tree
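A usage sketch for the function above, with invented toy nodes and weights; the zero-weight edges guarantee that every root hub attaches directly to the artificial target root:

import networkx as nx

# Toy cooccurrence graph; all values are invented for illustration.
toy_graph = nx.Graph()
toy_graph.add_edge('money', 'loan', weight=0.1)
toy_graph.add_edge('bank', 'money', weight=0.2)
toy_graph.add_edge('bank', 'river', weight=0.3)
toy_graph.add_edge('river', 'water', weight=0.4)

mst = components(toy_graph, ['money', 'river'], 'bank_target')
# Both hubs hang directly below 'bank_target' through their zero-weight edges.
print(sorted(mst.edges(data='weight')))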
def score(graph: nx.Graph, component: str, root_hub_list: list) -> np.array:
"""Calculate score for a given component in a minimum spanning tree.
First the correct root for the component is chosen. If no root hub is
suitable, an empty array is returned. A score is calculated for the distance
of the component and its root and returned as part of an array filled with
zeroes.
Args:
graph: Minimum spanning tree.
component: Node (string) from which the distances are to be calculated.
root_hub_list: List of strings of root hubs (senses) of original graph.
Returns:
Array with one score at the index of the chosen root hub, zero elsewhere.
"""
root_hub_count = len(root_hub_list)
#Initialise score array.
score_array = np.zeros(root_hub_count)
# Find root of component.
distance_list = list()
for root_hub in root_hub_list:
if nx.has_path(graph, component, root_hub):
distance_list.append(1/(1+len(nx.shortest_path(graph, component, root_hub))))
else:
distance_list.append(0)
if sum(distance_list) == 0:
return score_array
root_idx = np.argmax(distance_list)
root = root_hub_list[root_idx]
shortest_path = nx.shortest_path(graph, component, root, 'weight')
total_weight = 0
# Add weights of every sub-path.
for i in range(1, len(shortest_path)):
sub_from, sub_to = shortest_path[i-1], shortest_path[i]
total_weight += graph[sub_from][sub_to]['weight']
score_array = np.zeros(root_hub_count)
score_array[root_idx] = 1/(1+total_weight)
return score_array
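Continuing the toy example from components above: 'water' reaches the 'river' hub over a single edge of weight 0.4, so its score of 1/(1+0.4) lands at the index of 'river':

scores = score(mst, 'water', ['money', 'river'])
print(scores)  # approximately [0. 0.714]; the 'money' entry stays zero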
def disambiguate_mst(graph: nx.Graph, root_hub_list: list,
context_list: list, topic_name: str) -> dict:
"""Matches contexts to senses.
@@ -804,53 +856,11 @@ def disambiguate_mst(graph: nx.Graph, root_hub_list: list,
return mapping_dict
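The body of disambiguate_mst is collapsed in this view; presumably each context ends up with one score array over the root hubs and is assigned to the argmax, mirroring the mapping_dict convention above (keys 1..n). A minimal sketch of that final matching step, with hypothetical names:

import numpy as np

def match_contexts(score_arrays, root_hub_count):
    """Assigns each context index to the root hub with the highest score."""
    mapping_dict = {hub_id: list() for hub_id in range(1, root_hub_count + 1)}
    for context_idx, score_array in enumerate(score_arrays):
        if any(score_array):  # skip contexts that reach no root hub
            mapping_dict[int(np.argmax(score_array)) + 1].append(context_idx)
    return mapping_dict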
def print_stats(stat_dict: dict) -> None:
"""Prints various statistics and logs them to file.
Args:
stat_dict: Dictionary with various statistics.
"""
stat_string = []
ts = time.gmtime()
key_list= ['target','nodes','edges','L','C','L_rand','C_rand','clusters','a_mean_size','h_mean_size','pipe_gain']
stat_string.append('Topic: {}.'.format(stat_dict['target']))
stat_string.append('Processed {} at {}.'.format(time.strftime("%Y-%m-%d", ts),time.strftime("%H:%M:%S", ts)))
stat_string.append('Nodes: {}\tEdges: {}.'.format(stat_dict['nodes'],stat_dict['edges']))
stat_string.append('Characteristic path length: {}.'.format(stat_dict['L']))
stat_string.append('Global clustering coefficient: {}.'.format(stat_dict['C']))
stat_string.append('Mean cluster length (arithmetic): {}.'.format(stat_dict['a_mean_size']))
stat_string.append('Mean cluster length (harmonic): {}.'.format(stat_dict['h_mean_size']))
stat_string.append('Number of clusters: {}.'.format(stat_dict['clusters']))
stat_string.append('Tuples gained through merging: {}.'.format(stat_dict['pipe_gain']))
stat_string.append('Sense inventory:')
for hub in stat_dict['hubs'].keys():
stat_string.append(' -> {}: {}.'.format(hub, ", ".join(stat_dict['hubs'][hub])))
print('\n[A] '+'\n[A] '.join(stat_string)+'\n')
with open('statistics.txt', 'a') as stat_file:
stat_file.write('\n '.join(stat_string)+'\n\n')
write_header = not os.path.exists('.statistics.tsv')
with open('.statistics.tsv', 'a') as stat_file:
if write_header:
stat_file.write('\t'.join(key_list)+'\n')
stat_file.write('\t'.join([str(stat_dict[key]) for key in key_list])+'\n')
##############################
# Statistics #
##############################
def global_clustering_coefficient(graph: nx.Graph) -> float:
"""Calculates global clustering coefficient from graph.
@@ -918,6 +928,56 @@ def characteristic_path_length(graph: nx.Graph) -> float:
return np.mean(path_length_list)
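The statistics bodies are largely collapsed in this view; for cross-checking, networkx ships reference implementations of both measures. Note that "global clustering coefficient" can mean either transitivity or the mean local coefficient, and the collapsed code could implement either:

import networkx as nx

toy = nx.karate_club_graph()  # small built-in test graph

# Ratio of closed triplets to all triplets (transitivity).
print(nx.transitivity(toy))
# Mean of the local clustering coefficients.
print(nx.average_clustering(toy))

# Characteristic path length: mean shortest-path length over all node pairs,
# defined only for a connected graph.
if nx.is_connected(toy):
    print(nx.average_shortest_path_length(toy))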
def print_stats(stat_dict: dict) -> None:
"""Prints various statistics and logs them to file.
Args:
stat_dict: Dictionary with various statistics.
"""
stat_string = []
ts = time.gmtime()
key_list = ['target', 'nodes', 'edges', 'L', 'C', 'L_rand', 'C_rand', 'clusters', 'a_mean_size', 'h_mean_size', 'pipe_gain']
stat_string.append('Topic: {}.'.format(stat_dict['target']))
stat_string.append('Processed {} at {}.'.format(time.strftime("%Y-%m-%d", ts),time.strftime("%H:%M:%S", ts)))
stat_string.append('Nodes: {}\tEdges: {}.'.format(stat_dict['nodes'],stat_dict['edges']))
stat_string.append('Characteristic path length: {}.'.format(stat_dict['L']))
stat_string.append('Global clustering coefficient: {}.'.format(stat_dict['C']))
stat_string.append('Mean cluster size (arithmetic): {}.'.format(stat_dict['a_mean_size']))
stat_string.append('Mean cluster size (harmonic): {}.'.format(stat_dict['h_mean_size']))
stat_string.append('Number of clusters: {}.'.format(stat_dict['clusters']))
stat_string.append('Tuples gained through merging: {}.'.format(stat_dict['pipe_gain']))
stat_string.append('Sense inventory:')
for hub in stat_dict['hubs'].keys():
stat_string.append(' -> {}: {}.'.format(hub, ", ".join(stat_dict['hubs'][hub])))
print('\n[A] '+'\n[A] '.join(stat_string)+'\n')
with open('statistics.txt', 'a') as stat_file:
stat_file.write('\n '.join(stat_string)+'\n\n')
write_header = not os.path.exists('.statistics.tsv')
with open('.statistics.tsv', 'a') as stat_file:
if write_header:
stat_file.write('\t'.join(key_list)+'\n')
stat_file.write('\t'.join([str(stat_dict[key]) for key in key_list])+'\n')
##############################
# main #
##############################
def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
"""Calls induction and disambiguation functions, performs main task.
@@ -955,7 +1015,7 @@ def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
stat_dict['C_rand'] = 2 * mean_degree/node_count
colour_rank = config.colour_rank
propagation_rank = config.colour_rank
mst_rank = config.mst_rank
# Merges mappings according to the pipeline.
@@ -963,10 +1023,10 @@
# Matches senses to clusters.
print('[a]', 'Disambiguating results.\t('+topic_name+')')
if colour_rank != 0:
if propagation_rank != 0:
print('[a]', 'Colouring graph.\t('+topic_name+')')
mapping_dict[colour_rank] = disambiguate_colour(graph, root_hub_list,
print('[a]', 'Propagating through graph.\t('+topic_name+')')
mapping_dict[propagation_rank] = disambiguate_propagation(graph, root_hub_list,
result_dict[topic_id])
if mst_rank != 0: