Commit e96d083d authored by Victor Zimmermann

Added Support for Evolutionary Graph Clustering.

parent c71dbe78

absinth.py
@@ -33,8 +33,8 @@ import pprint
import random
import re
import spacy # for nlp
from multiprocessing import Pool
-from nltk.corpus import stopwords
+from copy import deepcopy
@@ -201,7 +201,7 @@ def process_file(context_list: list, target_string: str,
spaced_target_string = target_string.replace('_', ' ')
-stopword_list = set(stopwords.words('english') + config.stop_words)
+stopword_list = config.stop_words
allowed_tag_list = config.allowed_tags
min_context_size = config.min_context_size
@@ -227,7 +227,7 @@ def process_file(context_list: list, target_string: str,
pass
# Do not add stop words to nodes.
-elif token.text in stopword_list:
+elif token.is_stop or token.text in stopword_list:
pass
# Add only tokens with allowed tags to nodes.
@@ -548,8 +548,154 @@ def induce(topic_name: str, result_list: list) -> (nx.Graph, list, dict):
return graph, root_hub_list, stat_dict
+def colour_graph(graph: nx.Graph, root_hub_list: list) -> nx.Graph:
+    """Colours graph according to root hubs.
+
+    Evolving network that iteratively colours neighbouring nodes.
+
+    Args:
+        graph: Weighted undirected graph.
+        root_hub_list: List of senses.
+
+    Returns:
+        Coloured graph.
+    """
+
+    for node in graph.nodes:
+        if node in root_hub_list:
+            graph.node[node]['sense'] = root_hub_list.index(node)
+        else:
+            graph.node[node]['sense'] = None
+
+    max_iteration_count = config.max_colour_iteration_count
+
+    iteration_count = 0
+    stable = False
+    while not stable and iteration_count <= max_iteration_count:
+
+        graph_copy = deepcopy(graph)
+        iteration_count += 1
+        stable = True
+
+        for node in graph.nodes:
+
+            # Accumulate the pull of every coloured neighbour; light
+            # edges (low weight) pull harder.
+            neighbor_weight_list = [0] * len(root_hub_list)
+            for neighbor in graph_copy[node]:
+                if graph_copy.node[neighbor]['sense'] is None:
+                    pass
+                else:
+                    neighbor_weight_list[graph_copy.node[neighbor]['sense']] \
+                        += 1 - graph_copy[node][neighbor]['weight']
+
+            if any(neighbor_weight_list):
+                old_colour = graph_copy.node[node]['sense']
+                new_colour = np.argmax(neighbor_weight_list)
+                if old_colour != new_colour:
+                    stable = False
+                    graph.node[node]['sense'] = new_colour
+            else:
+                pass
+
+    return graph
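
A minimal usage sketch may help (editorial, not part of the commit): the node names and weights below are hypothetical, and the snippet assumes it runs inside this module, where config is available, and uses the networkx 1.x graph.node attribute API that the file relies on.

    # Hypothetical toy example for colour_graph(); weights act as
    # distances, so a low weight means a strong association.
    import networkx as nx
    toy = nx.Graph()
    toy.add_edge('bank', 'money', weight=0.2)
    toy.add_edge('money', 'loan', weight=0.1)
    toy.add_edge('bank', 'river', weight=0.3)
    toy.add_edge('river', 'shore', weight=0.1)
    coloured = colour_graph(toy, ['money', 'river'])
    # 'loan' ends up with sense 0 ('money'), 'shore' with sense 1
    # ('river'); 'bank' takes the sense whose neighbours pull harder.
    print({node: coloured.node[node]['sense'] for node in coloured.nodes})
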
+def disambiguate_colour(graph: nx.Graph, root_hub_list: list,
+                        context_list: list) -> dict:
+    """Maps contexts to root hubs using a coloured graph.
+
+    This algorithm colours the graph with the iterative, evolutionary
+    colouring implemented in colour_graph() and then scores each root
+    hub for a given context based on the coloured graph.
+
+    Args:
+        graph: Undirected weighted graph.
+        root_hub_list: List of root hubs (senses).
+        context_list: List of search result strings to be clustered.
+
+    Returns:
+        A dictionary with root hub IDs as keys and lists of context
+        indices as values.
+    """
+
+    coloured_graph = colour_graph(graph, root_hub_list)
+
+    mapping_dict = {i: list() for i in range(1, len(root_hub_list)+1)}
+
+    # If no sense is found for the target word, assume a single sense.
+    if len(root_hub_list) == 0:
+        mapping_dict = {0: [i for i in range(1, len(context_list)+1)]}
+        return mapping_dict
+
+    context_id = 0
+    for context in context_list:
+
+        context_id += 1
+        score = [0] * len(root_hub_list)
+        parsed_context = nlp(context)
+
+        for token in parsed_context:
+
+            if config.lemma:
+                text = token.lemma_
+            else:
+                text = token.text
+
+            if text in coloured_graph.nodes:
+
+                text_colour = coloured_graph.node[text]['sense']
+                if text_colour is None:
+                    pass
+                else:
+                    text_root = root_hub_list[text_colour]
+                    if nx.has_path(coloured_graph, text, text_root):
+
+                        shortest_path = nx.shortest_path(coloured_graph,
+                                                         text,
+                                                         text_root,
+                                                         'weight')
+                        total_weight = 0
+
+                        # Add weights of every sub-path.
+                        for i in range(1, len(shortest_path)):
+                            sub_from = shortest_path[i-1]
+                            sub_to = shortest_path[i]
+                            total_weight += \
+                                coloured_graph[sub_from][sub_to]['weight']
+
+                        # Tokens close to their hub score higher.
+                        score[text_colour] += 1 / (1 + total_weight)
+                    else:
+                        pass
+            else:
+                pass
+
+        if any(score):
+            mapping_dict[np.argmax(score)+1].append(context_id)
+        else:
+            pass
+
+    return mapping_dict
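
For intuition (an editorial note, not part of the commit): a token coloured with sense i whose shortest weighted path to its root hub sums to w contributes 1/(1+w) to score[i], so a path weight of 0.1 adds roughly 0.91 while a path weight of 0.9 adds only about 0.53; each context is then mapped to the hub with the highest accumulated score. Continuing the hypothetical toy graph above:

    # Hypothetical call; assumes nlp and config are initialised as in
    # this module, and that sense IDs shift up by one in the mapping.
    mapping = disambiguate_colour(toy, ['money', 'river'],
                                  ['loan money bank', 'river shore'])
    # Expected shape: {1: [1], 2: [2]} -- context 1 goes to hub 1
    # ('money'), context 2 to hub 2 ('river').
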
-def disambiguate(graph: nx.Graph, root_hub_list: list,
+def disambiguate_mst(graph: nx.Graph, root_hub_list: list,
                     context_list: list, topic_name: str) -> dict:
"""Matches contexts to senses.
@@ -568,7 +714,6 @@ def disambiguate(graph: nx.Graph, root_hub_list: list,
"""
#performs minimum_spanning_tree algorithm on graph
-print('[a]', 'Building minimum spanning tree.\t('+topic_name+')')
minimum_spanning_tree = components(graph, root_hub_list, topic_name)
spaced_topic_name = topic_name.replace('_', ' ')
@@ -581,7 +726,9 @@ def disambiguate(graph: nx.Graph, root_hub_list: list,
#if no sense is found for a target word, we should assume that there is only one sense
if len(root_hub_list) == 0:
-return {0:[i for i in range(1, len(context_list)+1)]}
+mapping_dict = {0:[i for i in range(1, len(context_list)+1)]}
+return mapping_dict
idx = 0
@@ -639,8 +786,8 @@ def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
"""Calls induction and disambiguation functions, performs main task.
The task is to both induce senses and match search results to them. This
-function calls in much the same way induce() and disambiguate() to perform
-these sub tasks. The result is then written to the output directory
+function calls in much the same way induce() and disambiguate_mst() to
+perform these sub tasks. The result is then written to the output directory
specified in config.py.
Args:
@@ -657,8 +804,15 @@ def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
#matches senses to clusters
print('[a]', 'Disambiguating result_list.\t('+topic_name+')')
-mapping_dict = disambiguate(graph, root_hub_list,
-                            result_dict[topic_id], topic_name)
+if config.use_colouring:
+    print('[a]', 'Colouring graph.\t('+topic_name+')')
+    mapping_dict = disambiguate_colour(graph, root_hub_list,
+                                       result_dict[topic_id])
+else:
+    print('[a]', 'Building minimum spanning tree.\t('+topic_name+')')
+    mapping_dict = disambiguate_mst(graph, root_hub_list,
+                                    result_dict[topic_id], topic_name)
#collect statistics from result.
cluster_count = 0
@@ -696,6 +850,13 @@ def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
if __name__ == '__main__':
"""Check for modifiers and call main().
+Only called when absinth.py is started manually. Checks for various
+modifiers, i.e. test environment and number of processes to run
+simultaneously.
+"""
# If absinth.py is run in test environment.
if '-t' in sys.argv:
data_path = config.test
@@ -714,6 +875,3 @@ if __name__ == '__main__':
parameter_list = [(topic_id, topic_name, result_dict)
for topic_id,topic_name in topic_dict.items()]
pool.starmap(main, parameter_list)
-#for topic_id,topic_name in topics.items():
-#word_sense_induction(topic_id,topic_name, results)

config.py
@@ -43,10 +43,16 @@ max_weight = 0.9
Choose the minimum number of neighbors and the maximum median weight of the most frequent neighbors of a node for root hubs.
- the threshold is calculated using the median of the same number of neighbors declared in min_neighbors.
'''
-min_neighbors = 5
+min_neighbors = 4
threshold = 0.8
'''
Choose whether or not the tokens should be lemmatised.
'''
-lemma = True
+lemma = False
+'''
+Options for evolutionary graph colouring.
+'''
+use_colouring = True
+max_colour_iteration_count = 50
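
How the two new options interact with the code above (a reading of the diff, not additional committed behaviour):

    # use_colouring = True  -> main() calls disambiguate_colour(), the
    #                          evolutionary graph colouring path.
    # use_colouring = False -> main() falls back to disambiguate_mst(),
    #                          the previous minimum-spanning-tree method.
    # max_colour_iteration_count caps the while loop in colour_graph():
    # if no stable colouring is reached within 50 iterations, the last
    # (possibly unconverged) colouring is returned.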