From 565b5233e600f293f419db2f364ff99e13a4093d Mon Sep 17 00:00:00 2001
From: Victor Zimmermann <zimmermann@cl.uni-heidelberg.de>
Date: Fri, 16 Mar 2018 17:47:16 +0100
Subject: [PATCH] Reimplement components() + more comment reform.

---
 src/absinth.py | 306 +++++++++++++++++++++++++++++++------------------
 1 file changed, 194 insertions(+), 112 deletions(-)

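For reference, a minimal sketch of how the refactored functions compose, based
on the signatures visible in this patch. Assumptions: src/ is on the Python
path, importing absinth does not trigger main(), and the corpus/config read by
frequencies() is in place; induce_senses is an illustrative helper name, not
part of the patch.

    from absinth import (frequencies, build_graph, root_hubs, components,
                         disambiguate)

    def induce_senses(target_string, context_list):
        # Mirrors the calls made in word_sense_induction() below.
        node_freq_dict, edge_freq_dict = frequencies(target_string, context_list)
        graph = build_graph(node_freq_dict, edge_freq_dict)
        root_hub_list = root_hubs(graph, edge_freq_dict)
        mst = components(graph, root_hub_list, target_string)
        return disambiguate(mst, root_hub_list, context_list, target_string)

disambiguate() returns a dictionary mapping cluster ids (starting at 1) to
context indices, matching the output written in word_sense_induction().
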
diff --git a/src/absinth.py b/src/absinth.py
index 7dcf530..a347bd1 100644
--- a/src/absinth.py
+++ b/src/absinth.py
@@ -44,7 +44,7 @@ def frequencies(target_string, search_result_list):
     
     bracketed_target_string = '('+target_string+')'
     
-    # Remove unnecessary tokens from snippets 
+    # Remove unnecessary tokens from snippets.
     _search_result_list = list()
     for r in search_result_list:
         r = r.replace('<b>', '')
@@ -53,13 +53,12 @@ def frequencies(target_string, search_result_list):
         r = r.strip()
         _search_result_list.append(r)
     
-    #initialises frequencies with counts from results
+    # Initialise frequencies with counts from results.
     node_freq_dict, edge_freq_dict = process_file(_search_result_list,
                                                   target_string,
                                                   dict(),
                                                   dict()) 
     
-    #names of corpus files
     corpus_file_path_list = [corpus_path + f for f in os.listdir(corpus_path)]
     corpus_size = len(corpus_file_path_list)
     
@@ -69,7 +68,7 @@ def frequencies(target_string, search_result_list):
         node_count = len(node_freq_dict)
         edge_count = len(edge_freq_dict)
         
-        #prints update after every 11th of the corpus is parsed
+        # Print a progress update after each eleventh of the corpus is parsed.
         if processed_file_count % int(corpus_size/11) == 0: 
             
             file_ratio = processed_file_count / corpus_size
@@ -78,7 +77,7 @@ def frequencies(target_string, search_result_list):
             
             ratios = [file_ratio, max_node_ratio, max_edge_ratio]
             
-            #uses the ratio closest to 100%.
+            # Use the ratio closest to 100%.
             highest_ratio = int((max(ratios))*100)
             
             print('[a] ~{:02d}%\tNodes: {}\tEdges: {}\t{}.'.format(highest_ratio,
@@ -86,7 +85,6 @@ def frequencies(target_string, search_result_list):
                                                                    edge_count,
                                                                    bracketed_target_string))
         
-        #checks maximum node values
         if node_count > max_node_count:
             print('[a] 100%\tNodes: {}\tEdges: {}\t{}.'.format(node_count,
                                                                edge_count,
@@ -148,11 +146,11 @@ def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
         for context in context_list:
             
             context = context.lower()
-            if spaced_target_string in context: #greedy pre selection, not perfect
+            if spaced_target_string in context: # Greedy pre-selection, not perfect.
                 
-                token_set = set() #set of node candidates
+                token_set = set()
                 
-                #This replacement allows target to be treated as single entity.
+                # Allow the target to be treated as a single entity.
                 context = context.replace(spaced_target_string, target_string)
                 processed_context = nlp(context)
                 
@@ -160,15 +158,15 @@ def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
                     
                     for token in processed_context:
                         
-                        #doesn't add target word to nodes
+                        # Do not add target word to nodes.
                         if token.text == target_string:
                             pass
                         
-                        #doesn't add stop words to nodes
+                        # Do not add stop words to nodes.
                         elif token.text in stopword_list:
                             pass
                         
-                        #only adds tokens with allowed tags to nodes
+                        # Add only tokens with allowed tags to nodes.
                         elif token.tag_ in allowed_tag_list:
                             token_set.add(token.text)
                             
@@ -190,14 +188,14 @@ def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
                             else:
                                 edge_freq_dict[edge] = 1
     
-    #if a file is corrupted (can't always be catched with if-else)
+    # If a file is corrupted (cannot always be caught with if-else), ignore it.
     except UnicodeDecodeError:
         
         pass            
     
     return node_freq_dict, edge_freq_dict
 
-#build graph from frequency dictionaries
+
 def build_graph(node_freq_dict, edge_freq_dict):
     """Builds undirected weighted graph from dictionaries.
     
@@ -221,13 +219,11 @@ def build_graph(node_freq_dict, edge_freq_dict):
     
     cooccurence_graph = nx.Graph()
     
-    #node : node frequency
     for node, frequency in node_freq_dict.items():
         
         if frequency >= min_node_freq:
             cooccurence_graph.add_node(node)
             
-    #edge : edge frequency
     for node_tuple, frequency in edge_freq_dict.items():
         
         if frequency < min_edge_freq:
@@ -265,166 +261,246 @@ def build_graph(node_freq_dict, edge_freq_dict):
     return cooccurence_graph
 
 
-#Identifies senses by choosing nodes with high degrees
-def root_hubs(graph, edge_freq_dict, min_neighbors=4, theshold=0.8):
+def root_hubs(graph, edge_freq_dict):
+    """Identifies senses (root hubs) by choosing nodes with high degrees.
+    
+    Selects root hubs according to the algorithm in Véronis (2004). Nodes with
+    a high degree whose most frequent neighbors have low edge weights (i.e.
+    high cooccurrence) are chosen until there are no more viable candidates.
+    A candidate is every node that is neither a hub nor a neighbor of one.
+    
+    Args:
+        graph: Weighted undirected graph.
+        edge_freq_dict: Dictionary of cooccurrence frequencies per edge.
+        
+    Returns:
+        hub_list: List of root hubs, i.e. strings that are selected using the
+            algorithm explained above.
+    """
     
     min_neighbors = config.min_neighbors
     threshold = config.threshold
     
-    G = deepcopy(graph)
-    V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True) # sorts according to degree
-    H = list() #output list
+    # Allow operations on the graph without altering the original.
+    graph_copy = deepcopy(graph)
+    
+    # Sort according to degree (number of neighbors).
+    candidate_list = sorted(graph_copy.nodes,
+                            key=lambda node: graph_copy.degree[node],
+                            reverse=True)
+    
+    hub_list = list()
     
-    while V:
+    # While there are still candidates, search for root hubs.
+    while candidate_list:
         
-        v = V[0] #best hub candidate
+        candidate = candidate_list[0] # Best hub candidate.
         
-        if G.degree[v] >= min_neighbors:
+        if graph_copy.degree[candidate] >= min_neighbors:
         
-            mfn = sorted(G.adj[v], key=lambda key: edge_freq_dict[v,key] if v < key else edge_freq_dict[key, v], reverse=True)[:min_neighbors] #most frequent neighbors
+            by_frequency = lambda node: edge_freq_dict[candidate,node] \
+                                         if candidate < node \
+                                         else edge_freq_dict[node,candidate]
+                              
+            most_frequent_neighbor_list = sorted(graph_copy.adj[candidate],
+                                                 key=by_frequency,
+                                                 reverse=True) [:min_neighbors] 
             
-            if np.mean([G.edges[v,n]['weight'] for n in mfn]) < theshold: #if the median weight of the most frequent neighbors is under threshold
+            # If the most frequent neighbors cooccur frequently enough with
+            # the candidate (mean weight below threshold), it is approved.
+            if np.mean([graph_copy.edges[candidate, node]['weight']
+                        for node in most_frequent_neighbor_list]) < threshold:
                 
-                H.append(v)
+                # Add candidate as root hub.
+                hub_list.append(candidate)
             
-                #removes neighbors of new hub as hub candidates
-                for nbr in deepcopy(G).adj[v]:
+                # Remove neighbors of new hub as hub candidates.
+                for neighbor in deepcopy(graph_copy).adj[candidate]:
+                    graph_copy.remove_node(neighbor)
                 
-                    G.remove_node(nbr)
-                
-            #removes hub candidate
-            G.remove_node(v)
+            # Remove hub candidate.
+            graph_copy.remove_node(candidate)
             
-            #reorderd potential hubs after deletions
-            V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True)
+            # Reorder potential hubs after deletions.
+            candidate_list = sorted(graph_copy.nodes,
+                                    key=lambda node: graph_copy.degree[node],
+                                    reverse=True)
         
         else:
         
-            return H
+            return hub_list
     
-    return H
+    return hub_list
 
 
-#Components algorithm from Véronis (2004), converts graph for target into a MST
-def components(graph, hubs, target_string):
+def components(graph, root_hub_list, target_string):
+    """Builds minimum spanning tree from graph and removes singletons.
     
-    G = deepcopy(graph)
-    H = hubs #root hubs
-    t = target_string
+    Applies the components algorithm from Véronis (2004) and removes singletons.
     
-    #G.add_node(t)
-    #for h in H:
-        #G.add_edge(t,h,weight=0)
+    Args:
+        graph: Undirected weighted graph.
+        root_hub_list: List of strings of root hubs of graph.
+        target_string: Root of minimum spanning tree.
         
-    T = nx.minimum_spanning_tree(G)
+    Returns:
+        minimum_spanning_tree: Minimum spanning tree with target as
+            root and root hubs as direct children. Singletons removed.
+    """
+    
+    graph_copy = deepcopy(graph)
+    
+    graph_copy.add_node(target_string)
+    for root_hub in root_hub_list:
+        graph_copy.add_edge(target_string, root_hub, weight=0)
+        
+    minimum_spanning_tree = nx.minimum_spanning_tree(graph_copy)
     
-    #removes singletons
-    for node in deepcopy(T).nodes:
-        if len(T.adj[node]) == 0:
-            T.remove_node(node)
+    # Remove singletons; iterate over a deepcopy while the tree is altered.
+    for node in deepcopy(minimum_spanning_tree).nodes:
+        if len(minimum_spanning_tree.adj[node]) == 0:
+            minimum_spanning_tree.remove_node(node)
     
-    return T
+    return minimum_spanning_tree
 
 
-#Calculates score for a given path in a minimum spanning tree
-def score(graph, from_node, to_node):
+def score(graph, component, root_hub_list):
+    """Calculates the score for a given component in a minimum spanning tree.
     
-    #if correct tree
-    if nx.has_path(graph, from_node, to_node):
-                
-        # calculates shortest path (approximation for path with lowest total weight)
-        path = nx.shortest_path(graph, from_node, to_node, 'weight')
-        total_weight = 0
+    First the closest root hub of the component is chosen. If no root hub is
+    reachable, an array of zeroes is returned. Otherwise a score based on the
+    total path weight between the component and its root hub is written into
+    an otherwise zero-filled array.
     
-        #adds weights of every sub-path
-        for i in range(1, len(path)):
-            sub_from, sub_to = path[i-1], path[i]
-            total_weight += graph[sub_from][sub_to]['weight']
+    Args:
+        graph: Minimum spanning tree.
+        component: Node (string) from which the distances are to be calculated.
+        root_hub_list: List of strings of root hubs (senses) of original graph.
     
-        #the further the path, the lower the score
-        return 1/(1+total_weight)
-        
-    else:
-        
-        return 0
+    Returns:
+        score_array: Array that is zero everywhere except at the index of the
+            chosen root hub, which holds the score.
+    """
+    
+    root_hub_count = len(root_hub_list)
+    
+    # Initialise score array.
+    score_array = np.zeros(root_hub_count)
+    
+    # Find the closest root hub of the component.
+    distance_list = list()
+    for root_hub in root_hub_list:
+        if nx.has_path(graph, component, root_hub):
+            distance_list.append(1/(1+len(nx.shortest_path(graph, component, root_hub))))
+        else:
+            distance_list.append(0)
+            
+    if sum(distance_list) == 0:
+        return score_array
+            
+    root_idx = np.argmax(distance_list)
+    root = root_hub_list[root_idx]
+    
+    shortest_path = nx.shortest_path(graph, component, root, 'weight')
+    total_weight = 0
+
+    # Add weights of every sub-path.
+    for i in range(1, len(shortest_path)):
+        sub_from, sub_to = shortest_path[i-1], shortest_path[i]
+        total_weight += graph[sub_from][sub_to]['weight']
 
+    # Only the index of the chosen root hub receives a non-zero score.
+    score_array[root_idx] = 1/(1+total_weight)
+    
+    return score_array
 
-# Basically Word Sense Disambiguation, matches context to sense
-def disambiguate(mst, hubs, contexts, target_string):
+
+def disambiguate(minimum_spanning_tree, root_hub_list,
+                 context_list, target_string):
+    """Matches contexts to senses.
+    
+    Adds up scores for each token in a context string and matches the context
+    to the root hub with the highest score.
+    
+    Args:
+        minimum_spanning_tree: Minimum spanning tree with target as root.
+        root_hub_list: List of strings of root hubs (senses).
+        context_list: List of sentence strings that are to be clustered.
+        target_string: String of target word, also root of MST.
+    
+    Returns:
+        mapping_dict: Dictionary with cluster ids (one per root hub) as keys
+            and lists of context ids as values.
+    """
     
     target_string = target_string.replace('_', ' ')
-    T = mst #minimum spanning tree
-    H = hubs #root hubs
-    C = [c.lower().strip().replace(target_string, '') for c in contexts] #cleaned up contexts
+    context_list = [context.lower().strip().replace(target_string, '')
+                    for context in context_list]
     
     score_dict = dict() #memoisation for scores
-    mapping_dict = {topic:[] for topic in range(1,len(H)+1)} #output of function
+    mapping_dict = {topic:[] for topic in range(1,len(root_hub_list)+1)}
     
     #if no sense is found for a target word, we should assume that there only is one sense
-    if len(H) == 0: 
+    if len(root_hub_list) == 0: 
             
-        return {0:[i for i in range(1, len(C)+1)]}
+        return {0:[i for i in range(1, len(context_list)+1)]}
     
     idx = 0
     
-    for c in C:
+    for context in context_list:
         
         idx += 1 #index based on position in list
     
-        doc = nlp(c) #parsed context
-        texts = [tok.text for tok in doc] #tokens
+        processed_context = nlp(context)
+        text_list = [token.text for token in processed_context]
         
-        scores = np.zeros(len(H)) #initialise with zeros for every sense
+        score_array = np.zeros(len(root_hub_list)) # One score per sense.
         
-        for text in texts:
+        for text in text_list:
             
-            if text in T.nodes: #if word wasn't filtered out
-                
-                new_scores = list() #scores to be added to total scores
+            if text in minimum_spanning_tree.nodes: # Word was not filtered out.
                 
-                for h in H: #for each hub
-                    
-                    if (text, h) in score_dict: #memoisation
+                if text in score_dict: # Memoisation.
                         
-                        new_scores.append(score_dict[(text,h)])
+                    new_score = score_dict[text]
+                
+                else:
                     
-                    else:
-                        
-                        new_score = score(T, text, h)
-                        new_scores.append(new_score)
-                        score_dict[(text,h)] = new_score #memoisation
+                    new_score = score(minimum_spanning_tree, 
+                                      text, root_hub_list)
+                    score_dict[text] = new_score # Memoisation.
                     
-                scores = scores + np.array(new_scores)
+                score_array += new_score
             
             else:
             
                 pass
         
-        #if the disambiguator could not detect a sense, it should return a singleton, ie. nothing
-        if np.max(scores) == 0:
+        # If no sense is detected, leave the context unassigned (singleton).
+        if np.max(score_array) == 0:
             
             pass
             
         else:
                 
-            #applies sense with the highest score to context
-            max_score = np.max(scores)
-            argmax_score = np.argmax(scores)
+            # Apply the sense with the highest score to the context.
+            max_score = np.max(score_array)
+            argmax_score = np.argmax(score_array)
             
-            #clusters begin at 1
+            # Cluster ids begin at 1.
             mapping_dict[argmax_score + 1].append(idx)
 
     return mapping_dict
 
 
 # our main function, here the main stepps for word sense induction are called
-def word_sense_induction(topic_id, topic_name, results):
+def word_sense_induction(topic_id, topic_name, result_list):
     
     #buffer for useful information
     out_buffer = '\n'
     
     #path for output(directory)
-    output_path = './test/'#config.output
+    output_path = config.output
             
     #removes trailing new_lines
     old_target_string = topic_name.strip() #original target
@@ -449,7 +525,7 @@ def word_sense_induction(topic_id, topic_name, results):
     
     #counts occurences of single words, as well as cooccurrences, saves it in dictionary
     print('[a]', 'Counting nodes and edges.\t('+old_target_string+')')
-    node_freq_dict, edge_freq_dict = frequencies(target_string, results[topic_id])
+    node_freq_dict, edge_freq_dict = frequencies(target_string, result_list[topic_id])
     
     #builds graph from these dictionaries, also applies multiple filters
     print('[a]', 'Building graph.\t('+old_target_string+')')
@@ -474,20 +550,20 @@ def word_sense_induction(topic_id, topic_name, results):
     T = components(G, H, target_string)
 
     #matches senses to clusters
-    print('[a]', 'Disambiguating results.\t('+old_target_string+')')
-    D = disambiguate(T, H, results[topic_id], target_string)
+    print('[a]', 'Disambiguating results.\t('+old_target_string+')')
+    D = disambiguate(T, H, result_list[topic_id], target_string)
     
     out_buffer += ('[A] Mapping: \n')
-    for cluster,results in D.items():
-        out_buffer += (' {}. : {}\n'.format(cluster, ', '.join([str(r) for r in results])))
+    for cluster, cluster_result_list in D.items():
+        out_buffer += (' {}. : {}\n'.format(cluster, ', '.join([str(r) for r in cluster_result_list])))
     
     #prints buffer
     print('[a]', 'Writing to file.\t('+old_target_string+')')
     print(out_buffer)
     
     #writes clustering to file
-    for cluster,results in D.items():
-        for result in results:
+    for cluster, cluster_result_list in D.items():
+        for result in cluster_result_list:
             f.write(topic_id+'.'+str(cluster)+'\t'+topic_id+'.'+str(result)+'\n')
         
     f.close()
@@ -526,7 +602,7 @@ def read_dataset(data_path):
 
 def main():
     
-    # If absinth.py is run in test environment
+    # If absinth.py is run in the test environment, use the test data set.
     if '-t' in sys.argv:
         data_path = config.test
     else:
@@ -534,7 +610,13 @@ def main():
         
     results, topics = read_dataset(data_path)
     
-    with Pool(2) as pool:
+    # Enable manual setting of the process count (-p flag).
+    if '-p' in sys.argv:
+        process_count = int(sys.argv[sys.argv.index('-p') + 1])
+    else:
+        process_count = 1
+    
+    with Pool(process_count) as pool:
         parameter_list = [(topic_id, topic_name, results)
                           for topic_id,topic_name in topics.items()]
         pool.starmap(word_sense_induction, parameter_list)
-- 
GitLab
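
For illustration, a self-contained toy example of the reworked components()
step: zero-weight edges attach the target to every root hub before the minimum
spanning tree is built, so the root hubs become direct children of the target.
The words, hubs and weights below are invented for the example and are not
taken from the corpus.

    import networkx as nx

    toy_graph = nx.Graph()
    toy_graph.add_edge('money', 'loan', weight=0.2)
    toy_graph.add_edge('money', 'deposit', weight=0.3)
    toy_graph.add_edge('river', 'water', weight=0.1)
    toy_graph.add_edge('river', 'shore', weight=0.4)
    toy_graph.add_edge('loan', 'water', weight=0.9)  # Weak cross-sense edge.

    root_hub_list = ['money', 'river']
    target_string = 'bank'

    # Same idea as components(): attach the target to each root hub with
    # weight 0, then take the minimum spanning tree.
    toy_graph.add_node(target_string)
    for root_hub in root_hub_list:
        toy_graph.add_edge(target_string, root_hub, weight=0)

    mst = nx.minimum_spanning_tree(toy_graph)
    print(sorted(mst.edges(data='weight')))

The weak 'loan'-'water' edge is dropped from the tree, so the two sense
regions stay separated under their respective root hubs.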