Absinth - A Small World of Semantic Similarity

Commit 565b5233, authored 7 years ago by Victor Zimmermann
Reimplement components() + more comment reform.
Parent: 657eb8e5
1 changed file: src/absinth.py (194 additions, 112 deletions)

@@ -44,7 +44,7 @@ def frequencies(target_string, search_result_list):
     bracketed_target_string = '(' + target_string + ')'
 
-    # Remove unnecessary tokens from snippets
+    # Remove unnecessary tokens from snippets.
     _search_result_list = list()
     for r in search_result_list:
         r = r.replace('<b>', '')
@@ -53,13 +53,12 @@ def frequencies(target_string, search_result_list):
         r = r.strip()
         _search_result_list.append(r)
 
-    #initialises frequencies with counts from results
+    # Initialise frequencies with counts from results.
     node_freq_dict, edge_freq_dict = process_file(_search_result_list, target_string, dict(), dict())
 
-    #names of corpus files
     corpus_file_path_list = [corpus_path + f for f in os.listdir(corpus_path)]
     corpus_size = len(corpus_file_path_list)
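(For orientation: process_file returns plain count dictionaries keyed by single tokens and by alphabetically ordered token pairs. A minimal sketch of their shape, with made-up values for illustration only:

node_freq_dict = {'bottle': 5, 'drink': 12, 'green': 7}          # token -> occurrence count
edge_freq_dict = {('bottle', 'drink'): 3, ('drink', 'green'): 4}  # ordered pair -> cooccurrence count

The pair keys are stored with the lexicographically smaller token first, which is why later code looks up edge_freq_dict[a, b] if a < b else edge_freq_dict[b, a].)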
@@ -69,7 +68,7 @@ def frequencies(target_string, search_result_list):
         node_count = len(node_freq_dict)
         edge_count = len(edge_freq_dict)
 
-        #prints update after every 11th of the corpus is parsed
+        # Print update after every 11th of the corpus is parsed.
         if processed_file_count % int(corpus_size/11) == 0:
 
             file_ratio = processed_file_count / corpus_size
@@ -78,7 +77,7 @@ def frequencies(target_string, search_result_list):
             ratios = [file_ratio, max_node_ratio, max_edge_ratio]
 
-            #uses the ratio closest to 100%.
+            # Use ratio closest to 100%.
             highest_ratio = int((max(ratios))*100)
 
             print('[a] ~{:02d}%\tNodes: {}\tEdges: {}\t{}.'.format(highest_ratio,
@@ -86,7 +85,6 @@ def frequencies(target_string, search_result_list):
                                                                    edge_count,
                                                                    bracketed_target_string))
 
-        #checks maximum node values
         if node_count > max_node_count:
             print('[a] 100%\tNodes: {}\tEdges: {}\t{}.'.format(node_count,
                                                                edge_count,
@@ -148,11 +146,11 @@ def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
             for context in context_list:
 
                 context = context.lower()
 
-                if spaced_target_string in context: # greedy preselection, not perfect
+                if spaced_target_string in context: # Greedily pre-select lines.
 
-                    token_set = set() #set of node candidates
+                    token_set = set()
 
-                    # This replacement allows target to be treated as single entity.
+                    # Allow target to be treated as single entity.
                     context = context.replace(spaced_target_string, target_string)
                     processed_context = nlp(context)
@@ -160,15 +158,15 @@ def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
                     for token in processed_context:
 
-                        # doesn't add target word to nodes
+                        # Do not add target word to nodes.
                         if token.text == target_string:
                             pass
 
-                        # doesn't add stop words to nodes
+                        # Do not add stop words to nodes.
                         elif token.text in stopword_list:
                             pass
 
-                        # only adds tokens with allowed tags to nodes
+                        # Add only tokens with allowed tags to nodes.
                         elif token.tag_ in allowed_tag_list:
                             token_set.add(token.text)
@@ -190,14 +188,14 @@ def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
                         else:
                             edge_freq_dict[edge] = 1
 
-        # if a file is corrupted (can't always be catched with if-else)
+        # If file is corrupted (can't always be caught with if-else), ignore file.
         except UnicodeDecodeError:
             pass
 
     return node_freq_dict, edge_freq_dict
 
 
 #build graph from frequency dictionaries
 def build_graph(node_freq_dict, edge_freq_dict):
     """Builds undirected weighted graph from dictionaries.
@@ -221,13 +219,11 @@ def build_graph(node_freq_dict, edge_freq_dict):
     cooccurence_graph = nx.Graph()
 
-    #node : node frequency
     for node, frequency in node_freq_dict.items():
         if frequency >= min_node_freq:
             cooccurence_graph.add_node(node)
 
-    #edge : edge frequency
     for node_tuple, frequency in edge_freq_dict.items():
         if frequency < min_edge_freq:
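(The two loops above only admit sufficiently frequent nodes and edges into the cooccurrence graph. A minimal standalone sketch of that filtering, with made-up counts and assumed thresholds; the module reads min_node_freq and min_edge_freq from its config and computes the real edge weights from the counts:

import networkx as nx

node_freq_dict = {'bottle': 1, 'drink': 12, 'green': 7}           # illustrative counts only
edge_freq_dict = {('bottle', 'drink'): 1, ('drink', 'green'): 4}
min_node_freq, min_edge_freq = 2, 2                                # assumed thresholds

cooccurence_graph = nx.Graph()
for node, frequency in node_freq_dict.items():
    if frequency >= min_node_freq:
        cooccurence_graph.add_node(node)           # keep frequent tokens
for node_tuple, frequency in edge_freq_dict.items():
    if frequency < min_edge_freq:
        continue                                   # drop rare cooccurrences
    if all(node in cooccurence_graph for node in node_tuple):
        # Placeholder weight; the module derives weights from the frequencies.
        cooccurence_graph.add_edge(*node_tuple, weight=0.5)

print(list(cooccurence_graph.edges(data=True)))    # [('drink', 'green', {'weight': 0.5})]
)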
@@ -265,166 +261,246 @@ def build_graph(node_freq_dict, edge_freq_dict):
     return cooccurence_graph
 
 
-#Identifies senses by choosing nodes with high degrees
-def root_hubs(graph, edge_freq_dict, min_neighbors=4, theshold=0.8):
+def root_hubs(graph, edge_freq_dict):
+    """Identifies senses (root hubs) by choosing nodes with high degrees.
+
+    Selects root hubs according to the algorithm in Véronis (2004). Nodes with
+    high degree and neighbors with low weights (high cooccurrence) are chosen
+    until there are no more viable candidates. A root hub candidate is every
+    node that is not already a hub and is not a neighbor of one.
+
+    Args:
+        graph: Weighted undirected graph.
+        edge_freq_dict: Dictionary of weights for every tuple in our graph.
+
+    Returns:
+        hub_list: List of root hubs, i.e. strings that are selected using the
+            algorithm explained above.
+    """
+
+    min_neighbors = config.min_neighbors
+    threshold = config.threshold
 
-    G = deepcopy(graph)
-    V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True) # sorts according to degree
-    H = list() #output list
+    # Allow operations on graph without altering original one.
+    graph_copy = deepcopy(graph)
+
+    # Sort according to degree (number of neighbors).
+    candidate_list = sorted(graph_copy.nodes,
+                            key=lambda node: graph_copy.degree[node],
+                            reverse=True)
+
+    hub_list = list()
 
-    while V:
+    # While there are still candidates, search for root hubs.
+    while candidate_list:
 
-        v = V[0] #best hub candidate
+        candidate = candidate_list[0] #best hub candidate
 
-        if G.degree[v] >= min_neighbors:
+        if graph_copy.degree[candidate] >= min_neighbors:
 
-            mfn = sorted(G.adj[v], key=lambda key: edge_freq_dict[v,key] if v < key else edge_freq_dict[key,v], reverse=True)[:min_neighbors] #most frequent neighbors
+            by_frequency = lambda node: edge_freq_dict[candidate,node] \
+                                        if candidate < node \
+                                        else edge_freq_dict[node,candidate]
+
+            most_frequent_neighbor_list = sorted(graph_copy.adj[candidate],
+                                                 key=by_frequency,
+                                                 reverse=True)[:min_neighbors]
 
-            if np.mean([G.edges[v,n]['weight'] for n in mfn]) < theshold: #if the median weight of the most frequent neighbors is under threshold
+            # If the most frequent neighbors cooccur frequently enough with the
+            # candidate (mean weight under threshold), the candidate is approved.
+            if np.mean([graph_copy.edges[candidate,node]['weight']
+                        for node in most_frequent_neighbor_list]) < threshold:
 
-                H.append(v)
+                # Add candidate as root hub.
+                hub_list.append(candidate)
 
-                #removes neighbors of new hub as hub candidates
-                for nbr in deepcopy(G).adj[v]:
-                    G.remove_node(nbr)
+                # Remove neighbors of new hub as hub candidates.
+                for neighbor in deepcopy(graph_copy).adj[candidate]:
+                    graph_copy.remove_node(neighbor)
 
-            #removes hub candidate
-            G.remove_node(v)
+            # Remove hub candidate.
+            graph_copy.remove_node(candidate)
 
-            #reorderd potential hubs after deletions
-            V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True)
+            # Reorder potential hubs after deletions.
+            candidate_list = sorted(graph_copy.nodes,
+                                    key=lambda node: graph_copy.degree[node],
+                                    reverse=True)
 
         else:
 
-            return H
+            return hub_list
 
-    return H
+    return hub_list
 
 
-#Components algorithm from Véronis (2004), converts graph for target into a MST
-def components(graph, hubs, target_string):
+def components(graph, root_hub_list, target_string):
+    """Builds minimum spanning tree from graph and removes singletons.
 
-    G = deepcopy(graph)
-    H = hubs #root hubs
-    t = target_string
+    Applies components algorithm from Véronis (2004) and removes singletons.
 
-    #G.add_node(t)
-    #for h in H:
-        #G.add_edge(t,h,weight=0)
+    Args:
+        graph: Undirected weighted graph.
+        root_hub_list: List of strings of root hubs of graph.
+        target_string: Root of minimum spanning tree.
 
-    T = nx.minimum_spanning_tree(G)
+    Returns:
+        minimum_spanning_tree: Minimum spanning tree with target as root and
+            root hubs as direct children. Singletons removed.
+    """
+
+    graph_copy = deepcopy(graph)
+
+    graph_copy.add_node(target_string)
+    for root_hub in root_hub_list:
+        graph_copy.add_edge(target_string, root_hub, weight=0)
+
+    minimum_spanning_tree = nx.minimum_spanning_tree(graph_copy)
 
-    #removes singletons
-    for node in deepcopy(T).nodes:
-        if len(T.adj[node]) == 0:
-            T.remove_node(node)
+    # Remove singletons, deepcopy for iteration while being altered.
+    for node in deepcopy(minimum_spanning_tree).nodes:
+        if len(minimum_spanning_tree.adj[node]) == 0:
+            minimum_spanning_tree.remove_node(node)
 
-    return T
+    return minimum_spanning_tree
 
 
-#Calculates score for a given path in a minimum spanning tree
-def score(graph, from_node, to_node):
+def score(graph, component, root_hub_list):
+    """Calculate score for a given component in a minimum spanning tree.
 
-    #if correct tree
-    if nx.has_path(graph, from_node, to_node):
+    First the correct root for the component is chosen. If no root hub is
+    suitable, an empty array is returned. A score is calculated from the
+    distance between the component and its root and returned as part of an
+    array otherwise filled with zeroes.
 
-        # calculates shortest path (approximation for path with lowest total weight)
-        path = nx.shortest_path(graph, from_node, to_node, 'weight')
-        total_weight = 0
+    Args:
+        graph: Minimum spanning tree.
+        component: Node (string) from which the distances are to be calculated.
+        root_hub_list: List of strings of root hubs (senses) of original graph.
 
-        #adds weights of every sub-path
-        for i in range(1, len(path)):
-            sub_from, sub_to = path[i-1], path[i]
-            total_weight += graph[sub_from][sub_to]['weight']
+    Returns:
+        score_array: Array with one score for the correct root hub, otherwise
+            filled with zeroes.
+    """
 
-        #the further the path, the lower the score
-        return 1/(1+total_weight)
+    root_hub_count = len(root_hub_list)
 
-    else:
-        return 0
+    # Initialise score array.
+    score_array = np.zeros(root_hub_count)
+
+    # Find root of component.
+    distance_list = list()
+    for root_hub in root_hub_list:
+        if nx.has_path(graph, component, root_hub):
+            distance_list.append(1/(1+len(nx.shortest_path(graph, component, root_hub))))
+        else:
+            distance_list.append(0)
+
+    if sum(distance_list) == 0:
+        return score_array
+
+    root_idx = np.argmax(distance_list)
+    root = root_hub_list[root_idx]
+
+    shortest_path = nx.shortest_path(graph, component, root, 'weight')
+    total_weight = 0
+
+    # Add weights of every sub-path.
+    for i in range(1, len(shortest_path)):
+        sub_from, sub_to = shortest_path[i-1], shortest_path[i]
+        total_weight += graph[sub_from][sub_to]['weight']
+
+    score_array[root_idx] = 1/(1+total_weight)
+
+    return score_array
 
 
-# Basically Word Sense Disambiguation, matches context to sense
-def disambiguate(mst, hubs, contexts, target_string):
+def disambiguate(minimum_spanning_tree, root_hub_list, context_list, target_string):
+    """Matches contexts to senses.
+
+    Adds up scores for each token in a context string and matches the context
+    to the root hub with the highest score.
+
+    Args:
+        minimum_spanning_tree: Minimum spanning tree with target as root.
+        root_hub_list: List of strings of root hubs (senses).
+        context_list: List of sentence strings that are to be clustered.
+        target_string: String of target word, also root of MST.
+
+    Returns:
+        mapping_dict: Dictionary of root hubs (senses) as keys and context ids
+            as values.
+    """
 
     target_string = target_string.replace('_', ' ')
-    T = mst #minimum spanning tree
-    H = hubs #root hubs
-    C = [c.lower().strip().replace(target_string, '') for c in contexts] #cleaned up contexts
+
+    context_list = [context.lower().strip().replace(target_string, '')
+                    for context in context_list]
 
     score_dict = dict() #memoisation for scores
-    mapping_dict = {topic:[] for topic in range(1, len(H)+1)} #output of function
+    mapping_dict = {topic:[] for topic in range(1, len(root_hub_list)+1)}
 
-    #if no sense is found for a target word, we should assume that there only is one sense
-    if len(H) == 0:
-        return {0:[i for i in range(1, len(C)+1)]}
+    # If no sense is found for a target word, assume there is only one sense.
+    if len(root_hub_list) == 0:
+        return {0:[i for i in range(1, len(context_list)+1)]}
 
     idx = 0
-    for c in C:
+    for context in context_list:
 
         idx += 1 #index based on position in list
 
-        doc = nlp(c) #parsed context
-        texts = [tok.text for tok in doc] #tokens
+        processed_context = nlp(context)
+        text_list = [token.text for token in processed_context] #tokens
 
-        scores = np.zeros(len(H)) #initialise with zeros for every sense
+        score_array = np.zeros(len(root_hub_list)) #initialise with zeros for every sense
 
-        for text in texts:
+        for text in text_list:
 
-            if text in T.nodes: #if word wasn't filtered out
-                new_scores = list() #scores to be added to total scores
+            if text in minimum_spanning_tree.nodes: #if word wasn't filtered out
 
-                for h in H: #for each hub
-                    if (text, h) in score_dict: #memoisation
-                        new_scores.append(score_dict[(text,h)])
-                    else:
-                        new_score = score(T, text, h)
-                        new_scores.append(new_score)
-                        score_dict[(text,h)] = new_score #memoisation
+                if text in score_dict: #memoisation
+                    new_score = score_dict[text]
+                else:
+                    new_score = score(minimum_spanning_tree, text, root_hub_list)
+                    score_dict[text] = new_score #memoisation
 
-                scores = scores + np.array(new_scores)
+                score_array += new_score
 
             else:
                 pass
 
-        # if the disambiguator could not detect a sense, it should return a singleton, ie. nothing
-        if np.max(scores) == 0:
+        # If disambiguator does not detect a sense, return singleton.
+        if np.max(score_array) == 0:
             pass
 
         else:
-            #applies sense with the highest score to context
-            max_score = np.max(scores)
-            argmax_score = np.argmax(scores)
+            # Apply sense with the highest score to context.
+            max_score = np.max(score_array)
+            argmax_score = np.argmax(score_array)
 
-            #clusters begin at 1
+            # Clusters begin at 1.
             mapping_dict[argmax_score+1].append(idx)
 
     return mapping_dict
 
 
-# our main function, here the main stepps for word sense induction are called
-def word_sense_induction(topic_id, topic_name, results):
+def word_sense_induction(topic_id, topic_name, result_list):
 
     #buffer for useful information
     out_buffer = '\n'
 
-    #path for output(directory)
-    output_path = './test/' # config.output
+    output_path = config.output
 
     #removes trailing new_lines
     old_target_string = topic_name.strip() #original target
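(Since this commit activates the previously commented-out pseudo-root edges in components(), a small self-contained sketch may help: it mirrors the hub criterion, the MST step and the path-based score on a toy graph with made-up weights and assumed thresholds; the real values come from the module's config and corpus counts.

import networkx as nx
import numpy as np

# Toy cooccurrence graph; low weight = strong cooccurrence (illustrative values).
graph = nx.Graph()
graph.add_weighted_edges_from([
    ('glass', 'bottle', 0.2), ('glass', 'wine', 0.3), ('bottle', 'wine', 0.4),
    ('lens', 'camera', 0.2), ('lens', 'focus', 0.3), ('camera', 'focus', 0.5)])

min_neighbors, threshold = 2, 0.8   # assumed values, normally read from config

# root_hubs() criterion: a high-degree candidate is approved if the mean weight
# to its most frequent neighbors stays under the threshold (here the first two
# neighbors stand in for the frequency-sorted ones).
candidate = 'glass'
neighbors = list(graph.adj[candidate])[:min_neighbors]
print(np.mean([graph.edges[candidate, n]['weight'] for n in neighbors]) < threshold)

# components(): attach the target to every root hub with weight 0, then take the MST.
root_hub_list = ['glass', 'lens']
graph.add_node('target')
for root_hub in root_hub_list:
    graph.add_edge('target', root_hub, weight=0)
minimum_spanning_tree = nx.minimum_spanning_tree(graph)

# score()-style distance: the longer the weighted path to the root, the lower the score.
path = nx.shortest_path(minimum_spanning_tree, 'wine', 'glass', 'weight')
total_weight = sum(minimum_spanning_tree[path[i-1]][path[i]]['weight']
                   for i in range(1, len(path)))
print(1/(1+total_weight))   # ~0.77 for the toy weights above
)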
@@ -449,7 +525,7 @@ def word_sense_induction(topic_id, topic_name, results):
     #counts occurences of single words, as well as cooccurrences, saves it in dictionary
     print('[a]', 'Counting nodes and edges.\t('+old_target_string+')')
-    node_freq_dict, edge_freq_dict = frequencies(target_string, results[topic_id])
+    node_freq_dict, edge_freq_dict = frequencies(target_string, result_list[topic_id])
 
     #builds graph from these dictionaries, also applies multiple filters
     print('[a]', 'Building graph.\t('+old_target_string+')')
@@ -474,20 +550,20 @@ def word_sense_induction(topic_id, topic_name, results):
     T = components(G, H, target_string)
 
     #matches senses to clusters
-    print('[a]', 'Disambiguating results.\t('+old_target_string+')')
-    D = disambiguate(T, H, results[topic_id], target_string)
+    print('[a]', 'Disambiguating result_list.\t('+old_target_string+')')
+    D = disambiguate(T, H, result_list[topic_id], target_string)
 
     out_buffer += ('[A] Mapping: \n')
-    for cluster,results in D.items():
-        out_buffer += ('{}. : {}\n'.format(cluster, ', '.join([str(r) for r in results])))
+    for cluster,result_list in D.items():
+        out_buffer += ('{}. : {}\n'.format(cluster, ', '.join([str(r) for r in result_list])))
 
     #prints buffer
     print('[a]', 'Writing to file.\t('+old_target_string+')')
     print(out_buffer)
 
     #writes clustering to file
-    for cluster,results in D.items():
-        for result in results:
+    for cluster,result_list in D.items():
+        for result in result_list:
             f.write(topic_id+'.'+str(cluster)+'\t'+topic_id+'.'+str(result)+'\n')
 
     f.close()
@@ -526,7 +602,7 @@ def read_dataset(data_path):
 def main():
 
-    # If absinth.py is run in test environment
+    # If absinth.py is run in test environment.
     if '-t' in sys.argv:
         data_path = config.test
     else:
@@ -534,7 +610,13 @@ def main():
     results, topics = read_dataset(data_path)
 
-    with Pool(2) as pool:
+    # Enables manual setting of process count.
+    if '-p' in sys.argv:
+        process_count = int(sys.argv[sys.argv.index('-p') + 1])
+    else:
+        process_count = 1
+
+    with Pool(process_count) as pool:
 
         parameter_list = [(topic_id, topic_name, results) for topic_id, topic_name in topics.items()]
         pool.starmap(word_sense_induction, parameter_list)
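(The new -p handling can be checked in isolation. A minimal sketch with an assumed argument vector; in the real script the values come from sys.argv, e.g. an invocation like python absinth.py -t -p 4:

# Simulated command line for illustration.
argv = ['absinth.py', '-t', '-p', '4']

if '-p' in argv:
    process_count = int(argv[argv.index('-p') + 1])
else:
    process_count = 1

print(process_count)   # 4
)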