From a9b0962ebbb4688af93e6d400ff0420db96fe09c Mon Sep 17 00:00:00 2001
From: Victor Zimmermann <zimmermann@cl.uni-heidelberg.de>
Date: Mon, 19 Mar 2018 19:02:09 +0100
Subject: [PATCH] Further commenting.

---
 src/absinth.py | 281 +++++++++++++++++++++++++++++--------------------
 1 file changed, 164 insertions(+), 117 deletions(-)

diff --git a/src/absinth.py b/src/absinth.py
index a082b43..9a33af7 100644
--- a/src/absinth.py
+++ b/src/absinth.py
@@ -1,23 +1,56 @@
 #!/usr/bin/env python3
 
 import sys
-import matplotlib
-matplotlib.use("Agg")
 print('[A] Loading ' + sys.argv[0] + '.\n')
-import os # for reading files
+import config
 import networkx as nx # for visualisation
-from copy import deepcopy
-from nltk.corpus import stopwords
-import numpy as np # for calculations
+import numpy as np
+import os # for reading files
+import pprint
+import random
 import re
 import spacy # for nlp
 from multiprocessing import Pool
-import random
-import matplotlib.pyplot as plt
-import config
+from nltk.corpus import stopwords
+from copy import deepcopy
 
 nlp = spacy.load('en') # standard english nlp
 
+
+def read_dataset(data_path):
+    """Collects topics.txt and results.txt.
+    
+    
+    """
+    
+    results = dict()
+    
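+    # results.txt contains the search results (title and snippet) for every target word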
+    with open(data_path+'results.txt', 'r') as results_file:
+        
+        for line in results_file.readlines()[1:]:
+            
+            l = line.split('\t')
+            id1, _ = l[0].split('.') #the second part of the id is ignored, as it is identical to the list index
+            
+            if id1 not in results:
+                results[id1]=list()
+                
+            results[id1].append(" ".join(l[2:]).strip()) # here I join title and snippet, the URL is ignored
+            
+    
+    # topics.txt is a list of target words
+    topics = dict()
+    
+    with open(data_path+'topics.txt', 'r') as topics_file:
+        
+        for line in topics_file.readlines()[1:]:
+            
+            l = line.split('\t')
+            topics[l[0]] = l[1].strip()
+    
+    return results, topics
+
+
 def frequencies(target_string, search_result_list):
     """Counts occurrences of nodes and cooccurrences.
     
@@ -168,7 +201,10 @@ def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
                         
                         # Add only tokens with allowed tags to nodes.
                         elif token.tag_ in allowed_tag_list:
-                            token_set.add(token.lemma_)
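+                            # depending on config.lemma, use lemmas or raw token text as graph nodes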
+                            if config.lemma:
+                                token_set.add(token.lemma_)
+                            else:
+                                token_set.add(token.text)
                             
                     context_size = len(token_set)
                     
@@ -416,78 +452,96 @@ def score(graph, component, root_hub_list):
 
 
 def induce(topic_name, result_list):
-    """
-
+    """Induces word senses for a given topic from corpus.
+    
+    Counts frequencies from corpus and search result list, builds graph from
+    these counts (with some filters). Root hubs (senses) are collected from
+    this graph.
 
+    Args:
+        topic_name: Target string.
+        result_list: List of search result (context) strings.
+        
+    Returns:
+        graph: Weighted undirected graph built from the frequency counts.
+        root_hub_list: List of root hub strings (senses).
+        stat_dict: Dictionary of various statistics.
+        Returns None instead if an output file for the topic already exists.
     """
     
-    statistics = dict()
-            
-    #removes trailing new_lines
-    old_target_string = topic_name.strip() #original target
+    stat_dict = dict()
     
-    if old_target_string.strip() in [f.replace('.absinth', '') for f in os.listdir(config.output)]:
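+    #skips targets for which an output file already exists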
+    if topic_name in [output_file_name.replace('.absinth', '') 
+                      for output_file_name in os.listdir(config.output)]:
+        
         return None
     
-    statistics['target'] = old_target_string
+    else:
     
-    #in topics longer than two words, the leading 'the' can generally be removed without changing the sense
-    if old_target_string[:4] == 'the_' and old_target_string.count('_') >= 2:
+        stat_dict['target'] = topic_name
         
-        target_string = old_target_string[4:]
+        #in topics longer than two words, the leading 'the' can generally be removed without changing the sense
+        if topic_name[:4] == 'the_' and topic_name.count('_') > 1:
+            
+            target_string = topic_name[4:]
+            
+        else:
+            
+            target_string = topic_name
         
-    else:
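+        #counts occurrences of single words as well as cooccurrences, saves them in dictionaries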
+        print('[a]', 'Counting nodes and edges.\t('+topic_name+')')
+        node_freq_dict, edge_freq_dict = frequencies(target_string, result_list)
         
-        target_string = old_target_string
-    
-    #counts occurences of single words, as well as cooccurrences, saves it in dictionary
-    print('[a]', 'Counting nodes and edges.\t('+old_target_string+')')
-    node_freq_dict, edge_freq_dict = frequencies(target_string, result_list[topic_id])
-    
-    #builds graph from these dictionaries, also applies multiple filters
-    print('[a]', 'Building graph.\t('+old_target_string+')')
-    G = build_graph(node_freq_dict, edge_freq_dict)
-    
-    statistics['node count'] = len(G.nodes)
-    statistics['edge count'] = len(G.edges)
+        #builds graph from these dictionaries, also applies multiple filters
+        print('[a]', 'Building graph.\t('+topic_name+')')
+        graph = build_graph(node_freq_dict, edge_freq_dict)
+        
+        stat_dict['node count'] = len(graph.nodes)
+        stat_dict['edge count'] = len(graph.edges)
 
-    #finds root hubs (senses) within the graph + more filters for these
-    print('[a]', 'Collecting root hubs.\t('+old_target_string+')')
-    H = root_hubs(G, edge_freq_dict)
-    
-    #adds sense inventory to buffer with some common neighbors for context
-    statistics['hubs'] = dict()
-    for h in H:
-        mfn = sorted(G.adj[h], key=lambda x: edge_freq_dict[h,x] if h < x else edge_freq_dict[x, h], reverse=True)[:6]
-        statistics['hubs'][h] = mfn
-    
-    #performs minimum_spanning_tree algorithm on graph
-    print('[a]', 'Building minimum spanning tree.\t('+old_target_string+')')
-    T = components(G, H, target_string)
+        #finds root hubs (senses) within the graph + more filters for these
+        print('[a]', 'Collecting root hubs.\t('+topic_name+')')
+        root_hub_list = root_hubs(graph, edge_freq_dict)
+        
+        #adds sense inventory to buffer with some common neighbors for context
+        stat_dict['hubs'] = dict()
+        
+        for root_hub in root_hub_list:
+        
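+            # edge_freq_dict is keyed with the lexicographically smaller node first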
+            by_frequency = lambda node: edge_freq_dict[root_hub,node] \
+                                         if root_hub < node \
+                                         else edge_freq_dict[node, root_hub]
+                                     
+            most_frequent_neighbor_list = sorted(graph.adj[root_hub],
+                                                 key=by_frequency, reverse=True) 
+            
+            stat_dict['hubs'][root_hub] = most_frequent_neighbor_list[:6]
 
-    return T, H, statistics
+        return graph, root_hub_list, stat_dict
 
 
-def disambiguate(minimum_spanning_tree, root_hub_list,
-                 context_list, target_string):
+def disambiguate(graph, root_hub_list, context_list, topic_name):
     """Matches contexts to senses.
     
-    Adds up scores for each token in a context string and matches the context
-    to the root hub with the highest score.
+    Builds a minimum spanning tree from the graph, adds up scores based on
+    tree node distance for each token in a context string, and matches each
+    context to the root hub with the highest score.
     
     Args:
-        minimum_spanning_tree: Minimum spanning tree with target as root.
+        graph: Weighted undirected graph.
         root_hub_list: List of strings of root hubs (senses).
         context_list: List of sentence strings that are to be clustered.
-        target_string: String of target word, also root of MST.
+        topic_name: String of target word, also root of MST.
     
     Returns:
         mapping_dict: Dictionary of root hubs (senses) as keys and context ids
             as values.
     """
     
-    target_string = target_string.replace('_', ' ')
-    context_list = [context.lower().strip().replace(target_string, '')
+    #performs minimum_spanning_tree algorithm on graph
+    print('[a]', 'Building minimum spanning tree.\t('+topic_name+')')
+    minimum_spanning_tree = components(graph, root_hub_list, topic_name)
+    
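+    #removes the target word itself from every context before scoring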
+    spaced_topic_name = topic_name.replace('_', ' ')
+    context_list = [context.lower().strip().replace(spaced_topic_name, '')
                     for context in context_list]
     
     score_dict = dict() #memoisation for scores
@@ -505,23 +559,27 @@ def disambiguate(minimum_spanning_tree, root_hub_list,
         idx += 1 #index based on position in list
     
         processed_context = nlp(context)
-        text_list = [token.text for token in processed_context] #tokens
+        
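+        # use lemmas or raw text so the tokens match the nodes of the graph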
+        if config.lemma:
+            token_list = [token.lemma_ for token in processed_context] #lemmatised tokens
+        else:
+            token_list = [token.text for token in processed_context] #raw tokens
         
         score_array = np.zeros(len(root_hub_list)) #initialise with zeros for every sense
         
-        for text in text_list:
+        for token in token_list:
             
-            if text in minimum_spanning_tree.nodes: #if word wasn't filtered out
+            if token in minimum_spanning_tree.nodes: #if word wasn't filtered out
                 
-                if text in score_dict: #memoisation
+                if token in score_dict: #memoisation
                         
-                    new_scores = score_dict[text]
+                    new_score = score_dict[token]
                 
                 else:
                     
                     new_score = score(minimum_spanning_tree, 
-                                      text, root_hub_list)
-                    score_dict[text] = new_score #memoisation
+                                      token, root_hub_list)
+                    score_dict[token] = new_score #memoisation
                     
                 score_array += new_score
             
@@ -546,77 +604,66 @@ def disambiguate(minimum_spanning_tree, root_hub_list,
     return mapping_dict
 
 
-def main(topic_id, topic_name, result_list):
-    """
-
+def main(topic_id, topic_name, result_dict):
+    """Calls induction and disambiguation functions, performs main task.
 
+    The task is to both induce senses and match search results to them. This
+    function calls induce() and disambiguate() in turn to perform these
+    subtasks. The result is then written to the output directory specified
+    in config.py.
+    
+    Args:
+        topic_id: Index of topic in topics.txt.
+        topic_name: Target string.
+        result_dict: Dictionary with topic ids as keys and lists of search
+            results (from results.txt) as values.
+            
+    Returns:
+        None
     """
     
     print('[a]', 'Inducing word senses for {}.'.format(topic_name))
-    T, H, statistics = induce(topic_name, result_list)
+    graph, root_hub_list, stat_dict = induce(topic_name,
+                                              result_dict[topic_id])
     
     #matches senses to clusters
-    print('[a]', 'Disambiguating result_list.\t('+old_target_string+')')
-    D = disambiguate(T, H, result_list[topic_id], target_string)
+    print('[a]', 'Disambiguating search results.\t('+topic_name+')')
+    mapping_dict = disambiguate(graph, root_hub_list,
+                                result_dict[topic_id], topic_name)
     
     #collect statistics from result.
     cluster_count = 0
     cluster_length_list = list()
-    for cluster,result_list in D.items():
+    
+    for cluster,result_list in mapping_dict.items():
+        
         cluster_length = len(result_list)
+        
         if cluster_length != 0:
+            
             cluster_count += 1
             cluster_length_list.append(cluster_length)
-    statistics['mean_cluster_length'] = np.mean(cluster_length_list)
-    statistics['cluster_count'] = cluster_count
+            
+    stat_dict['mean_cluster_length'] = np.mean(cluster_length_list)
+    stat_dict['cluster_count'] = cluster_count
 
-    #prints buffer
-    print('[a]', 'Writing to file.\t('+old_target_string+')')
+    print('[a]', 'Writing to file.\t('+topic_name+')')
     
+    output_path = config.output
+    output_file_name = output_path+topic_name+'.absinth'
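+    #one .absinth file is written per target word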
+    
+    with open(output_file_name, 'w') as output_file:
 
-    f = open(output_path+old_target_string+'.absinth', 'w')
-
-    f.write('subTopicID\tresultID\n')
+        output_file.write('subTopicID\tresultID\n')
 
-    #writes clustering to file
-    for cluster,result_list in D.items():
-        for result in result_list:
-            f.write(topic_id+'.'+str(cluster)+'\t'+topic_id+'.'+str(result)+'\n')
-        
-    f.close()
-        
-
-def read_dataset(data_path):
-    
-    # results.txt includes the queries for a given target word
-    results = dict()
-    
-    with open(data_path+'results.txt', 'r') as results_file:
-        
-        for line in results_file.readlines()[1:]:
-            
-            l = line.split('\t')
-            id1, _ = l[0].split('.') #the second part of the id is ignored, as it is identical to the list index
-            
-            if id1 not in results:
-                results[id1]=list()
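+        #writes the clustering to file, one line per result (subTopicID, resultID)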
+        for cluster_id,result_list in mapping_dict.items():
+            for result_id in result_list:
+                output_line = '{}.{}\t{}.{}\n'.format(topic_id, cluster_id,
+                                                      topic_id, result_id)
+                output_file.write(output_line)
                 
-            results[id1].append(" ".join(l[2:])) # here I join title and snippet, the URL is ignored
-            
-    
-    # topics.txt is a list of target words
-    topics = dict()
-    
-    with open(data_path+'topics.txt', 'r') as topics_file:
-        
-        for line in topics_file.readlines()[1:]:
-            
-            l = line.split('\t')
-            topics[l[0]] = l[1]
+    pprint.pprint(stat_dict)
     
-    return results, topics
-
-
 
 
 if __name__ == '__main__':
@@ -626,7 +673,7 @@ if __name__ == '__main__':
     else:
         data_path = config.dataset
         
-    results, topics = read_dataset(data_path)
+    result_dict, topic_dict = read_dataset(data_path)
     
     # Enables manual setting of process count.
     if '-p' in sys.argv:
@@ -635,8 +682,8 @@ if __name__ == '__main__':
         process_count = 1
     
     with Pool(process_count) as pool:
-        parameter_list = [(topic_id, topic_name, results)
-                          for topic_id,topic_name in topics.items()]
+        parameter_list = [(topic_id, topic_name, result_dict)
+                          for topic_id,topic_name in topic_dict.items()]
         pool.starmap(main, parameter_list)
         
     #for topic_id,topic_name in topics.items():
-- 
GitLab