From a2e527eb33170a4abbd731b7807a7b0f00979784 Mon Sep 17 00:00:00 2001
From: zimmermann <zimmermann@cl.uni-heidelberg.de>
Date: Sun, 18 Mar 2018 19:50:13 +0100
Subject: [PATCH] Commenting, restructuring

---
 src/absinth.py | 148 +++++++++++++++++++++++++++----------------------
 1 file changed, 81 insertions(+), 67 deletions(-)

diff --git a/src/absinth.py b/src/absinth.py
index f9bbc22..a082b43 100644
--- a/src/absinth.py
+++ b/src/absinth.py
@@ -415,6 +415,59 @@ def score(graph, component, root_hub_list):
     return score_array
 
 
+def induce(topic_id, topic_name, result_list):
+    """Induces word senses for one topic from a cooccurrence graph.
+    
+    Returns:
+        The minimum spanning tree, the list of root hubs and a statistics
+        dictionary, or None if the topic was already processed.
+    """
+    
+    statistics = dict()
+    
+    #removes leading and trailing whitespace
+    old_target_string = topic_name.strip() #original target
+    
+    #skips topics for which an output file already exists
+    if old_target_string in [f.replace('.absinth', '') for f in os.listdir(config.output)]:
+        return None
+    
+    statistics['target'] = old_target_string
+    
+    #in topics of three or more words, a leading 'the' can generally be removed without changing the sense
+    if old_target_string[:4] == 'the_' and old_target_string.count('_') >= 2:
+        
+        target_string = old_target_string[4:]
+        
+    else:
+        
+        target_string = old_target_string
+    
+    statistics['transformed'] = target_string
+    
+    #counts occurrences of single words as well as cooccurrences and saves them in dictionaries
+    print('[a]', 'Counting nodes and edges.\t('+old_target_string+')')
+    node_freq_dict, edge_freq_dict = frequencies(target_string, result_list[topic_id])
+    
+    #builds graph from these dictionaries, also applies multiple filters
+    print('[a]', 'Building graph.\t('+old_target_string+')')
+    G = build_graph(node_freq_dict, edge_freq_dict)
+    
+    statistics['node count'] = len(G.nodes)
+    statistics['edge count'] = len(G.edges)
+
+    #finds root hubs (senses) within the graph + more filters for these
+    print('[a]', 'Collecting root hubs.\t('+old_target_string+')')
+    H = root_hubs(G, edge_freq_dict)
+    
+    #adds the sense inventory with some common neighbors for context to the statistics
+    statistics['hubs'] = dict()
+    for h in H:
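+        #edge frequencies are keyed by alphabetically ordered node pairs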
+        mfn = sorted(G.adj[h], key=lambda x: edge_freq_dict[h,x] if h < x else edge_freq_dict[x, h], reverse=True)[:6]
+        statistics['hubs'][h] = mfn
+    
+    #performs minimum_spanning_tree algorithm on graph
+    print('[a]', 'Building minimum spanning tree.\t('+old_target_string+')')
+    T = components(G, H, target_string)
+
+    return T, H, statistics
+
+
 def disambiguate(minimum_spanning_tree, root_hub_list,
                  context_list, target_string):
     """Matches contexts to senses.
@@ -493,82 +546,46 @@ def disambiguate(minimum_spanning_tree, root_hub_list,
     return mapping_dict
 
 
-# our main function, here the main stepps for word sense induction are called
-def word_sense_induction(topic_id, topic_name, result_list):
-    
-    #buffer for useful information
-    out_buffer = '\n'
-    
-    #path for output(directory)
-    output_path = config.output
-            
-    #removes trailing new_lines
-    old_target_string = topic_name.strip() #original target
-    
-    if old_target_string.strip() in [f.replace('.absinth', '') for f in os.listdir(config.output)]:
-        return None
-    
-    out_buffer += ("[A] Word sense induction for '"+old_target_string+"':\n")
-    
-    #in topics longer than two words, the leading 'the' can generally be removed without changing the sense
-    if old_target_string[:4] == 'the_' and old_target_string.count('_') >= 2:
-        
-        target_string = old_target_string[4:]
-        
-    else:
-        
-        target_string = old_target_string
-    
-    #writes headline for output files
-    f = open(output_path+old_target_string+'.absinth', 'w')
-    f.write('subTopicID\tresultID\n')
-    
-    #counts occurences of single words, as well as cooccurrences, saves it in dictionary
-    print('[a]', 'Counting nodes and edges.\t('+old_target_string+')')
-    node_freq_dict, edge_freq_dict = frequencies(target_string, result_list[topic_id])
-    
-    #builds graph from these dictionaries, also applies multiple filters
-    print('[a]', 'Building graph.\t('+old_target_string+')')
-    G = build_graph(node_freq_dict, edge_freq_dict)
-    out_buffer += '[A] Nodes: {}\tEdges: {}\n'.format(str(len(G.nodes)), str(len(G.edges)))
-    
-    #finds root hubs (senses) within the graph + more filters for these
-    print('[a]', 'Collecting root hubs.\t('+old_target_string+')')
-    H = root_hubs(G, edge_freq_dict)
-    out_buffer += '[A] Root hubs:\n'
+def main(topic_id, topic_name, result_list):
+    """Induces the senses of one topic, disambiguates its results and
+    writes the resulting clustering to an output file.
+    """
     
-    #adds sense inventory to buffer with some common neighbors for context
-    i = 1 #sense index
-    for h in H:
-        
-        mfn = sorted(G.adj[h], key=lambda x: edge_freq_dict[h,x] if h < x else edge_freq_dict[x, h], reverse=True)[:6]
-        out_buffer += (' {}. {}: {}\n'.format(i, h, ', '.join(mfn)))
-        i += 1
+    print('[a]', 'Inducing word senses for {}.'.format(topic_name))
+    induction_result = induce(topic_id, topic_name, result_list)
+    
+    #induce() returns None if the topic was already processed
+    if induction_result is None:
+        return None
+    
+    T, H, statistics = induction_result
+    old_target_string = statistics['target'] #original target
+    target_string = statistics['transformed'] #target without a leading 'the'
     
-    #performs minimum_spanning_tree algorithm on graph
-    print('[a]', 'Building minimum spanning tree.\t('+old_target_string+')')
-    T = components(G, H, target_string)
-
     #matches senses to clusters
     print('[a]', 'Disambiguating result_list.\t('+old_target_string+')')
     D = disambiguate(T, H, result_list[topic_id], target_string)
     
-    out_buffer += ('[A] Mapping: \n')
+    #collects statistics from the clustering
+    cluster_count = 0
+    cluster_length_list = list()
     for cluster,result_list in D.items():
-        out_buffer += (' {}. : {}\n'.format(cluster, ', '.join([str(r) for r in result_list])))
-    
+        cluster_length = len(result_list)
+        if cluster_length != 0:
+            cluster_count += 1
+            cluster_length_list.append(cluster_length)
+    
+    #np.mean of an empty list is nan, so topics without any non-empty cluster are guarded
+    statistics['mean_cluster_length'] = np.mean(cluster_length_list) if cluster_length_list else 0
+    statistics['cluster_count'] = cluster_count
+
-    #prints buffer
     print('[a]', 'Writing to file.\t('+old_target_string+')')
-    print(out_buffer)
     
+    #writes headline for output file
+    f = open(config.output+old_target_string+'.absinth', 'w')
+    f.write('subTopicID\tresultID\n')
+
     #writes clustering to file
     for cluster,result_list in D.items():
         for result in result_list:
             f.write(topic_id+'.'+str(cluster)+'\t'+topic_id+'.'+str(result)+'\n')
         
     f.close()
-    
-    
+
+
 def read_dataset(data_path):
     
     # results.txt includes the queries for a given target word
@@ -600,8 +617,9 @@ def read_dataset(data_path):
     return results, topics
 
 
-def main():
-    
+
+
+if __name__ == '__main__':
     # If absinth.py is run in test environment.
     if '-t' in sys.argv:
         data_path = config.test
@@ -619,11 +637,7 @@ def main():
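+    #disambiguates the topics in parallel, one worker task per topic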
     with Pool(process_count) as pool:
         parameter_list = [(topic_id, topic_name, results)
                           for topic_id,topic_name in topics.items()]
-        pool.starmap(word_sense_induction, parameter_list)
+        pool.starmap(main, parameter_list)
         
-    #for topic_id,topic_name in topics.items():
-       #word_sense_induction(topic_id,topic_name, results)
+    #sequential fallback for debugging:
+    #for topic_id,topic_name in topics.items():
+        #main(topic_id, topic_name, results)
-
-
-if __name__ == '__main__':
-    main()
-- 
GitLab