Commenting, restructuring

a2e527eb · Victor Zimmermann · 6ba4aad4 · a2e527eb
Commit a2e527eb authored 7 years ago by Victor Zimmermann
--- a/src/absinth.py
+++ b/src/absinth.py
@@ -415,6 +415,59 @@ def score(graph, component, root_hub_list):
    return score_array


+def induce(topic_name, result_list, topic_id=None):
+    """Induce the word senses of a topic and collect statistics on the way.
+
+    Counts node and edge frequencies with `frequencies`, builds a graph
+    from them with `build_graph`, collects its root hubs with `root_hubs`
+    and finally hands graph and hubs to `components`.
+
+    Args:
+        topic_name: Name of the topic. Surrounding whitespace is stripped;
+            for processing, a leading 'the_' is dropped if the name
+            contains at least two underscores.
+        result_list: Results passed on to `frequencies`. If `topic_id` is
+            given, `result_list[topic_id]` is passed on instead.
+        topic_id: Optional key into `result_list`.
+
+    Returns:
+        None if `config.output` already contains an '.absinth' file for
+        the stripped topic name. Otherwise a tuple `(T, H, statistics)`:
+        the result of `components`, the root hubs, and a dict with the
+        keys 'target', 'node count', 'edge count' and 'hubs' (each hub
+        mapped to at most six of its neighbours, ordered by descending
+        edge frequency).
+    """
+    
+    #original target without surrounding whitespace (e.g. trailing newlines)
+    old_target_string = topic_name.strip()
+    
+    #topics that already have an output file are skipped
+    finished_topics = {f.replace('.absinth', '') for f in os.listdir(config.output)}
+    if old_target_string in finished_topics:
+        return None
+    
+    statistics = dict()
+    statistics['target'] = old_target_string
+    
+    #in topics longer than two words, the leading 'the' can generally be removed without changing the sense
+    if old_target_string[:4] == 'the_' and old_target_string.count('_') >= 2:
+        target_string = old_target_string[4:]
+    else:
+        target_string = old_target_string
+    
+    #topic_id used to be read here without ever being defined (NameError),
+    #it is now an explicit parameter and only applied when it is passed
+    contexts = result_list if topic_id is None else result_list[topic_id]
+    
+    #counts occurrences of single words, as well as cooccurrences, saves them in dictionaries
+    print('[a]', 'Counting nodes and edges.\t('+old_target_string+')')
+    node_freq_dict, edge_freq_dict = frequencies(target_string, contexts)
+    
+    #builds graph from these dictionaries, also applies multiple filters
+    print('[a]', 'Building graph.\t('+old_target_string+')')
+    G = build_graph(node_freq_dict, edge_freq_dict)
+    
+    statistics['node count'] = len(G.nodes)
+    statistics['edge count'] = len(G.edges)
+
+    #finds root hubs (senses) within the graph + more filters for these
+    print('[a]', 'Collecting root hubs.\t('+old_target_string+')')
+    H = root_hubs(G, edge_freq_dict)
+    
+    #stores the sense inventory with the six most frequent neighbors of each hub for context
+    statistics['hubs'] = dict()
+    for h in H:
+        #edge keys are looked up with the smaller node first
+        mfn = sorted(G.adj[h], key=lambda x: edge_freq_dict[h,x] if h < x else edge_freq_dict[x, h], reverse=True)[:6]
+        statistics['hubs'][h] = mfn
+    
+    #performs minimum_spanning_tree algorithm on graph
+    print('[a]', 'Building minimum spanning tree.\t('+old_target_string+')')
+    T = components(G, H, target_string)
+
+    return T, H, statistics
+
+
 def disambiguate(minimum_spanning_tree, root_hub_list,
                 context_list, target_string):
    """Matches contexts to senses.
@@ -493,82 +546,46 @@ def disambiguate(minimum_spanning_tree, root_hub_list,
    return mapping_dict


-# our main function, here the main stepps for word sense induction are called
-def word_sense_induction(topic_id, topic_name, result_list):
-    
-    #buffer for useful information
-    out_buffer = '\n'
-    
-    #path for output(directory)
-    output_path = config.output
-            
-    #removes trailing new_lines
-    old_target_string = topic_name.strip() #original target
-    
-    if old_target_string.strip() in [f.replace('.absinth', '') for f in os.listdir(config.output)]:
-        return None
-    
-    out_buffer += ("[A] Word sense induction for '"+old_target_string+"':\n")
-    
-    #in topics longer than two words, the leading 'the' can generally be removed without changing the sense
-    if old_target_string[:4] == 'the_' and old_target_string.count('_') >= 2:
-        
-        target_string = old_target_string[4:]
-        
-    else:
-        
-        target_string = old_target_string
-    
-    #writes headline for output files
-    f = open(output_path+old_target_string+'.absinth', 'w')
-    f.write('subTopicID\tresultID\n')
-    
-    #counts occurences of single words, as well as cooccurrences, saves it in dictionary
-    print('[a]', 'Counting nodes and edges.\t('+old_target_string+')')
-    node_freq_dict, edge_freq_dict = frequencies(target_string, result_list[topic_id])
-    
-    #builds graph from these dictionaries, also applies multiple filters
-    print('[a]', 'Building graph.\t('+old_target_string+')')
-    G = build_graph(node_freq_dict, edge_freq_dict)
-    out_buffer += '[A] Nodes: {}\tEdges: {}\n'.format(str(len(G.nodes)), str(len(G.edges)))
-    
-    #finds root hubs (senses) within the graph + more filters for these
-    print('[a]', 'Collecting root hubs.\t('+old_target_string+')')
-    H = root_hubs(G, edge_freq_dict)
-    out_buffer += '[A] Root hubs:\n'
+def main(topic_id, topic_name, result_list):
+    """Induce the senses of one topic, cluster its results and write them out.
+
+    Args:
+        topic_id: Key of the topic in `result_list`. It is also
+            concatenated into the IDs written to the output file, so it
+            has to be a string.
+        topic_name: Name of the topic; its stripped form names the output
+            file.
+        result_list: Mapping from topic IDs to the results of that topic.
+
+    Returns:
+        None. The clustering is written to
+        `config.output + <stripped topic name> + '.absinth'`. Nothing is
+        written if `induce` returns None for the topic.
+    """
    
-    #adds sense inventory to buffer with some common neighbors for context
-    i = 1 #sense index
-    for h in H:
-        
-        mfn = sorted(G.adj[h], key=lambda x: edge_freq_dict[h,x] if h < x else edge_freq_dict[x, h], reverse=True)[:6]
-        out_buffer += (' {}. {}: {}\n'.format(i, h, ', '.join(mfn)))
-        i += 1
+    #original target without surrounding whitespace, names the output file
+    old_target_string = topic_name.strip()
+    
+    #in topics longer than two words, the leading 'the' can generally be removed without changing the sense
+    if old_target_string[:4] == 'the_' and old_target_string.count('_') >= 2:
+        target_string = old_target_string[4:]
+    else:
+        target_string = old_target_string
+    
+    print('[a]', 'Inducing word senses for {}.'.format(topic_name))
+    #induce has no access to topic_id, so it is handed this topic's results only
+    induced = induce(topic_name, result_list[topic_id])
+    
+    #induce returns None for topics that already have an output file
+    if induced is None:
+        return None
+    T, H, statistics = induced
    
-    #performs minimum_spanning_tree algorithm on graph
-    print('[a]', 'Building minimum spanning tree.\t('+old_target_string+')')
-    T = components(G, H, target_string)
-
    #matches senses to clusters
    print('[a]', 'Disambiguating result_list.\t('+old_target_string+')')
    D = disambiguate(T, H, result_list[topic_id], target_string)
    
-    out_buffer += ('[A] Mapping: \n')
+    #collect statistics from result, only non-empty clusters are counted
+    cluster_count = 0
+    cluster_length_list = list()
    for cluster,result_list in D.items():
-        out_buffer += (' {}. : {}\n'.format(cluster, ', '.join([str(r) for r in result_list])))
-    
+        cluster_length = len(result_list)
+        if cluster_length != 0:
+            cluster_count += 1
+            cluster_length_list.append(cluster_length)
+    statistics['mean_cluster_length'] = np.mean(cluster_length_list)
+    statistics['cluster_count'] = cluster_count
+
    #prints buffer
    print('[a]', 'Writing to file.\t('+old_target_string+')')
-    print(out_buffer)
    
+
+    #writes headline for output file
+    f = open(config.output+old_target_string+'.absinth', 'w')
+
+    f.write('subTopicID\tresultID\n')
+
    #writes clustering to file
    for cluster,result_list in D.items():
        for result in result_list:
            f.write(topic_id+'.'+str(cluster)+'\t'+topic_id+'.'+str(result)+'\n')
        
    f.close()
-    
-    
+        
+
 def read_dataset(data_path):
    
    # results.txt includes the queries for a given target word
@@ -600,8 +617,9 @@ def read_dataset(data_path):
    return results, topics


-def main():
-    
+
+
+if __name__ == '__main__':
    # If absinth.py is run in test environment.
    if '-t' in sys.argv:
        data_path = config.test
@@ -619,11 +637,7 @@ def main():
    with Pool(process_count) as pool:
        parameter_list = [(topic_id, topic_name, results)
                          for topic_id,topic_name in topics.items()]
-        pool.starmap(word_sense_induction, parameter_list)
+        pool.starmap(main, parameter_list)
        
    #for topic_id,topic_name in topics.items():
       #word_sense_induction(topic_id,topic_name, results)
-
-
-if __name__ == '__main__':
-    main()