Commit a9b0962e authored by Victor Zimmermann

Further commenting.

parent a2e527eb

#!/usr/bin/env python3

import sys

import matplotlib
matplotlib.use("Agg")

print('[A] Loading ' + sys.argv[0] + '.\n')

import config
import networkx as nx # for visualisation
import numpy as np
import os # for reading files
import pprint
import random
import re
import spacy # for nlp
from multiprocessing import Pool
from nltk.corpus import stopwords
from copy import deepcopy

nlp = spacy.load('en') # standard english nlp
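
# Note: spacy.load('en') assumes the English model (or the 'en' shortcut link)
# is installed, e.g. via `python -m spacy download en` in older spaCy releases.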

def read_dataset(data_path):
    """Collects topics.txt and results.txt.

    Args:
        data_path: Path of the directory containing results.txt and topics.txt.

    Returns:
        results: Dictionary with topic ids as keys and lists of search result
            strings (title + snippet) as values.
        topics: Dictionary with topic ids as keys and target words as values.
    """

    results = dict()
    with open(data_path+'results.txt', 'r') as results_file:

        for line in results_file.readlines()[1:]:

            l = line.split('\t')
            id1, _ = l[0].split('.') # the second part of the id is ignored, as it is identical to the list index
            if id1 not in results:
                results[id1] = list()
            results[id1].append(" ".join(l[2:]).strip()) # join title and snippet; the URL is ignored

    # topics.txt is a list of target words
    topics = dict()
    with open(data_path+'topics.txt', 'r') as topics_file:

        for line in topics_file.readlines()[1:]:

            l = line.split('\t')
            topics[l[0]] = l[1].strip()

    return results, topics
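
# Illustrative input layout, inferred from the parsing in read_dataset()
# (the column names are assumptions):
#
#   results.txt:  <topicID>.<resultID> \t <url> \t <title> \t <snippet>
#   topics.txt:   <topicID> \t <target word>
#
# Both files start with a header line, which readlines()[1:] skips. The
# returned dictionaries then look roughly like:
#
#   results = {'1': ['Some title Some snippet', ...], ...}
#   topics  = {'1': 'target_word', ...}
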
def frequencies(target_string, search_result_list):
    """Counts occurrences of nodes and cooccurrences.
@@ -168,7 +201,10 @@ def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
            # Add only tokens with allowed tags to nodes.
            elif token.tag_ in allowed_tag_list:

                if config.lemma:
                    token_set.add(token.lemma_)
                else:
                    token_set.add(token.text)

        context_size = len(token_set)
@@ -416,78 +452,96 @@ def score(graph, component, root_hub_list):
def induce(topic_name, result_list):
    """Induces word senses for a given topic from corpus.

    Counts frequencies from corpus and search result list, builds graph from
    these counts (with some filters). Root hubs (senses) are collected from
    this graph.

    Args:
        topic_name: Target string.
        result_list: List of search result (context) strings.

    Returns:
        graph: Weighted undirected graph built from the frequency counts.
        root_hub_list: List of root hub strings (senses).
        stat_dict: Various statistics.
    """

    stat_dict = dict()

    # Skips topics for which an output file already exists.
    if topic_name in [output_file_name.replace('.absinth', '')
                      for output_file_name in os.listdir(config.output)]:
        return None

    else:
        stat_dict['target'] = topic_name

    # In topics longer than two words, the leading 'the' can generally be
    # removed without changing the sense.
    if topic_name[:4] == 'the_' and topic_name.count('_') > 1:
        target_string = topic_name[4:]
    else:
        target_string = topic_name

    # Counts occurrences of single words as well as cooccurrences.
    print('[a]', 'Counting nodes and edges.\t('+topic_name+')')
    node_freq_dict, edge_freq_dict = frequencies(target_string, result_list)

    # Builds graph from these dictionaries, also applies multiple filters.
    print('[a]', 'Building graph.\t('+topic_name+')')
    graph = build_graph(node_freq_dict, edge_freq_dict)

    stat_dict['node count'] = len(graph.nodes)
    stat_dict['edge count'] = len(graph.edges)

    # Finds root hubs (senses) within the graph + more filters for these.
    print('[a]', 'Collecting root hubs.\t('+topic_name+')')
    root_hub_list = root_hubs(graph, edge_freq_dict)

    # Adds sense inventory to statistics with some common neighbors for context.
    stat_dict['hubs'] = dict()

    for root_hub in root_hub_list:

        # Edge frequencies are keyed by the lexicographically smaller node first.
        by_frequency = lambda node: edge_freq_dict[root_hub, node] \
                                    if root_hub < node \
                                    else edge_freq_dict[node, root_hub]

        most_frequent_neighbor_list = sorted(graph.adj[root_hub],
                                             key=by_frequency, reverse=True)

        stat_dict['hubs'][root_hub] = most_frequent_neighbor_list[:6]

    return graph, root_hub_list, stat_dict
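
# Illustrative shape of stat_dict as returned by induce() (all values made up):
#
#   {'target': 'example_target',
#    'node count': 150,
#    'edge count': 632,
#    'hubs': {'sense_one': ['neighbor_a', 'neighbor_b', ...],
#             'sense_two': ['neighbor_c', ...]}}
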
def disambiguate(graph, root_hub_list, context_list, topic_name):
    """Matches contexts to senses.

    Builds minimum spanning tree from graph.
    Adds up scores based on tree node distance for each token in a context
    string and matches the context to the root hub with the highest score.

    Args:
        graph: Weighted undirected graph.
        root_hub_list: List of strings of root hubs (senses).
        context_list: List of sentence strings that are to be clustered.
        topic_name: String of target word, also root of MST.

    Returns:
        mapping_dict: Dictionary of root hubs (senses) as keys and context ids
            as values.
    """

    # Performs minimum spanning tree algorithm on the graph.
    print('[a]', 'Building minimum spanning tree.\t('+topic_name+')')
    minimum_spanning_tree = components(graph, root_hub_list, topic_name)

    # Removes the target word itself from the contexts before scoring.
    spaced_topic_name = topic_name.replace('_', ' ')
    context_list = [context.lower().strip().replace(spaced_topic_name, '')
                    for context in context_list]

    score_dict = dict() # memoisation for scores
@@ -505,23 +559,27 @@ def disambiguate(minimum_spanning_tree, root_hub_list,
        idx += 1 # index based on position in list

        processed_context = nlp(context)

        if config.lemma:
            token_list = [token.lemma_ for token in processed_context] # tokens
        else:
            token_list = [token.text for token in processed_context] # tokens

        score_array = np.zeros(len(root_hub_list)) # initialise with zeros for every sense

        for token in token_list:

            if token in minimum_spanning_tree.nodes: # if word wasn't filtered out

                if token in score_dict: # memoisation
                    new_score = score_dict[token]
                else:
                    new_score = score(minimum_spanning_tree,
                                      token, root_hub_list)
                    score_dict[token] = new_score # memoisation

                score_array += new_score
@@ -546,77 +604,66 @@ def disambiguate(minimum_spanning_tree, root_hub_list,

    return mapping_dict

def main(topic_id, topic_name, result_dict):
    """Calls induction and disambiguation functions, performs main task.

    The task is to both induce senses and match search results to them. This
    function calls induce() and disambiguate() to perform these subtasks. The
    result is then written to the output directory specified in config.py.

    Args:
        topic_id: Index of topic in topics.txt.
        topic_name: Target string.
        result_dict: Dictionary with topic_id as key and lists of search
            results (from results.txt) as values.

    Returns:
        None
    """

    print('[a]', 'Inducing word senses for {}.'.format(topic_name))

    # induce() returns None if the topic has already been processed.
    induction_result = induce(topic_name, result_dict[topic_id])
    if induction_result is None:
        return None
    graph, root_hub_list, stat_dict = induction_result

    # Matches contexts (search results) to the induced senses.
    print('[a]', 'Disambiguating result_list.\t('+topic_name+')')
    mapping_dict = disambiguate(graph, root_hub_list,
                                result_dict[topic_id], topic_name)

    # Collects statistics from the result.
    cluster_count = 0
    cluster_length_list = list()

    for cluster, result_list in mapping_dict.items():

        cluster_length = len(result_list)
        if cluster_length != 0:
            cluster_count += 1
            cluster_length_list.append(cluster_length)

    stat_dict['mean_cluster_length'] = np.mean(cluster_length_list)
    stat_dict['cluster_count'] = cluster_count

    print('[a]', 'Writing to file.\t('+topic_name+')')

    output_path = config.output
    output_file_name = output_path + topic_name + '.absinth'

    with open(output_file_name, 'w') as output_file:

        output_file.write('subTopicID\tresultID\n')

        for cluster_id, result_list in mapping_dict.items():
            for result_id in result_list:
                output_line = '{}.{}\t{}.{}\n'.format(topic_id, cluster_id,
                                                      topic_id, result_id)
                output_file.write(output_line)

    pprint.pprint(stat_dict)
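
# main() writes one tab-separated <topic_name>.absinth file per topic.
# Illustrative content (the ids are made up):
#
#   subTopicID\tresultID
#   1.1\t1.3
#   1.1\t1.7
#   1.2\t1.12
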
if __name__ == '__main__':

@@ -626,7 +673,7 @@ if __name__ == '__main__':
    else:
        data_path = config.dataset

    result_dict, topic_dict = read_dataset(data_path)

    # Enables manual setting of process count.
    if '-p' in sys.argv:
@@ -635,8 +682,8 @@ if __name__ == '__main__':
        process_count = 1

    with Pool(process_count) as pool:

        parameter_list = [(topic_id, topic_name, result_dict)
                          for topic_id, topic_name in topic_dict.items()]
        pool.starmap(main, parameter_list)

    #for topic_id,topic_name in topics.items():
    ...
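
# Illustrative invocation (the script file name is an assumption):
#
#   python3 absinth.py -p 4
#
# config.py is expected to provide at least `dataset` (directory containing
# topics.txt and results.txt), `output` (directory for the .absinth files)
# and `lemma` (whether to work on lemmas instead of surface forms).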