Commit 657eb8e5 authored by Victor Zimmermann

Started renaming and commenting code.

parent 755bc6f9
#!/usr/bin/env python3
import sys
import matplotlib
matplotlib.use("Agg")
print('[A] Loading ' + sys.argv[0] + '.\n')
import os # for reading files
import networkx as nx # for graph data structures
from copy import deepcopy
from nltk.corpus import stopwords
import numpy as np # for calculations
import config
import re
import spacy # for nlp
from multiprocessing import Pool
import random
import matplotlib.pyplot as plt
nlp = spacy.load('en') # standard english nlp
def frequencies(target_string, search_result_list):
    """Counts occurrences of nodes and cooccurrences.

    Iterates over the corpus (and snippets provided with the task) line by line
    and counts every token and tuple of tokens within a line (context). These
    tokens are filtered by stop words, pos tags and context length.

    Args:
        target_string: Contexts are selected if they contain this string. For
            further processing this string is removed from the contexts.
        search_result_list: List of titles and snippets provided with the task.

    Returns:
        node_freq_dict: Dictionary of occurrences of every eligible token
            within every context the target occurs in.
        edge_freq_dict: Dictionary of occurrences of every eligible tuple of
            tokens within every context the target occurs in.
    """
    corpus_path = config.corpus
    max_node_count = config.max_nodes
    max_edge_count = config.max_edges
    bracketed_target_string = '('+target_string+')'

    # Removes unnecessary markup from the snippets.
    _search_result_list = list()
    for r in search_result_list:
        r = r.replace('<b>', '')
        r = r.replace('</b>', '')
        r = r.replace(r'\\', '')
        r = r.strip()
        _search_result_list.append(r)

    # Initialises the frequencies with counts from the search results.
    node_freq_dict, edge_freq_dict = process_file(_search_result_list,
                                                  target_string,
                                                  dict(),
                                                  dict())

    # Names of the corpus files.
    corpus_file_path_list = [corpus_path + f for f in os.listdir(corpus_path)]
    corpus_size = len(corpus_file_path_list)

    processed_file_count = 0
    for corpus_file_path in corpus_file_path_list:

        node_count = len(node_freq_dict)
        edge_count = len(edge_freq_dict)

        # Prints an update after every 11th of the corpus is parsed.
        if processed_file_count % int(corpus_size/11) == 0:

            file_ratio = processed_file_count / corpus_size
            max_node_ratio = node_count / max_node_count
            max_edge_ratio = edge_count / max_edge_count

            ratios = [file_ratio, max_node_ratio, max_edge_ratio]
            # Uses the ratio closest to 100%.
            highest_ratio = int((max(ratios))*100)

            print('[a] ~{:02d}%\tNodes: {}\tEdges: {}\t{}.'.format(highest_ratio,
                                                                   node_count,
                                                                   edge_count,
                                                                   bracketed_target_string))

        # Stops as soon as the maximum node count is exceeded.
        if node_count > max_node_count:
            print('[a] 100%\tNodes: {}\tEdges: {}\t{}.'.format(node_count,
                                                               edge_count,
                                                               bracketed_target_string))
            return node_freq_dict, edge_freq_dict

        # Stops as soon as the maximum edge count is exceeded.
        if edge_count > max_edge_count:
            print('[a] 100%\tNodes: {}\tEdges: {}\t{}.'.format(node_count,
                                                               edge_count,
                                                               bracketed_target_string))
            return node_freq_dict, edge_freq_dict

        # Parses a single file.
        with open(corpus_file_path, 'r') as corpus_file:
            node_freq_dict, edge_freq_dict = process_file(corpus_file,
                                                          target_string,
                                                          node_freq_dict,
                                                          edge_freq_dict)

        processed_file_count += 1

    # Final update print.
    print('[a] 100%\tNodes: {}\tEdges: {}\t{}.'.format(node_count,
                                                       edge_count,
                                                       bracketed_target_string))

    return node_freq_dict, edge_freq_dict
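# Illustrative progress print (hypothetical config values): if 30% of the
# corpus files are parsed while 5000 of at most 10000 nodes and 2000 of at
# most 100000 edges are counted, the ratios are [0.3, 0.5, 0.02] and the
# update line reports ~50%, the ratio closest to 100%.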
def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
    """Updates the counts of nodes and edges for a given document and target.

    Amends the input dictionaries with counts from each context within the
    list of contexts. Furthermore filters out small contexts and tokens from
    the stopword list or with wrong pos tags.

    Args:
        context_list: List of contexts (lines, paragraphs) that are to be
            considered for updating the counting dictionaries.
        target_string: Target string for filtering out every context that does
            not contain it.
        node_freq_dict: Dictionary of occurrences of every eligible token
            within every context the target occurs in.
        edge_freq_dict: Dictionary of occurrences of every eligible tuple of
            tokens within every context the target occurs in.

    Returns:
        node_freq_dict: Updated version of the input node dict.
        edge_freq_dict: Updated version of the input edge dict.
    """
    spaced_target_string = target_string.replace('_', ' ')
    stopword_list = set(stopwords.words('english') + config.stop_words)
    allowed_tag_list = config.allowed_tags
    min_context_size = config.min_context_size

    try:
        for context in context_list:
            context = context.lower()
            if spaced_target_string in context: #greedy pre-selection, not perfect

                token_set = set() #set of node candidates
                #This replacement allows the target to be treated as a single entity.
                context = context.replace(spaced_target_string, target_string)
                processed_context = nlp(context)

                if target_string in [token.text for token in processed_context]: #better selection
                    for token in processed_context:
                        #doesn't add the target word to the nodes
                        if token.text == target_string:
                            pass
                        #doesn't add stop words to the nodes
                        elif token.text in stopword_list:
                            pass
                        #only adds tokens with allowed pos tags to the nodes
                        elif token.tag_ in allowed_tag_list:
                            token_set.add(token.text)

                    #if there are enough (good) tokens in the paragraph
                    context_size = len(token_set)
                    if context_size >= min_context_size:

                        for token in token_set:
                            #updates counts for nodes
                            if token in node_freq_dict:
                                node_freq_dict[token] += 1
                            else:
                                node_freq_dict[token] = 1

                        #set of possible edges
                        for edge in {(x,y) for x in token_set for y in token_set if x < y}:
                            #updates counts for edges
                            if edge in edge_freq_dict:
                                edge_freq_dict[edge] += 1
                            else:
                                edge_freq_dict[edge] = 1

    #if a file is corrupted (can't always be caught with if-else)
    except UnicodeDecodeError:
        #print('Failed to decode:', f)
        pass

    return node_freq_dict, edge_freq_dict
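# Illustrative sketch (hypothetical input; assumes 'river' and 'steep' carry
# allowed pos tags and config.min_context_size <= 2): for the target 'bank'
# and the single context 'the bank of the river was steep', the eligible
# tokens are {'river', 'steep'}, since stop words and the target itself are
# skipped. The call
#   process_file(['the bank of the river was steep'], 'bank', dict(), dict())
# would then return ({'river': 1, 'steep': 1}, {('river', 'steep'): 1}).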
#build graph from frequency dictionaries
def build_graph(node_freq_dict, edge_freq_dict):
    """Builds an undirected weighted graph from dictionaries.

    Creates a graph and adds every node and edge from the parameter
    dictionaries, given they occur frequently enough. For every edge a weight
    is calculated.

    Args:
        node_freq_dict: Dictionary of occurrences of every eligible token
            within every context the target occurs in.
        edge_freq_dict: Dictionary of occurrences of every eligible tuple of
            tokens within every context the target occurs in.

    Returns:
        cooccurrence_graph: Filtered undirected dice-weighted small world
            cooccurrence graph for a given target entity.
    """
    min_node_freq = config.min_node_freq
    min_edge_freq = config.min_edge_freq
    max_weight = config.max_weight

    cooccurrence_graph = nx.Graph()

    #node : node frequency
    for node, frequency in node_freq_dict.items():
        if frequency >= min_node_freq:
            cooccurrence_graph.add_node(node)

    #edge : edge frequency
    for node_tuple, frequency in edge_freq_dict.items():
        if frequency < min_edge_freq:
            continue
        elif node_tuple[0] not in cooccurrence_graph.nodes:
            continue
        elif node_tuple[1] not in cooccurrence_graph.nodes:
            continue
        else:
            cooccurrence_frequency = edge_freq_dict[node_tuple]
            node0_frequency = node_freq_dict[node_tuple[0]]
            node1_frequency = node_freq_dict[node_tuple[1]]

            prob_0 = cooccurrence_frequency / node0_frequency
            prob_1 = cooccurrence_frequency / node1_frequency

            #best_weight = 1 - max(prob_0, prob_1)
            dice_weight = 1 - ((prob_0 + prob_1) / 2)

            if dice_weight <= max_weight:
                cooccurrence_graph.add_edge(*node_tuple, weight=dice_weight)
            else:
                pass

    return cooccurrence_graph
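# Worked example for the dice weight (hypothetical counts): if 'river' occurs
# in 10 eligible contexts, 'water' in 4, and both cooccur in 3, then
# prob_0 = 3/10 = 0.3, prob_1 = 3/4 = 0.75 and
# dice_weight = 1 - ((0.3 + 0.75) / 2) = 0.475, so the edge ('river', 'water')
# is added as long as config.max_weight >= 0.475.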
#Identifies senses by choosing nodes with high degrees
def root_hubs(graph, edge_freq_dict, min_neighbors=4, threshold=0.8):

    min_neighbors = config.min_neighbors
    threshold = config.threshold
@@ -177,7 +281,7 @@
        if G.degree[v] >= min_neighbors:

            #most frequent neighbors
            mfn = sorted(G.adj[v], key=lambda key: edge_freq_dict[v,key] if v < key else edge_freq_dict[key, v], reverse=True)[:min_neighbors]

            #if the mean weight of the most frequent neighbors is under the threshold
            if np.mean([G.edges[v,n]['weight'] for n in mfn]) < threshold:
@@ -202,11 +306,11 @@
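# Illustrative check (hypothetical weights): a candidate v whose four most
# frequent neighbors are attached with weights [0.3, 0.5, 0.6, 0.7] has a
# mean weight of 0.525; with config.threshold = 0.8 it is accepted as a root
# hub, while a candidate with a mean weight of 0.9 would be rejected.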
#Components algorithm from Véronis (2004), converts the graph for the target into a MST
def components(graph, hubs, target_string):

    G = deepcopy(graph)
    H = hubs #root hubs
    t = target_string

    #G.add_node(t)
    #for h in H:
@@ -246,12 +350,12 @@
# Basically Word Sense Disambiguation, matches contexts to senses
def disambiguate(mst, hubs, contexts, target_string):

    target_string = target_string.replace('_', ' ')

    T = mst #minimum spanning tree
    H = hubs #root hubs
    C = [c.lower().strip().replace(target_string, '') for c in contexts] #cleaned up contexts

    score_dict = dict() #memoisation for scores
    mapping_dict = {topic:[] for topic in range(1,len(H)+1)} #output of function
@@ -312,80 +416,73 @@

    return mapping_dict
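# The returned mapping_dict pairs each sense index (1..len(H)) with the list
# of result ids assigned to that sense, e.g. (hypothetical values)
# {1: [0, 3, 7], 2: [1, 2]}; these pairs become the subTopicID/resultID rows
# in the output file.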
def draw_graph(G, name):
    nx.draw_networkx(G, pos=nx.spring_layout(G), with_labels=True,
                     node_size=40, font_size=9, node_color='#2D98DA')
    plt.savefig('../figures/'+name+'.png', dpi=200, bbox_inches='tight')
    plt.clf()
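# Example call (assuming a ../figures/ directory exists relative to the
# working directory): draw_graph(G, 'mytarget_g') lays the graph out with a
# spring layout and writes it to ../figures/mytarget_g.png.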
# Our main function; the main steps for word sense induction are called here.
def word_sense_induction(topic_id, topic_name, results):

    #buffer for useful information
    out_buffer = '\n'

    #path for output (directory)
    output_path = './test/'#config.output

    #removes trailing newlines
    old_target_string = topic_name.strip() #original target

    #skips targets that already have an output file
    if old_target_string.strip() in [f.replace('.absinth', '') for f in os.listdir(config.output)]:
        return None

    out_buffer += ("[A] Word sense induction for '"+old_target_string+"':\n")

    #in topics longer than two words, the leading 'the' can generally be removed without changing the sense
    if old_target_string[:4] == 'the_' and old_target_string.count('_') >= 2:
        target_string = old_target_string[4:]
    else:
        target_string = old_target_string

    #writes the headline for the output file
    f = open(output_path+target_string+'.absinth', 'w')
    f.write('subTopicID\tresultID\n')

    #counts occurrences of single words as well as cooccurrences, saves them in dictionaries
    print('[a]', 'Counting nodes and edges.\t('+old_target_string+')')
    node_freq_dict, edge_freq_dict = frequencies(target_string, results[topic_id])

    #builds a graph from these dictionaries, also applies multiple filters
    print('[a]', 'Building graph.\t('+old_target_string+')')
    G = build_graph(node_freq_dict, edge_freq_dict)
    out_buffer += '[A] Nodes: {}\tEdges: {}\n'.format(str(len(G.nodes)), str(len(G.edges)))

    #finds root hubs (senses) within the graph + more filters for these
    print('[a]', 'Collecting root hubs.\t('+old_target_string+')')
    H = root_hubs(G, edge_freq_dict)
    out_buffer += '[A] Root hubs:\n'

    #adds the sense inventory to the buffer with some common neighbors for context
    i = 1 #sense index
    for h in H:
        mfn = sorted(G.adj[h], key=lambda x: edge_freq_dict[h,x] if h < x else edge_freq_dict[x, h], reverse=True)[:6]
        out_buffer += (' {}. {}: {}\n'.format(i, h, ', '.join(mfn)))
        i += 1

    #performs the minimum spanning tree algorithm on the graph
    print('[a]', 'Building minimum spanning tree.\t('+old_target_string+')')
    T = components(G, H, target_string)

    #matches senses to clusters
    print('[a]', 'Disambiguating results.\t('+old_target_string+')')
    D = disambiguate(T, H, results[topic_id], target_string)
    out_buffer += ('[A] Mapping: \n')
    for cluster, result_list in D.items():
        out_buffer += (' {}. : {}\n'.format(cluster, ', '.join([str(r) for r in result_list])))

    #prints the buffer
    print('[a]', 'Writing to file.\t('+old_target_string+')')
    print(out_buffer)

    #writes the clustering to the file
@@ -395,14 +492,8 @@

    f.close()
def read_dataset(data_path):

    # results.txt includes the queries for a given target word.
    results = dict()
@@ -430,10 +521,27 @@
            l = line.split('\t')
            topics[l[0]] = l[1]

    return results, topics


def main():

    # If absinth.py is run in a test environment.
    if '-t' in sys.argv:
        data_path = config.test
    else:
        data_path = config.dataset

    results, topics = read_dataset(data_path)

    # Calls word_sense_induction() for two topics at a time.
    with Pool(2) as pool:
        parameter_list = [(topic_id, topic_name, results)
                          for topic_id, topic_name in topics.items()]
        pool.starmap(word_sense_induction, parameter_list)

    #for topic_id, topic_name in topics.items():
        #word_sense_induction(topic_id, topic_name, results)


if __name__ == '__main__':
    main()