#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Word Sense Induction system for SemEval 2013, Task 11

This module performs word sense induction for a given target word on a corpus
and matches a list of contexts to each induced sense. The method is a modified
reimplementation of Véronis' Hyperlex (2004).

Example:
    The script can be run with the following command:
    
        $ python3 absinth.py
        
    The script also accepts the following modifiers (a combined example is
    shown below).
    
Modifiers:
    '-t': Runs absinth.py on the trial path given in config.py instead of the
        data path.
    '-p n': Runs absinth.py with n concurrent processes (default: 1).
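
    For example, to run on the trial data with four concurrent processes:
    
        $ python3 absinth.py -t -p 4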

.. _Association Based Semantic Induction Tools from Heidelberg:
    https://gitlab.cl.uni-heidelberg.de/zimmermann/absinth

"""

import sys
print('[A] Loading ' + sys.argv[0] + '.\n')
import config
import networkx as nx # for graph data structures and algorithms
import numpy as np
import os # for reading files
import pprint
import re
import spacy # for nlp

from multiprocessing import Pool
from copy import deepcopy


nlp = spacy.load('en') # standard english nlp


def read_dataset(data_path: str) -> (dict, dict):
    """Collects topics.txt and results.txt.
    
    Iterates over topics.txt and results.txt in the data path and converts them
    to dictionaries with the ID as key and the target word / title + snippet as
    values.
    
    Args:
        data_path: File path to directory containing topics.txt and results.txt.
        
    Returns:
        One dictionary for each file. 
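
    Example:
        Both files are tab-separated and their first line is skipped as a
        header. The remaining lines have roughly the following form (columns
        shown schematically, values illustrative only):

            results.txt:  <topicID>.<resultID>  <URL>  <title>  <snippet>
            topics.txt:   <topicID>  <target word>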
    """
    
    results = dict()
    
    with open(data_path+'results.txt', 'r') as results_file:
        
        for line in results_file.readlines()[1:]:
            
            l = line.split('\t')
            id1, _ = l[0].split('.') #the second part of the id is ignored, as it is identical to the list index
            
            if id1 not in results:
                results[id1]=list()
                
            results[id1].append(" ".join(l[2:]).strip()) # here I join title and snippet, the URL is ignored
            
    
    # topics.txt is a list of target words
    topics = dict()
    
    with open(data_path+'topics.txt', 'r') as topics_file:
        
        for line in topics_file.readlines()[1:]:
            
            l = line.split('\t')
            topics[l[0]] = l[1].strip()
    
    return results, topics


def frequencies(target_string: str, search_result_list: list) -> (dict, dict):
    """Counts occurrences of nodes and cooccurrences.
    
    Iterates over the corpus (and the snippets provided with the task) line by
    line and counts every token and tuple of tokens within a line (context).
    These tokens are filtered by stop words, POS tags and context length.
    
    Args:
        target_string: contexts are selected if they contain this string. For
            further processing this string is removed from the contexts.
        search_result_list: List of titles and snippets provided with the task.
        
    Returns:
        Dictionary of occurrences of every eligible token within every context
            the target occurs in, dictionary of occurrences of every eligible
            tuple of tokens within every context the target occurs in.
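
    Example:
        The returned dictionaries have the following shapes (counts are
        illustrative only):

            node_freq_dict: {'apple': 12, 'tree': 7}
            edge_freq_dict: {('apple', 'tree'): 3}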
    
    """

    corpus_path = config.corpus
    max_node_count = config.max_nodes
    max_edge_count = config.max_edges
    
    bracketed_target_string = '('+target_string+')'
    
    # Remove unnecessary tokens from snippets.
    _search_result_list = list()
    for r in search_result_list:
        r = r.replace('<b>', '')
        r = r.replace('</b>', '')
        r = r.replace(r'\\', '')
        r = r.strip()
        _search_result_list.append(r)
    
    # Initialise frequencies with counts from results.
    node_freq_dict, edge_freq_dict = process_file(_search_result_list,
                                                  target_string,
                                                  dict(),
                                                  dict()) 
    
    corpus_file_path_list = [corpus_path + f for f in os.listdir(corpus_path)]
    corpus_size = len(corpus_file_path_list)
    
    processed_file_count = 0
    for corpus_file_path in corpus_file_path_list:
        
        node_count = len(node_freq_dict)
        edge_count = len(edge_freq_dict)
        
        # Print a progress update roughly every eleventh of the corpus.
        if processed_file_count % max(1, corpus_size // 11) == 0:
            
            file_ratio = processed_file_count / corpus_size
            max_node_ratio = node_count / max_node_count
            max_edge_ratio = edge_count / max_edge_count
            
            ratios = [file_ratio, max_node_ratio, max_edge_ratio]
            
            # Use ratio closest to 100%.
            highest_ratio = int((max(ratios))*100)
            
            print('[a] ~{:02d}%\tNodes: {}\tEdges: {}\t{}.'.format(highest_ratio,
                                                                   node_count,
                                                                   edge_count,
                                                                   bracketed_target_string))
        
        if node_count > max_node_count or edge_count > max_edge_count:
            print('[a] 100%\tNodes: {}\tEdges: {}\t{}.'.format(node_count,
                                                               edge_count,
                                                               bracketed_target_string))
            return node_freq_dict, edge_freq_dict
        
        with open(corpus_file_path, 'r') as corpus_file:
            
            node_freq_dict, edge_freq_dict = process_file(corpus_file,
                                                          target_string,
                                                          node_freq_dict,
                                                          edge_freq_dict)
        
        processed_file_count += 1
    
    node_count = len(node_freq_dict)
    edge_count = len(edge_freq_dict)
    
    print('[a] 100%\tNodes: {}\tEdges: {}\t{}.'.format(node_count,
                                                       edge_count,
                                                       bracketed_target_string))
    
    return node_freq_dict, edge_freq_dict


def process_file(context_list: list, target_string: str,
                 node_freq_dict: dict, edge_freq_dict: dict) -> (dict, dict): 
    """Updates the counts of nodes and edges for a given document and target.
    
    Amends the input dictionaries with counts from each context within the
    list of contexts. Furthermore, it filters out small contexts as well as
    tokens from the stop word list or with disallowed POS tags.
    
    Args:
        context_list: List of contexts (lines, paragraphs) that are to be
            considered for updating the counting dictionaries.
        target_string: Target string for filtering out every context that does 
            not contain it.
        node_freq_dict: Dictionary of occurrences of every eligible token
            within every context the target occurs in.
        edge_freq_dict: Dictionary of occurrences of every eligible tuple of
            tokens within every context the target occurs in.
    
    Returns:
        Updated versions of the input node dict and input edge dict.
    """
    
    spaced_target_string = target_string.replace('_', ' ')
    
    stopword_list = config.stop_words
    allowed_tag_list = config.allowed_tags
    min_context_size = config.min_context_size
        
    try:
        
        for context in context_list:
            
            context = context.lower()
            if spaced_target_string in context: # Greedily pre-select lines containing the target.
                
                token_set = set()
                
                # Allow target to be treated as single entity.
                context = context.replace(spaced_target_string, target_string)
                processed_context = nlp(context)
                
                if target_string in [token.text for token in processed_context]:
                    
                    for token in processed_context:
                        
                        # Do not add target word to nodes.
                        if token.text == target_string:
                            pass
                        
                        # Do not add stop words to nodes.
                        elif token.is_stop or token.text in stopword_list:
                            pass
                        
                        # Add only tokens with allowed tags to nodes.
                        elif token.tag_ in allowed_tag_list:
                            if config.lemma == True:
                                token_set.add(token.lemma_)
                            else:
                                token_set.add(token.text)
                            
                    context_size = len(token_set)
                    
                    if context_size >= min_context_size:
                        for token in token_set:
                            
                            if token in node_freq_dict:
                                node_freq_dict[token] += 1
                            else:
                                node_freq_dict[token] = 1
                        
                        # Set of possible edges.
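                        # For example (illustrative), the token set
                        # {'a', 'b', 'c'} yields the candidate edges
                        # ('a', 'b'), ('a', 'c') and ('b', 'c'); requiring
                        # x < y keeps one ordering per pair.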
                        for edge in {(x,y) for x in token_set for y in token_set if x < y}:
                            
                            if edge in edge_freq_dict:
                                edge_freq_dict[edge] += 1
                            else:
                                edge_freq_dict[edge] = 1
    
    # If a file is corrupted (cannot always be caught with if-else checks), ignore it.
    except UnicodeDecodeError:
        
        pass            
    
    return node_freq_dict, edge_freq_dict


def build_graph(node_freq_dict: dict, edge_freq_dict: dict) -> nx.Graph:
    """Builds undirected weighted graph from dictionaries.
    
    Creates a graph and adds every node and edge from the parameter
    dictionaries, provided they occur frequently enough. For every edge a
    weight is calculated.
    
    Args:
        node_freq_dict: Dictionary of occurrences of every eligible token
            within every context the target occurs in.
        edge_freq_dict: Dictionary of occurrences of every eligible tuple of
            tokens within every context the target occurs in.
    
    Returns:
        Filtered undirected weighted small-world cooccurrence graph for a
            given target entity.
    """
    
    min_node_freq = config.min_node_freq
    min_edge_freq = config.min_edge_freq
    max_weight = config.max_weight
    
    cooccurrence_graph = nx.Graph()
    
    for node, frequency in node_freq_dict.items():
        
        if frequency >= min_node_freq:
            cooccurrence_graph.add_node(node)
            
    for node_tuple, frequency in edge_freq_dict.items():
        
        if frequency < min_edge_freq:
            
            continue
        
        elif node_tuple[0] not in cooccurrence_graph.nodes:
            
            continue
        
        elif node_tuple[1] not in cooccurrence_graph.nodes:
            
            continue
        
        else:
            
            cooccurrence_frequency = edge_freq_dict[node_tuple]
            node0_frequency = node_freq_dict[node_tuple[0]]
            node1_frequency = node_freq_dict[node_tuple[1]]
            
            prob_0 = cooccurrence_frequency / node0_frequency
            prob_1 = cooccurrence_frequency / node1_frequency
            
            best_weight = 1 - max(prob_0, prob_1)
            #dice_weight = 1 - ((prob_0 + prob_1) / 2)
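            # Worked example (numbers illustrative): with node frequencies
            # 10 and 20 and a cooccurrence frequency of 5, prob_0 = 0.5 and
            # prob_1 = 0.25, so best_weight = 1 - max(0.5, 0.25) = 0.5.
            # Frequent cooccurrence therefore yields a low weight.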
            
            if best_weight <= max_weight:
                
                cooccurrence_graph.add_edge(*node_tuple, weight=best_weight)
            
            else:
                
                pass
    
    return cooccurrence_graph


def root_hubs(graph: nx.Graph, edge_freq_dict: dict) -> list:
    """Identifies senses (root hubs) by choosing nodes with high degrees.
    
    Selects root hubs according to the algorithm in Véronis (2004). Nodes with
    high degree and neighbors with low weights (high cooccurrence) are chosen
    until there are no more viable candidates. A root hub candidate is every
    node that is not already a hub and is not a neighbor of one.
    
    Args:
        graph: Weighted undirected graph.
        edge_freq_dict: Dictionary of cooccurrence frequencies for every edge
            in the graph.
        
    Returns:
        List of root hubs, i.e. strings that are selected using the algorithm
            explained above.
    """
    
    min_neighbors = config.min_neighbors
    threshold = config.threshold
    
    # Allow operations on graph without altering original one.
    graph_copy = deepcopy(graph)
    
    # Sort according to degree (number of neighbors).
    candidate_list = sorted(graph_copy.nodes,
                            key=lambda node: graph_copy.degree[node],
                            reverse=True)
    
    hub_list = list()
    
    # While there are still candidates, search for root hubs.
    while candidate_list:
        
        candidate = candidate_list[0] #best hub candidate
        
        if graph_copy.degree[candidate] >= min_neighbors:
        
            by_frequency = lambda node: edge_freq_dict[candidate,node] \
                                         if candidate < node \
                                         else edge_freq_dict[node,candidate]
                              
            most_frequent_neighbor_list = sorted(graph_copy.adj[candidate],
                                                 key=by_frequency,
                                                 reverse=True) [:min_neighbors] 
            
            # If the most frequent neighbors cooccur frequently enough with
            # the candidate (mean edge weight below the threshold), the
            # candidate is approved.
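            # Illustrative check (config values are assumptions): with
            # min_neighbors = 6 and threshold = 0.8, the candidate is kept if
            # the mean weight of the edges to its six most frequent neighbors
            # is below 0.8.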
            if np.mean([graph_copy.edges[candidate,node]['weight'] 
                        for node in most_frequent_neighbor_list])  < threshold: 
                
                # Add candidate as root hub.
                hub_list.append(candidate)
            
                # Remove neighbors of new hub as hub candidates.
                for neighbor in deepcopy(graph_copy).adj[candidate]:
                    graph_copy.remove_node(neighbor)
                
            # Remove hub candidate.
            graph_copy.remove_node(candidate)
            
            # Reorder potential hubs after deletions.
            candidate_list = sorted(graph_copy.nodes,
                                    key=lambda node: graph_copy.degree[node],
                                    reverse=True)
        
        else:
        
            return hub_list
    
    return hub_list


def components(graph: nx.Graph, root_hub_list: list, target_string: str) -> nx.Graph:
    """Builds minimum spanning tree from graph and removes singletons.
    
    Applies components algorithm from Véronis (2004) and removes singletons.
    
    Args:
        graph: Undirected weighted graph.
        root_hub_list: List of strings of root hubs of graph.
        target_string: Root of minimum spanning tree.
        
    Returns:
        Minimum spanning tree with the target as root and the root hubs as its
            direct children; singletons are removed.
    """
    
    graph_copy = deepcopy(graph)
    
    graph_copy.add_node(target_string)
    for root_hub in root_hub_list:
        graph_copy.add_edge(target_string,root_hub,weight=0)
        
    minimum_spanning_tree = nx.minimum_spanning_tree(graph_copy)
    
    # Remove singletons, deepcopy for iteration while being altered.
    for node in deepcopy(minimum_spanning_tree).nodes:
        if len(minimum_spanning_tree.adj[node]) == 0:
            minimum_spanning_tree.remove_node(node)
    
    return minimum_spanning_tree


def score(graph: nx.Graph, component: str, root_hub_list: list) -> np.array:
    """Calculate score for a given component in a minimum spanning tree.
    
    First the closest root hub for the component is chosen. If no root hub is
    reachable, an all-zero array is returned. Otherwise a score based on the
    weight of the path between the component and its root hub is entered into
    an otherwise zero-filled array.
    
    Args:
        graph: Minimum spanning tree.
        component: Node (string) from which the distances are to be calculated.
        root_hub_list: List of strings of root hubs (senses) of original graph.
    
    Returns:
        Array with one score for the correct root hub and filled with zeroes.
    """
    
    root_hub_count = len(root_hub_list)
    
    #Initialise score array.
    score_array = np.zeros(root_hub_count)
    
    # Find root of component.
    distance_list = list()
    for root_hub in root_hub_list:
        if nx.has_path(graph, component, root_hub):
            distance_list.append(1/(1+len(nx.shortest_path(graph, component, root_hub))))
        else:
            distance_list.append(0)
            
    if sum(distance_list) == 0:
        return score_array
            
    root_idx = np.argmax(distance_list)
    root = root_hub_list[root_idx]
    
    shortest_path = nx.shortest_path(graph, component, root, 'weight')
    total_weight = 0

    # Add weights of every sub-path.
    for i in range(1, len(shortest_path)):
        sub_from, sub_to = shortest_path[i-1], shortest_path[i]
        total_weight += graph[sub_from][sub_to]['weight']

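    # The score is 1 / (1 + total path weight); e.g. (illustrative) a total
    # weight of 0.5 along the path to the root hub yields a score of
    # 1 / 1.5 ≈ 0.67 for that hub, while all other hubs stay at 0.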
    score_array[root_idx] = 1/(1+total_weight)
    
    return score_array


def induce(topic_name: str, result_list: list) -> (nx.Graph, list, dict):
    """Induces word senses for a given topic from corpus.
    
    Counts frequencies from corpus and search result list, builds graph from
    these counts (with some filters). Root hubs (senses) are collected from
    this graph.

    Args:
        topic_name: Target string.
        result_list: List of search result (context) strings.
        
    Returns:
        A cooccurrence graph, a list of root hub strings (senses) and a
            dictionary of various statistics. None is returned if an output
            file for the topic already exists.
    """
    
    stat_dict = dict()
    
    if topic_name in [output_file_name.replace('.absinth', '') 
                      for output_file_name in os.listdir(config.output)]:
        
        return None
    
    else:
    
        stat_dict['target'] = topic_name
        
        # In topics of more than two words, the leading 'the' can generally
        # be removed without changing the sense.
        if topic_name[:4] == 'the_' and topic_name.count('_') > 1:
            
            target_string = topic_name[4:]
            
        else:
            
            target_string = topic_name
        
        print('[a]', 'Counting nodes and edges.\t('+topic_name+')')
        node_freq_dict, edge_freq_dict = frequencies(target_string, result_list)
        
        #builds graph from these dictionaries, also applies multiple filters
        print('[a]', 'Building graph.\t('+topic_name+')')
        graph = build_graph(node_freq_dict, edge_freq_dict)
        
        stat_dict['node count'] = len(graph.nodes)
        stat_dict['edge count'] = len(graph.edges)

        #finds root hubs (senses) within the graph + more filters for these
        print('[a]', 'Collecting root hubs.\t('+topic_name+')')
        root_hub_list = root_hubs(graph, edge_freq_dict)
        
        # Add the sense inventory to the statistics, with some frequent neighbors for context.
        stat_dict['hubs'] = dict()
        
        for root_hub in root_hub_list:
        
            by_frequency = lambda node: edge_freq_dict[root_hub,node] \
                                         if root_hub < node \
                                         else edge_freq_dict[node, root_hub]
                                     
            most_frequent_neighbor_list = sorted(graph.adj[root_hub],
                                                 key=by_frequency, reverse=True) 
            
            stat_dict['hubs'][root_hub] = most_frequent_neighbor_list[:6]

        return graph, root_hub_list, stat_dict


def colour_graph(graph: nx.Graph, root_hub_list: list) -> nx.Graph:
    """Colours the graph according to its root hubs.
    
    Iteratively colours neighbouring nodes until the colouring is stable,
    similar to sentiment propagation in an evolving network.
    
    Args:
        graph: Weighted undirected graph.
        root_hub_list: List of senses.
        
    Returns:
        Coloured graph.
    """
    
    
    for node in graph.nodes:
        
        graph.node[node]['dist'] = [0] * len(root_hub_list)
            
        if node in root_hub_list:
            
            root_idx = root_hub_list.index(node)
            
            graph.node[node]['sense'] = root_idx
            graph.node[node]['dist'][root_idx] = 1
            
        else:
            
            graph.node[node]['sense'] = None
    
    max_iteration_count = config.max_colour_iteration_count
    
    iteration_count = 0
    stable = False
    
    while stable == False and iteration_count <= max_iteration_count:
        
        graph_copy = deepcopy(graph)
        iteration_count += 1
        stable = True
        
        for node in graph.nodes:
            
            neighbor_weight_list = [0] * len(root_hub_list)
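            # Each already-coloured neighbor votes for its current sense with
            # strength 1 - edge weight; e.g. (illustrative) a neighbor with
            # sense 2 linked by an edge of weight 0.3 adds 0.7 to slot 2.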
            
            for neighbor in graph_copy[node]:
                
                if graph_copy.node[neighbor]['sense'] == None:
                    pass
                else:
                    neighbor_weight_list[graph_copy.node[neighbor]['sense']] \
                        += 1 - graph_copy[node][neighbor]['weight']
            
            
            if any(neighbor_weight_list):
                
                graph.node[node]['dist'] = np.mean([graph.node[node]['dist'],
                                                    neighbor_weight_list], axis=0)
                
                old_colour = graph_copy.node[node]['sense']
                new_colour = np.argmax(graph.node[node]['dist'])
                
                if old_colour != new_colour:
                    stable = False
                    graph.node[node]['sense'] = new_colour
                
                else:
                    pass
                
            else:
                
                pass
    
    return graph
    
    
def disambiguate_colour(graph: nx.Graph, root_hub_list: list, context_list: list) -> dict:
    """Clusters senses to root hubs using a coloured graph.
    
    This algorithm colours the graph using evolutionary graph theory
    and calculates scores for each root hub given a context based on this graph.
    
    Args:
        graph: Undirected weighted graph.
        root_hub_list: List of root hubs (senses).
        context_list: List of search result strings to be clustered.
        
    Returns:
        A dictionary with root hub IDs as keys and lists of context indices as
            values.
    """
    
    coloured_graph = colour_graph(graph, root_hub_list)
    
    mapping_dict = {i:list() for i in range(1,len(root_hub_list)+1)}
    
    if len(root_hub_list) == 0:
        
        mapping_dict = {0:[i for i in range(1, len(context_list)+1)]}
        
        return mapping_dict
    
    context_id = 0
    for context in context_list:
        
        context_id += 1
        score = [0]*len(root_hub_list)
        parsed_context = nlp(context)
        
        for token in parsed_context:
            
            if config.lemma == True:
                text = token.lemma_
            else:
                text = token.text
            
            if text in coloured_graph.nodes:
                
                text_colour_dist = coloured_graph.node[text]['dist']
                
                if not any(text_colour_dist):
                    
                    pass
                
                else:
                    
                    for root_hub in root_hub_list:
                        
                        root_hub_idx = root_hub_list.index(root_hub)
                    
                        if nx.has_path(coloured_graph , text, root_hub):
                            
                            shortest_path = nx.shortest_path(coloured_graph ,
                                                            text,
                                                            root_hub,
                                                            'weight')
                            total_weight = 0

                            # Add weights of every sub-path.
                            for i in range(1, len(shortest_path)):
                                sub_from, sub_to = shortest_path[i-1], shortest_path[i]
                                total_weight += \
                                    coloured_graph[sub_from][sub_to]['weight']
                
                            score[root_hub_idx] += (1/(1+total_weight)) \
                             * coloured_graph.node[text]['dist'][root_hub_idx]
                    
                        else:
                            pass
            
            else:
                pass
    
        if any(score):
            
            mapping_dict[np.argmax(score)+1].append(context_id)
        
        else:
            
            pass
        
        
    return mapping_dict


def disambiguate_mst(graph: nx.Graph, root_hub_list: list,
                 context_list: list, topic_name: str) -> dict:
    """Matches contexts to senses.
    
    Builds minimum spanning tree from graph.
    Adds up scores based on tree node distance for each token in a context 
    string and matches the context to the root hub with the highest score.
    
    Args:
        graph: Weighted undirected graph.
        root_hub_list: List of strings of root hubs (senses).
        context_list: List of sentence strings that are to be clustered.
        topic_name: String of target word, also root of MST.
    
    Returns:
        Dictionary with sense indices as keys and lists of context IDs as
            values.
    """
    
    # Build the minimum spanning tree (with singletons removed) from the graph.
    minimum_spanning_tree = components(graph, root_hub_list, topic_name)
    
    spaced_topic_name = topic_name.replace('_', ' ')
    context_list = [context.lower().strip().replace(spaced_topic_name, '')
                    for context in context_list]
    
    score_dict = dict() #memoisation for scores
    mapping_dict = {topic:[] for topic in range(1,len(root_hub_list)+1)}
    
    # If no sense is found for a target word, assume that there is only one sense.
    if len(root_hub_list) == 0: 
            
        mapping_dict = {0:[i for i in range(1, len(context_list)+1)]}
        
        return mapping_dict
    
    idx = 0
    
    for context in context_list:
        
        idx += 1 #index based on position in list
    
        processed_context = nlp(context)
        
        if config.lemma == True:
            token_list = [token.lemma_ for token in processed_context] #tokens
        else:
            token_list = [token.text for token in processed_context] #tokens
        
        score_array = np.zeros(len(root_hub_list)) #initialise with zeros for every sense
        
        for token in token_list:
            
            if token in minimum_spanning_tree.nodes: #if word wasn't filtered out
                
                if token in score_dict: #memoisation
                        
                    new_score = score_dict[token]
                
                else:
                    
                    new_score = score(minimum_spanning_tree, 
                                      token, root_hub_list)
                    score_dict[token] = new_score #memoisation
                    
                score_array += new_score
            
            else:
            
                pass
        
        # If the disambiguator does not detect a sense, leave the context unassigned.
        if not any(score_array):
            
            pass
            
        else:
                
            # Apply the sense with the highest score to the context.
            argmax_score = np.argmax(score_array)
            
            # Clusters begin at 1
            mapping_dict[argmax_score + 1].append(idx)

    return mapping_dict


def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
    """Calls induction and disambiguation functions, performs main task.

    The task is to both induce senses and match search results to them. This
    function calls induce() and the configured disambiguation functions to
    perform these subtasks. The result is then written to the output directory
    specified in config.py.
    
    Args:
        topic_id: Index of the topic in topics.txt.
        topic_name: Target string.
        result_dict: Dictionary with topic IDs as keys and lists of search
            results (from results.txt) as values.
            
    """
    
    print('[a]', 'Inducing word senses for {}.'.format(topic_name))
    induction_result = induce(topic_name,result_dict[topic_id])
    if induction_result is None: # An output file for this topic already exists.
        return None
    graph, root_hub_list, stat_dict = induction_result
    
    colour_rank = config.colour_rank
    mst_rank = config.mst_rank
    
    # Merge the mappings according to the pipeline ranks set in config.py.
    mapping_dict = dict()
    
    # Match search results to the induced senses (clusters).
    print('[a]', 'Disambiguating results.\t('+topic_name+')')
    if colour_rank != 0:
        
        print('[a]', 'Colouring graph.\t('+topic_name+')')
        mapping_dict[colour_rank] = disambiguate_colour(graph, root_hub_list,
                                                        result_dict[topic_id])
        
    if mst_rank != 0:
        
        print('[a]', 'Building minimum spanning tree.\t('+topic_name+')')
        mapping_dict[mst_rank] = disambiguate_mst(graph,
                                                  root_hub_list,
                                                  result_dict[topic_id],
                                                  topic_name)
    
    mapping_list = [item[1] for item in sorted(mapping_dict.items())]
    mapping_count = len(mapping_list)
    
    merged_mapping_dict = mapping_list[0]
    merged_entry_count = 0
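    # Illustrative example: if the highest-ranked mapping assigns result 3 to
    # sense 1 and a lower-ranked mapping assigns results 3 and 4 to sense 2,
    # the merge keeps 3 in sense 1 and only adds the previously unassigned
    # result 4 to sense 2.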
    
    for i in range(1,mapping_count):
        
        result_list = [result for result_list in merged_mapping_dict.values() 
                       for result in result_list]
        #individual mappings
        relation_list = [(topic,result) for topic in mapping_list[i].keys()
                         for result in mapping_list[i][topic]]
        
        for topic, result in relation_list:
            
            if result not in result_list:
                
                merged_entry_count += 1
                
                if topic in merged_mapping_dict:
                    merged_mapping_dict[topic].append(result)
                    
                else:
                    merged_mapping_dict[topic] = [result]
                    
    stat_dict['merge_gain'] = merged_entry_count
    
    #collect statistics from result.
    cluster_count = 0
    cluster_length_list = list()
    
    for cluster,result_list in merged_mapping_dict.items():
        
        cluster_length = len(result_list)
        
        if cluster_length != 0:
            
            cluster_count += 1
            cluster_length_list.append(cluster_length)
            
    stat_dict['mean_cluster_length'] = np.mean(cluster_length_list)
    stat_dict['cluster_count'] = cluster_count

    print('[a]', 'Writing to file.\t('+topic_name+')')
    
    output_path = config.output
    output_file_name = output_path+topic_name+'.absinth'
    
    with open(output_file_name, 'w') as output_file:

        output_file.write('subTopicID\tresultID\n')

        for cluster_id,result_list in merged_mapping_dict.items():
            for result_id in result_list:
                output_line = '{}.{}\t{}.{}\n'.format(topic_id, cluster_id,
                                                      topic_id, result_id)
                output_file.write(output_line)
                
    pprint.pprint(stat_dict)
    


if __name__ == '__main__':
    """Check for modifiers and call main().
    
    Only called when absinth.py is started manually. Checks for various
    modifiers, i.e. test environment and number of processes to run 
    simultaneously.
    """
    
    # If absinth.py is run in test environment.
    if '-t' in sys.argv:
        data_path = config.test
    else:
        data_path = config.dataset
        
    result_dict, topic_dict = read_dataset(data_path)
    
    # Enables manual setting of process count.
    if '-p' in sys.argv:
        process_count = int(sys.argv[sys.argv.index('-p') + 1])
    else:
        process_count = 1
    
    with Pool(process_count) as pool:
        parameter_list = [(topic_id, topic_name, result_dict)
                          for topic_id,topic_name in topic_dict.items()]
        pool.starmap(main, sorted(parameter_list)) # Sort for a deterministic processing order.