Commit c71dbe78 authored by Victor Zimmermann

(Mostly) finished Commenting.

parent 332c7e42
 #!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""Word Sense Induction system for SemEval 2013, Task 11.
+
+This module performs word sense induction for a given word on a corpus and
+matches a list of contexts to each sense. The method is a modified
+reimplementation of Véronis' Hyperlex (2004).
+
+Example:
+    The module can be called with the following command:
+
+        $ python3 absinth.py
+
+    The call accepts a list of modifiers.
+
+Modifiers:
+    '-t': Runs absinth.py on the trial path given in config.py instead of
+        data_path.
+    '-p n': Runs absinth.py with n concurrent processes (default: 1).
+
+.. _Association Based Semantic Induction Tools from Heidelberg:
+    https://gitlab.cl.uni-heidelberg.de/zimmermann/absinth
+"""
 import sys
 print('[A] Loading ' + sys.argv[0] + '.\n')
@@ -14,13 +37,22 @@ from multiprocessing import Pool
 from nltk.corpus import stopwords
 from copy import deepcopy
 nlp = spacy.load('en')  # standard English nlp
-def read_dataset(data_path):
+def read_dataset(data_path: str) -> (dict, dict):
     """Collects topics.txt and results.txt.
+
+    Iterates over topics.txt and results.txt in the data path and converts
+    them to dictionaries with the ID as key and the target word / title +
+    snippet as values.
+
+    Args:
+        data_path: File path to directory containing topics.txt and
+            results.txt.
+
+    Returns:
+        One dictionary for each file.
     """
     results = dict()
@@ -51,7 +83,7 @@ def read_dataset(data_path):
     return results, topics
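
For orientation, a minimal sketch of the reading step the docstring describes. The column layout of topics.txt and results.txt is an assumption here (the hunk only shows the docstring), as is the helper name:

import os

def read_dataset_sketch(data_path):
    # Hypothetical sketch, not the committed implementation: assumes
    # tab-separated files whose first column is an ID.
    topics, results = dict(), dict()
    with open(os.path.join(data_path, 'topics.txt'), encoding='utf-8') as f:
        next(f)  # assumed header row
        for line in f:
            topic_id, target_word = line.rstrip('\n').split('\t')[:2]
            topics[topic_id] = target_word
    with open(os.path.join(data_path, 'results.txt'), encoding='utf-8') as f:
        next(f)  # assumed header row
        for line in f:
            columns = line.rstrip('\n').split('\t')
            # Group title + snippet under the topic part of the result ID
            # (e.g. '1.2' -> '1').
            results.setdefault(columns[0].split('.')[0], []).append(' '.join(columns[1:]))
    return results, topics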
-def frequencies(target_string, search_result_list):
+def frequencies(target_string: str, search_result_list: list) -> (dict, dict):
     """Counts occurrences of nodes and cooccurrences.
     Iterates over the corpus (and snippets provided with the task) line by line
@@ -64,10 +96,9 @@ def frequencies(target_string, search_result_list):
         search_result_list: List of titles and snippets provided with the task.
     Returns:
-        node_freq_dict: Dictionary of occurrences of every eligible token
-            within every context the target occurs in.
-        edge_freq_dict: Dictionary of occurrences of every eligible tuple of
-            tokens within every context the target occurs in.
+        Dictionary of occurrences of every eligible token within every context
+        the target occurs in, dictionary of occurrences of every eligible
+        tuple of tokens within every context the target occurs in.
     """
@@ -146,7 +177,8 @@ def frequencies(target_string, search_result_list):
     return node_freq_dict, edge_freq_dict
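
The counting scheme can be illustrated with a toy version. This is a sketch assuming plain whitespace tokenization and one count per context; the committed code presumably filters tokens through the spaCy pipeline and NLTK stopwords loaded above:

from itertools import combinations

def count_frequencies_sketch(target, contexts):
    node_freq, edge_freq = dict(), dict()
    for context in contexts:
        tokens = set(context.lower().split())
        if target not in tokens:
            continue  # only contexts containing the target are counted
        tokens.discard(target)
        for token in tokens:
            node_freq[token] = node_freq.get(token, 0) + 1
        # Every unordered pair of tokens cooccurring in this context.
        for pair in combinations(sorted(tokens), 2):
            edge_freq[pair] = edge_freq.get(pair, 0) + 1
    return node_freq, edge_freq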
-def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
+def process_file(context_list: list, target_string: str,
+                 node_freq_dict: dict, edge_freq_dict: dict) -> (dict, dict):
     """Updates the counts of nodes and edges for a given document and target.
     Amends the input dictionaries with counts from each context within the
@@ -164,8 +196,7 @@ def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
         tokens within every context the target occurs in.
     Returns:
-        node_freq_dict: Updated version of the input node dict.
-        edge_freq_dict: Updated version of the input edge dict.
+        Updated versions of the input node dict and input edge dict.
     """
     spaced_target_string = target_string.replace('_', ' ')
@@ -232,7 +263,7 @@ def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
     return node_freq_dict, edge_freq_dict
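
Since process_file amends existing dictionaries rather than building new ones, the in-place update pattern is the important detail. A sketch using collections.Counter (an illustrative substitution; the committed code appears to use plain dicts):

from collections import Counter
from itertools import combinations

def update_counts_sketch(context_tokens, node_freq, edge_freq):
    # Amend the running totals with one context's tokens and token pairs.
    node_freq.update(context_tokens)
    edge_freq.update(combinations(sorted(context_tokens), 2))
    return node_freq, edge_freq

node_freq, edge_freq = Counter(), Counter()
update_counts_sketch({'grape', 'wine', 'bottle'}, node_freq, edge_freq)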
-def build_graph(node_freq_dict, edge_freq_dict):
+def build_graph(node_freq_dict: dict, edge_freq_dict: dict) -> nx.Graph:
     """Builds undirected weighted graph from dictionaries.
     Creates graph and appends every edge and node in the parameter dictionaries,
@@ -245,8 +276,8 @@ def build_graph(node_freq_dict, edge_freq_dict):
         tokens within every context the target occurs in.
     Returns:
-        cooccurrence_graph: Filtered undirected dice weighted small word
-            cooccurrence graph for a given target entity.
+        Filtered undirected Dice-weighted small world cooccurrence graph for
+        a given target entity.
     """
     min_node_freq = config.min_node_freq
@@ -297,7 +328,7 @@ def build_graph(node_freq_dict, edge_freq_dict):
     return cooccurrence_graph
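
The Dice coefficient for a pair is 2·f(a,b) / (f(a)+f(b)); a common choice is to invert it so that strongly associated pairs get weights near 0 and minimum spanning trees prefer them. A sketch under that assumption, with invented default values for the config thresholds read above:

import networkx as nx

def build_graph_sketch(node_freq, edge_freq,
                       min_node_freq=10, min_edge_freq=5, max_weight=0.9):
    graph = nx.Graph()
    graph.add_nodes_from(n for n, f in node_freq.items() if f >= min_node_freq)
    for (a, b), freq in edge_freq.items():
        if freq < min_edge_freq or a not in graph or b not in graph:
            continue
        dice = 2 * freq / (node_freq[a] + node_freq[b])
        weight = 1 - dice  # low weight = strong association (assumed inversion)
        if weight <= max_weight:  # filter near-random cooccurrences
            graph.add_edge(a, b, weight=weight)
    return graph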
-def root_hubs(graph, edge_freq_dict):
+def root_hubs(graph: nx.Graph, edge_freq_dict: dict) -> list:
     """Identifies senses (root hubs) by choosing nodes with high degrees.
     Selects root hubs according to the algorithm in Véronis (2004). Nodes with
@@ -310,8 +341,8 @@ def root_hubs(graph, edge_freq_dict):
         edge_freq_dict: Dictionary of weights for every tuple in our graph.
     Returns:
-        hub_list: List of root hubs, i.e. strings that are selected using the
-            algorithm explained above.
+        List of root hubs, i.e. strings that are selected using the algorithm
+        explained above.
     """
     min_neighbors = config.min_neighbors
@@ -369,7 +400,7 @@ def root_hubs(graph, edge_freq_dict):
    return hub_list
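
Véronis' hub selection is greedy: repeatedly take the remaining node of highest degree, accept it as a hub if its best neighbors are strongly associated with it, and delete its neighborhood before continuing. A sketch based on the paper; the threshold values and the mean-weight acceptance test are assumptions, not read from this diff:

import networkx as nx

def root_hubs_sketch(graph, min_neighbors=4, theta=0.8):
    work = graph.copy()
    hub_list = []
    while work.number_of_nodes() > 0:
        node = max(work.nodes, key=work.degree)  # highest-degree candidate
        neighbors = list(work[node])
        if len(neighbors) < min_neighbors:
            break  # remaining nodes are too weakly connected to be senses
        best = sorted(neighbors, key=lambda n: work[node][n]['weight'])[:min_neighbors]
        if sum(work[node][n]['weight'] for n in best) / min_neighbors < theta:
            hub_list.append(node)  # tightly knit neighborhood -> new sense
        work.remove_nodes_from([node] + neighbors)  # never reuse this region
    return hub_list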
-def components(graph, root_hub_list, target_string):
+def components(graph: nx.Graph, root_hub_list: list, target_string: str) -> nx.Graph:
     """Builds minimum spanning tree from graph and removes singletons.
     Applies components algorithm from Véronis (2004) and removes singletons.
@@ -380,8 +411,8 @@ def components(graph, root_hub_list, target_string):
         target_string: Root of minimum spanning tree.
     Returns:
-        minimum_spanning_tree: Minimum spanning tree with target as
-            root and root hubs as direct children. Singletons removed.
+        Minimum spanning tree with target as root and root hubs as direct
+        children. Singletons removed.
     """
     graph_copy = deepcopy(graph)
@@ -400,7 +431,7 @@ def components(graph, root_hub_list, target_string):
     return minimum_spanning_tree
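
The usual trick for making the hubs direct children of the target is to join the target to every hub with weight 0 before computing the MST, so those edges are always kept. A sketch of that step (the zero-weight linking is inferred from the docstring, not visible in the hunk):

import networkx as nx

def components_sketch(graph, root_hub_list, target_string):
    work = graph.copy()
    for hub in root_hub_list:
        # Zero weight guarantees these edges survive into the MST.
        work.add_edge(target_string, hub, weight=0)
    mst = nx.minimum_spanning_tree(work)
    # Remove singletons, i.e. nodes that ended up without any edge.
    mst.remove_nodes_from([n for n in list(mst) if mst.degree(n) == 0])
    return mst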
-def score(graph, component, root_hub_list):
+def score(graph: nx.Graph, component: str, root_hub_list: list) -> np.array:
     """Calculates the score for a given component in a minimum spanning tree.
     First the correct root for the component is chosen. If no root hub is
@@ -414,8 +445,7 @@ def score(graph, component, root_hub_list):
         root_hub_list: List of strings of root hubs (senses) of original graph.
     Returns:
-        score_array: Array with one score for the correct root hub and filled
-            with zeroes..
+        Array with one score for the correct root hub and filled with zeroes.
     """
     root_hub_count = len(root_hub_list)
@@ -451,7 +481,7 @@ def score(graph, component, root_hub_list):
     return score_array
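
A common Hyperlex-style score is 1/(1+d), where d is the tree distance from the component to its nearest root hub; only that hub's position in the array receives a non-zero value. A sketch in which the exact distance measure and formula are assumptions taken from Véronis (2004):

import networkx as nx
import numpy as np

def score_sketch(tree, component, root_hub_list):
    score_array = np.zeros(len(root_hub_list))
    distances = [
        nx.shortest_path_length(tree, component, hub, weight='weight')
        if nx.has_path(tree, component, hub) else float('inf')
        for hub in root_hub_list
    ]
    nearest = int(np.argmin(distances))
    if distances[nearest] != float('inf'):
        # Closer hubs score higher; unreachable hubs leave all zeroes.
        score_array[nearest] = 1 / (1 + distances[nearest])
    return score_array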
-def induce(topic_name, result_list):
+def induce(topic_name: str, result_list: list) -> (nx.Graph, list, dict):
     """Induces word senses for a given topic from corpus.
     Counts frequencies from corpus and search result list, builds graph from
@@ -463,8 +493,9 @@ def induce(topic_name, result_list):
         result_list: List of search result (context) strings.
     Returns:
-        root_hub_list: List of root hub strings (senses).
-        stat_dict: Various statistics.
+        A cooccurrence graph,
+        a list of root hub strings (senses),
+        and a dictionary of various statistics.
     """
     stat_dict = dict()
@@ -518,7 +549,8 @@ def induce(topic_name, result_list):
     return graph, root_hub_list, stat_dict
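
Putting the pieces together, induce is essentially a pipeline over the helpers above. A sketch composed from the sketch functions in the previous notes (the statistics keys are invented):

def induce_sketch(topic_name, result_list):
    node_freq, edge_freq = count_frequencies_sketch(topic_name, result_list)
    graph = build_graph_sketch(node_freq, edge_freq)
    root_hub_list = root_hubs_sketch(graph)
    stat_dict = {'nodes': graph.number_of_nodes(),
                 'edges': graph.number_of_edges(),
                 'senses': len(root_hub_list)}  # assumed statistics keys
    return graph, root_hub_list, stat_dict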
-def disambiguate(graph, root_hub_list, context_list, topic_name):
+def disambiguate(graph: nx.Graph, root_hub_list: list,
+                 context_list: list, topic_name: str) -> dict:
     """Matches contexts to senses.
     Builds minimum spanning tree from graph.
@@ -532,8 +564,7 @@ def disambiguate(graph, root_hub_list, context_list, topic_name):
         topic_name: String of target word, also root of MST.
     Returns:
-        mapping_dict: Dictionary of root hubs (senses) as keys and context ids
-            as values.
+        Dictionary of root hubs (senses) as keys and context IDs as values.
     """
     # Performs minimum spanning tree algorithm on graph.
@@ -604,7 +635,7 @@ def disambiguate(graph, root_hub_list, context_list, topic_name):
     return mapping_dict
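
Disambiguation then sums score vectors over the tokens of each context and assigns the context to the hub with the highest total. A sketch reusing the earlier sketch functions; the argmax tie-breaking and the 1-based context IDs are assumptions:

import numpy as np

def disambiguate_sketch(graph, root_hub_list, context_list, topic_name):
    tree = components_sketch(graph, root_hub_list, topic_name)
    mapping_dict = {hub: [] for hub in root_hub_list}
    for context_id, context in enumerate(context_list, start=1):
        total = np.zeros(len(root_hub_list))
        for token in set(context.lower().split()):
            if token in tree and token != topic_name:
                total += score_sketch(tree, token, root_hub_list)
        if total.any():  # contexts with no evidence stay unassigned
            mapping_dict[root_hub_list[int(np.argmax(total))]].append(context_id)
    return mapping_dict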
-def main(topic_id, topic_name, result_dict):
+def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
     """Calls induction and disambiguation functions, performs main task.
     The task is to both induce senses and match search results to them. This
@@ -618,8 +649,6 @@ def main(topic_id, topic_name, result_dict):
         result_dict: Dictionary with topic_id as key and list of search queries
             (from results.txt) as values.
-    Returns:
-        None
     """
     print('[a]', 'Inducing word senses for {}.'.format(topic_name))
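
For completeness, the '-p n' modifier from the module docstring suggests a driver along these lines. This is a sketch: the actual argument parsing and paths live in config.py and are not shown in this diff, and read_dataset_sketch is the hypothetical helper from the first note:

from multiprocessing import Pool

if __name__ == '__main__':
    result_dict, topic_dict = read_dataset_sketch('data/')  # assumed path
    jobs = [(topic_id, topic_name, result_dict)
            for topic_id, topic_name in topic_dict.items()]
    with Pool(processes=2) as pool:  # corresponds to '-p 2'
        pool.starmap(main, jobs)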