From c71dbe782bb957c2f829c549696239a01fc851ec Mon Sep 17 00:00:00 2001
From: Victor Zimmermann <zimmermann@cl.uni-heidelberg.de>
Date: Mon, 19 Mar 2018 19:47:58 +0100
Subject: [PATCH] (Mostly) finished Commenting.

---
 src/absinth.py | 89 +++++++++++++++++++++++++++++++++-----------------
 1 file changed, 59 insertions(+), 30 deletions(-)

diff --git a/src/absinth.py b/src/absinth.py
index 9a33af7..dba9702 100644
--- a/src/absinth.py
+++ b/src/absinth.py
@@ -1,4 +1,27 @@
 #!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""Word Sense Induction system for SemEval 2013, Task 11
+
+This module performs word sense induction for a given word on a corpus and
+matches a list of contexts to each. The method to achieve this is a modified
+reimplementation of Véronis' Hyperlex (2004).
+
+Example:
+    The function can be called with the following command:
+    
+        $ python3 absinth.py
+        
+    The function can be called with a list of modifiers.
+    
+Modifiers:
+    '-t': Runs absinth.py on the trial path given in config.py instead of the
+        data_path.
+    '-p n': Runs absinth.py with n concurrent processes (default: 1).
+
+.. _Association Based Semantic Induction Tools from Heidelberg:
+    https://gitlab.cl.uni-heidelberg.de/zimmermann/absinth
+
+"""
 
 import sys
 print('[A] Loading ' + sys.argv[0] + '.\n')
@@ -14,13 +37,22 @@ from multiprocessing import Pool
 from nltk.corpus import stopwords
 from copy import deepcopy
 
+
 nlp = spacy.load('en') # standard english nlp
 
 
-def read_dataset(data_path):
+def read_dataset(data_path: str) -> (dict, dict):
     """Collects topics.txt and results.txt.
     
+    Iterates over topics.txt and results.txt in the data path and converts them
+    to dictionaries with the ID as key and the target word / title + snippet as
+    values.
     
+    Args:
+        data_path: File path to directory containing topics.txt and results.txt.
+        
+    Returns:
+        One dictionary for each file. 
     """
     
     results = dict()
@@ -51,7 +83,7 @@ def read_dataset(data_path):
     return results, topics
 
 
-def frequencies(target_string, search_result_list):
+def frequencies(target_string: str, search_result_list: list) -> (dict, dict):
     """Counts occurrences of nodes and cooccurrences.
     
     Iterates over the corpus (and snippets provided with the task) line by line 
@@ -64,10 +96,9 @@ def frequencies(target_string, search_result_list):
         search_result_list: List of titles and snippets provided with the task.
         
     Returns:
-        node_freq_dict: Dictionary of occurrences of every eligible token
-            within every context the target occurs in.
-        edge_freq_dict: Dictionary of occurrences of every eligible tuple of
-            tokens within every context the target occurs in.
+        Dictionary of occurrences of every eligible token within every context
+            the target occurs in, dictionary of occurrences of every eligible
+            tuple of tokens within every context the target occurs in.
     
     """
 
@@ -146,7 +177,8 @@ def frequencies(target_string, search_result_list):
     return node_freq_dict, edge_freq_dict
 
 
-def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
+def process_file(context_list: list, target_string: str,
+                 node_freq_dict: dict, edge_freq_dict: dict) -> (dict, dict): 
     """Updates the counts of nodes and edges for a given document and target.
     
     Ammends the input dictionaries with counts from each context withing the
@@ -164,8 +196,7 @@ def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
             tokens within every context the target occurs in.
     
     Returns:
-        node_freq_dict: Updated version of the input node dict.
-        edge_freq_dict: Updated version of the input edge dict.
+        Updated versions of the input node dict and input edge dict.
     """
     
     spaced_target_string = target_string.replace('_', ' ')
@@ -232,7 +263,7 @@ def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
     return node_freq_dict, edge_freq_dict
 
 
-def build_graph(node_freq_dict, edge_freq_dict):
+def build_graph(node_freq_dict: dict, edge_freq_dict: dict) -> nx.Graph:
     """Builds undirected weighted graph from dictionaries.
     
     Creates graph and appends every edge and node in the parameter dictionaries,
@@ -245,8 +276,8 @@ def build_graph(node_freq_dict, edge_freq_dict):
             tokens within every context the target occurs in.
     
     Returns:
-        cooccurrence_graph: Filtered undirected dice weighted small word 
-            cooccurrence graph for a given target entity.
+        Filtered undirected dice weighted small world cooccurrence graph for a
+            given target entity.
     """
     
     min_node_freq = config.min_node_freq
@@ -297,7 +328,7 @@ def build_graph(node_freq_dict, edge_freq_dict):
     return cooccurrence_graph
 
 
-def root_hubs(graph, edge_freq_dict):
+def root_hubs(graph: nx.Graph, edge_freq_dict: dict) -> list:
     """Identifies senses (root hubs) by choosing nodes with high degrees
     
     Selects root hubs according to the algorithm in Véronis (2004). Nodes with
@@ -310,8 +341,8 @@ def root_hubs(graph, edge_freq_dict):
         edge_freq_dict: Dictionary of weights for every tuple in our graph.
         
     Returns:
-        hub_list: List of root hubs, i.e. strings that are selected using the
-            algorithm explained above.
+        List of root hubs, i.e. strings that are selected using the algorithm
+            explained above.
     """
     
     min_neighbors = config.min_neighbors
@@ -369,7 +400,7 @@ def root_hubs(graph, edge_freq_dict):
     return hub_list
 
 
-def components(graph, root_hub_list, target_string):
+def components(graph: nx.Graph, root_hub_list: list, target_string: str) -> nx.Graph:
     """Builds minimum spanning tree from graph and removes singletons.
     
     Applies components algorithm from Véronis (2004) and removes singletons.
@@ -380,8 +411,8 @@ def components(graph, root_hub_list, target_string):
         target_string: Root of minimum spanning tree.
         
     Returns:
-        minimum_spanning_tree: Minimum spanning tree with target as
-            root and root hubs as direct children. Singletons removed.
+        Minimum spanning tree with target as root and root hubs as direct
+            children. Singletons removed.
     """
     
     graph_copy = deepcopy(graph)
@@ -400,7 +431,7 @@ def components(graph, root_hub_list, target_string):
     return minimum_spanning_tree
 
 
-def score(graph, component, root_hub_list):
+def score(graph: nx.Graph, component: str, root_hub_list: list) -> np.array:
     """Calculate score for a given component in a minimum spanning tree.
     
     First the correct root for the component is chosen. If no root hub is
@@ -414,8 +445,7 @@ def score(graph, component, root_hub_list):
         root_hub_list: List of strings of root hubs (senses) of original graph.
     
     Returns:
-        score_array: Array with one score for the correct root hub and filled 
-            with zeroes..
+        Array with one score for the correct root hub and filled with zeroes.
     """
     
     root_hub_count = len(root_hub_list)
@@ -451,7 +481,7 @@ def score(graph, component, root_hub_list):
     return score_array
 
 
-def induce(topic_name, result_list):
+def induce(topic_name: str, result_list: list) -> (nx.Graph, list, dict):
     """Induces word senses for a given topic from corpus.
     
     Counts frequencies from corpus and search result list, builds graph from
@@ -463,8 +493,9 @@ def induce(topic_name, result_list):
         result_list: List of search result (context) strings.
         
     Returns:
-        root_hub_list: List of root hub strings (senses).
-        stat dict: Various statistics.
+        A cooccurrence graph,
+            a list of root hub strings (senses)
+            and a dictionary of various statistics.
     """
     
     stat_dict = dict()
@@ -518,7 +549,8 @@ def induce(topic_name, result_list):
         return graph, root_hub_list, stat_dict
 
 
-def disambiguate(graph, root_hub_list, context_list, topic_name):
+def disambiguate(graph: nx.Graph, root_hub_list: list,
+                 context_list: list, topic_name: str) -> dict:
     """Matches contexts to senses.
     
     Builds minimum spanning tree from graph.
@@ -532,8 +564,7 @@ def disambiguate(graph, root_hub_list, context_list, topic_name):
         topic_name: String of target word, also root of MST.
     
     Returns:
-        mapping_dict: Dictionary of root hubs (senses) as keys and context ids
-            as values.
+        Dictionary of root hubs (senses) as keys and context IDs as values.
     """
     
     #performs minimum_spanning_tree algorithm on graph
@@ -604,7 +635,7 @@ def disambiguate(graph, root_hub_list, context_list, topic_name):
     return mapping_dict
 
 
-def main(topic_id, topic_name, result_dict):
+def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
     """Calls induction and disambiguation functions, performs main task.
 
     The task is to both induce senses and match search results to them. This
@@ -618,8 +649,6 @@ def main(topic_id, topic_name, result_dict):
         result_dict: Dictionary with topic_id as key and list of search queries
             (from results.txt) as values.
             
-    Returns:
-        None
     """
     
     print('[a]', 'Inducing word senses for {}.'.format(topic_name))
-- 
GitLab