diff --git a/src/absinth.py b/src/absinth.py
index 9a33af7447f397e5ab803af6396af42cca329d11..dba97025dd8e8b5c3b02712ed9db493dd278b880 100644
--- a/src/absinth.py
+++ b/src/absinth.py
@@ -1,4 +1,27 @@
 #!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""Word Sense Induction system for SemEval 2013, Task 11
+
+This module performs word sense induction for a given word on a corpus and
+matches a list of contexts to each induced sense. The method is a modified
+reimplementation of Véronis' Hyperlex (2004).
+
+Example:
+    The script can be run with the following command:
+
+        $ python3 absinth.py
+
+    The script can be called with a list of modifiers.
+
+Modifiers:
+    '-t': Runs absinth.py on the trial path given in config.py instead of the
+        data_path.
+    '-p n': Runs absinth.py with n concurrent processes (default: 1).
+
+.. _Association Based Semantic Induction Tools from Heidelberg:
+   https://gitlab.cl.uni-heidelberg.de/zimmermann/absinth
+
+"""
 import sys
 
 print('[A] Loading ' + sys.argv[0] + '.\n')
@@ -14,13 +37,22 @@
 from multiprocessing import Pool
 from nltk.corpus import stopwords
 from copy import deepcopy
+
 nlp = spacy.load('en') # standard english nlp
 
-def read_dataset(data_path):
+def read_dataset(data_path: str) -> (dict, dict):
     """Collects topics.txt and results.txt.
 
+    Iterates over topics.txt and results.txt in the data path and converts them
+    to dictionaries with the ID as key and the target word / title + snippet as
+    values.
+    Args:
+        data_path: File path to directory containing topics.txt and results.txt.
+
+    Returns:
+        One dictionary for each file.
 
     """
 
     results = dict()
@@ -51,7 +83,7 @@ def read_dataset(data_path):
 
     return results, topics
 
-def frequencies(target_string, search_result_list):
+def frequencies(target_string: str, search_result_list: list) -> (dict, dict):
     """Counts occurrences of nodes and cooccurrences.
 
     Iterates over the corpus (and snippets provided with the task) line by line
@@ -64,10 +96,9 @@
         search_result_list: List of titles and snippets provided with the task.
 
     Returns:
-        node_freq_dict: Dictionary of occurrences of every eligible token
-            within every context the target occurs in.
-        edge_freq_dict: Dictionary of occurrences of every eligible tuple of
-            tokens within every context the target occurs in.
+        A dictionary counting occurrences of every eligible token within every
+        context the target occurs in, and a dictionary counting occurrences of
+        every eligible tuple of tokens within those contexts.
 
 
     """
@@ -146,7 +177,8 @@
 
     return node_freq_dict, edge_freq_dict
 
-def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
+def process_file(context_list: list, target_string: str,
+                 node_freq_dict: dict, edge_freq_dict: dict) -> (dict, dict):
     """Updates the counts of nodes and edges for a given document and target.
 
     Amends the input dictionaries with counts from each context within the
@@ -164,8 +196,7 @@
             tokens within every context the target occurs in.
 
     Returns:
-        node_freq_dict: Updated version of the input node dict.
-        edge_freq_dict: Updated version of the input edge dict.
+        Updated versions of the input node dict and input edge dict.
 
     """
     spaced_target_string = target_string.replace('_', ' ')
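
The counting scheme that the frequencies and process_file docstrings describe can be sketched in a few lines. This is a minimal illustration under assumed inputs (pre-split contexts, no stopword or part-of-speech filtering, hypothetical names), not the module's actual implementation:

    from collections import defaultdict
    from itertools import combinations

    def count_frequencies(contexts, target):
        """Count tokens and token pairs once per context containing the target."""
        node_freq = defaultdict(int)
        edge_freq = defaultdict(int)
        for context in contexts:
            tokens = set(context.lower().split())  # each token counted once per context
            if target not in tokens:
                continue
            tokens.discard(target)
            for token in tokens:
                node_freq[token] += 1
            for pair in combinations(sorted(tokens), 2):  # unordered cooccurrence pairs
                edge_freq[pair] += 1
        return node_freq, edge_freq

In the module itself, process_file plays this per-document role, amending the running dictionaries rather than building fresh ones.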
""" spaced_target_string = target_string.replace('_', ' ') @@ -232,7 +263,7 @@ def process_file(context_list, target_string, node_freq_dict, edge_freq_dict): return node_freq_dict, edge_freq_dict -def build_graph(node_freq_dict, edge_freq_dict): +def build_graph(node_freq_dict: dict, edge_freq_dict: dict) -> nx.Graph: """Builds undirected weighted graph from dictionaries. Creates graph and appends every edge and node in the parameter dictionaries, @@ -245,8 +276,8 @@ def build_graph(node_freq_dict, edge_freq_dict): tokens within every context the target occurs in. Returns: - cooccurrence_graph: Filtered undirected dice weighted small word - cooccurrence graph for a given target entity. + Filtered undirected dice weighted small word cooccurrence graph for a + given target entity. """ min_node_freq = config.min_node_freq @@ -297,7 +328,7 @@ def build_graph(node_freq_dict, edge_freq_dict): return cooccurrence_graph -def root_hubs(graph, edge_freq_dict): +def root_hubs(graph: nx.Graph, edge_freq_dict: dict) -> list: """Identifies senses (root hubs) by choosing nodes with high degrees Selects root hubs according to the algorithm in Véronis (2004). Nodes with @@ -310,8 +341,8 @@ def root_hubs(graph, edge_freq_dict): edge_freq_dict: Dictionary of weights for every tuple in our graph. Returns: - hub_list: List of root hubs, i.e. strings that are selected using the - algorithm explained above. + List of root hubs, i.e. strings that are selected using the algorithm + explained above. """ min_neighbors = config.min_neighbors @@ -369,7 +400,7 @@ def root_hubs(graph, edge_freq_dict): return hub_list -def components(graph, root_hub_list, target_string): +def components(graph: nx.Graph, root_hub_list: list, target_string: str) -> nx.Graph: """Builds minimum spanning tree from graph and removes singletons. Applies components algorithm from Véronis (2004) and removes singletons. @@ -380,8 +411,8 @@ def components(graph, root_hub_list, target_string): target_string: Root of minimum spanning tree. Returns: - minimum_spanning_tree: Minimum spanning tree with target as - root and root hubs as direct children. Singletons removed. + Minimum spanning tree with target as root and root hubs as direct + children. Singletons removed. """ graph_copy = deepcopy(graph) @@ -400,7 +431,7 @@ def components(graph, root_hub_list, target_string): return minimum_spanning_tree -def score(graph, component, root_hub_list): +def score(graph: nx.Graph, component: str, root_hub_list: list) -> np.array: """Calculate score for a given component in a minimum spanning tree. First the correct root for the component is chosen. If no root hub is @@ -414,8 +445,7 @@ def score(graph, component, root_hub_list): root_hub_list: List of strings of root hubs (senses) of original graph. Returns: - score_array: Array with one score for the correct root hub and filled - with zeroes.. + Array with one score for the correct root hub and filled with zeroes. """ root_hub_count = len(root_hub_list) @@ -451,7 +481,7 @@ def score(graph, component, root_hub_list): return score_array -def induce(topic_name, result_list): +def induce(topic_name: str, result_list: list) -> (nx.Graph, list, dict): """Induces word senses for a given topic from corpus. Counts frequencies from corpus and search result list, builds graph from @@ -463,8 +493,9 @@ def induce(topic_name, result_list): result_list: List of search result (context) strings. Returns: - root_hub_list: List of root hub strings (senses). - stat dict: Various statistics. 
@@ -518,7 +549,8 @@
 
     return graph, root_hub_list, stat_dict
 
-def disambiguate(graph, root_hub_list, context_list, topic_name):
+def disambiguate(graph: nx.Graph, root_hub_list: list,
+                 context_list: list, topic_name: str) -> dict:
     """Matches contexts to senses.
 
     Builds minimum spanning tree from graph.
@@ -532,8 +564,7 @@
         topic_name: String of target word, also root of MST.
 
     Returns:
-        mapping_dict: Dictionary of root hubs (senses) as keys and context ids
-            as values.
+        Dictionary of root hubs (senses) as keys and context IDs as values.
 
     """
     #performs minimum_spanning_tree algorithm on graph
@@ -604,7 +635,7 @@
 
     return mapping_dict
 
-def main(topic_id, topic_name, result_dict):
+def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
     """Calls induction and disambiguation functions, performs main task.
 
     The task is to both induce senses and match search results to them. This
@@ -618,8 +649,6 @@
         result_dict: Dictionary with topic_id as key and list of search queries
             (from results.txt) as values.
 
-    Returns:
-        None
 
     """
     print('[a]', 'Inducing word senses for {}.'.format(topic_name))
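
The mapping that disambiguate produces can be pictured with a rough sketch of the scoring: each context token votes for the first root hub on its path to the MST root, weighted by 1/(1 + distance) as in Véronis (2004). The function below is illustrative only; its name and details are assumptions, and the target is taken to be the MST root as the components docstring states:

    import networkx as nx
    import numpy as np

    def assign_context(mst, root_hub_list, context_tokens, target):
        """Return the best-scoring root hub for a context, or None."""
        scores = np.zeros(len(root_hub_list))
        for token in set(context_tokens):
            if token == target or token not in mst:
                continue
            try:
                path = nx.shortest_path(mst, token, target)
            except nx.NetworkXNoPath:
                continue
            # the first root hub on the way to the root claims the token;
            # tokens closer to the hub contribute more
            for depth, node in enumerate(path):
                if node in root_hub_list:
                    scores[root_hub_list.index(node)] += 1 / (1 + depth)
                    break
        return root_hub_list[int(np.argmax(scores))] if scores.any() else None

Because root hubs sit as direct children of the root, every token's path passes through at most one hub, so each token votes for exactly one sense.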