Skip to content
Snippets Groups Projects
Commit c71dbe78 authored by Victor Zimmermann's avatar Victor Zimmermann
Browse files

(Mostly) finished Commenting.

parent 332c7e42
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Word Sense Induction system for SemEval 2013, Task 11
This module performs word sense induction for a given word on a corpus and
matches a list of contexts to each. The method to achieve this is a modified
reimplementation of Véronis' Hyperlex (2004).
Example:
The function can be called with the following command.:
$ python3 absinth.py
The function can be called with a list of modifiers.
Modifiers:
'-t': Runs absinth.py on the trial path given in the config.py instead of the
data_path.
'-p n': Runs absinth.py with n concurrent processes (standard: 1).
.. _Association Based Semantic Induction Tools from Heidelberg:
https://gitlab.cl.uni-heidelberg.de/zimmermann/absinth
"""
import sys
print('[A] Loading ' + sys.argv[0] + '.\n')
......@@ -14,13 +37,22 @@ from multiprocessing import Pool
from nltk.corpus import stopwords
from copy import deepcopy
nlp = spacy.load('en') # standard english nlp
def read_dataset(data_path):
def read_dataset(data_path: str) -> (dict, dict):
"""Collects topics.txt and results.txt.
Iterates over topics.txt and results.txt in the data path and converts them
to dictionaries with the ID as key and the target word / title + snippet as
values.
Args:
data_path: File path to directory containing topics.txt and results.txt.
Returns:
One dictionary for each file.
"""
results = dict()
......@@ -51,7 +83,7 @@ def read_dataset(data_path):
return results, topics
def frequencies(target_string, search_result_list):
def frequencies(target_string: str, search_result_list: list) -> (dict, dict):
"""Counts occurrences of nodes and cooccurrences.
Iterates over the corpus (and snippets provided with the task) line by line
......@@ -64,10 +96,9 @@ def frequencies(target_string, search_result_list):
search_result_list: List of titles and snippets provided with the task.
Returns:
node_freq_dict: Dictionary of occurrences of every eligible token
within every context the target occurs in.
edge_freq_dict: Dictionary of occurrences of every eligible tuple of
tokens within every context the target occurs in.
Dictionary of occurrences of every eligible token within every context
the target occurs in, dictionary of occurrences of every eligible
tuple of tokens within every context the target occurs in.
"""
......@@ -146,7 +177,8 @@ def frequencies(target_string, search_result_list):
return node_freq_dict, edge_freq_dict
def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
def process_file(context_list: list, target_string: str,
node_freq_dict: dict, edge_freq_dict: dict) -> (dict, dict):
"""Updates the counts of nodes and edges for a given document and target.
Amends the input dictionaries with counts from each context within the
......@@ -164,8 +196,7 @@ def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
tokens within every context the target occurs in.
Returns:
node_freq_dict: Updated version of the input node dict.
edge_freq_dict: Updated version of the input edge dict.
Updated versions of the input node dict and input edge dict.
"""
spaced_target_string = target_string.replace('_', ' ')
......@@ -232,7 +263,7 @@ def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
return node_freq_dict, edge_freq_dict
def build_graph(node_freq_dict, edge_freq_dict):
def build_graph(node_freq_dict: dict, edge_freq_dict: dict) -> nx.Graph:
"""Builds undirected weighted graph from dictionaries.
Creates graph and appends every edge and node in the parameter dictionaries,
......@@ -245,8 +276,8 @@ def build_graph(node_freq_dict, edge_freq_dict):
tokens within every context the target occurs in.
Returns:
cooccurrence_graph: Filtered undirected dice weighted small world
cooccurrence graph for a given target entity.
Filtered undirected dice weighted small world cooccurrence graph for a
given target entity.
"""
min_node_freq = config.min_node_freq
......@@ -297,7 +328,7 @@ def build_graph(node_freq_dict, edge_freq_dict):
return cooccurrence_graph
def root_hubs(graph, edge_freq_dict):
def root_hubs(graph: nx.Graph, edge_freq_dict: dict) -> list:
"""Identifies senses (root hubs) by choosing nodes with high degrees.
Selects root hubs according to the algorithm in Véronis (2004). Nodes with
......@@ -310,8 +341,8 @@ def root_hubs(graph, edge_freq_dict):
edge_freq_dict: Dictionary of weights for every tuple in our graph.
Returns:
hub_list: List of root hubs, i.e. strings that are selected using the
algorithm explained above.
List of root hubs, i.e. strings that are selected using the algorithm
explained above.
"""
min_neighbors = config.min_neighbors
......@@ -369,7 +400,7 @@ def root_hubs(graph, edge_freq_dict):
return hub_list
def components(graph, root_hub_list, target_string):
def components(graph: nx.Graph, root_hub_list: list, target_string: str) -> nx.Graph:
"""Builds minimum spanning tree from graph and removes singletons.
Applies components algorithm from Véronis (2004) and removes singletons.
......@@ -380,8 +411,8 @@ def components(graph, root_hub_list, target_string):
target_string: Root of minimum spanning tree.
Returns:
minimum_spanning_tree: Minimum spanning tree with target as
root and root hubs as direct children. Singletons removed.
Minimum spanning tree with target as root and root hubs as direct
children. Singletons removed.
"""
graph_copy = deepcopy(graph)
......@@ -400,7 +431,7 @@ def components(graph, root_hub_list, target_string):
return minimum_spanning_tree
def score(graph, component, root_hub_list):
def score(graph: nx.Graph, component: str, root_hub_list: list) -> np.array:
"""Calculate score for a given component in a minimum spanning tree.
First the correct root for the component is chosen. If no root hub is
......@@ -414,8 +445,7 @@ def score(graph, component, root_hub_list):
root_hub_list: List of strings of root hubs (senses) of original graph.
Returns:
score_array: Array with one score for the correct root hub and filled
with zeroes.
Array with one score for the correct root hub and filled with zeroes.
"""
root_hub_count = len(root_hub_list)
......@@ -451,7 +481,7 @@ def score(graph, component, root_hub_list):
return score_array
def induce(topic_name, result_list):
def induce(topic_name: str, result_list: list) -> (nx.Graph, list, dict):
"""Induces word senses for a given topic from corpus.
Counts frequencies from corpus and search result list, builds graph from
......@@ -463,8 +493,9 @@ def induce(topic_name, result_list):
result_list: List of search result (context) strings.
Returns:
root_hub_list: List of root hub strings (senses).
stat dict: Various statistics.
A cooccurrence graph,
a list of root hub strings (senses)
and dictionary of various statistics.
"""
stat_dict = dict()
......@@ -518,7 +549,8 @@ def induce(topic_name, result_list):
return graph, root_hub_list, stat_dict
def disambiguate(graph, root_hub_list, context_list, topic_name):
def disambiguate(graph: nx.Graph, root_hub_list: list,
context_list: list, topic_name: str) -> dict:
"""Matches contexts to senses.
Builds minimum spanning tree from graph.
......@@ -532,8 +564,7 @@ def disambiguate(graph, root_hub_list, context_list, topic_name):
topic_name: String of target word, also root of MST.
Returns:
mapping_dict: Dictionary of root hubs (senses) as keys and context ids
as values.
Dictionary of root hubs (senses) as keys and context IDs as values.
"""
#performs minimum_spanning_tree algorithm on graph
......@@ -604,7 +635,7 @@ def disambiguate(graph, root_hub_list, context_list, topic_name):
return mapping_dict
def main(topic_id, topic_name, result_dict):
def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
"""Calls induction and disambiguation functions, performs main task.
The task is to both induce senses and match search results to them. This
......@@ -618,8 +649,6 @@ def main(topic_id, topic_name, result_dict):
result_dict: Dictionary with topic_id as key and list of search queries
(from results.txt) as values.
Returns:
None
"""
print('[a]', 'Inducing word senses for {}.'.format(topic_name))
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment