Commit c71dbe78 authored by Victor Zimmermann

(Mostly) finished Commenting.

parent 332c7e42
 #!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""Word Sense Induction system for SemEval 2013, Task 11.
+
+This module performs word sense induction for a given word on a corpus and
+matches a list of contexts to each sense. The method is a modified
+reimplementation of Véronis' Hyperlex (2004).
+
+Example:
+    The module can be called with the following command:
+
+        $ python3 absinth.py
+
+    The call accepts a list of modifiers.
+
+Modifiers:
+    '-t': Runs absinth.py on the trial path given in config.py instead of
+        data_path.
+    '-p n': Runs absinth.py with n concurrent processes (default: 1).
+
+.. _Association Based Semantic Induction Tools from Heidelberg:
+    https://gitlab.cl.uni-heidelberg.de/zimmermann/absinth
+"""
 import sys
 print('[A] Loading ' + sys.argv[0] + '.\n')
@@ -14,13 +37,22 @@ from multiprocessing import Pool
 from nltk.corpus import stopwords
 from copy import deepcopy
 nlp = spacy.load('en')  # standard English nlp
-def read_dataset(data_path):
+def read_dataset(data_path: str) -> (dict, dict):
     """Collects topics.txt and results.txt.
+
+    Iterates over topics.txt and results.txt in the data path and converts
+    them to dictionaries with the ID as key and the target word / title +
+    snippet as values.
+
+    Args:
+        data_path: File path to directory containing topics.txt and
+            results.txt.
+
+    Returns:
+        One dictionary for each file.
     """
     results = dict()
@@ -51,7 +83,7 @@ def read_dataset(data_path):
     return results, topics
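
For orientation, a minimal sketch of the reading step the docstring describes. The column layout of topics.txt and results.txt is an assumption here (the hunk only shows the docstring), as is the helper name:

import os

def read_dataset_sketch(data_path):
    # Hypothetical sketch, not the committed implementation: assumes
    # tab-separated files whose first column is an ID.
    topics, results = dict(), dict()
    with open(os.path.join(data_path, 'topics.txt'), encoding='utf-8') as f:
        next(f)  # assumed header row
        for line in f:
            topic_id, target_word = line.rstrip('\n').split('\t')[:2]
            topics[topic_id] = target_word
    with open(os.path.join(data_path, 'results.txt'), encoding='utf-8') as f:
        next(f)  # assumed header row
        for line in f:
            columns = line.rstrip('\n').split('\t')
            # Group title + snippet under the topic part of the result ID
            # (e.g. '1.2' -> '1').
            results.setdefault(columns[0].split('.')[0], []).append(' '.join(columns[1:]))
    return results, topics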
-def frequencies(target_string, search_result_list):
+def frequencies(target_string: str, search_result_list: list) -> (dict, dict):
     """Counts occurrences of nodes and cooccurrences.
     Iterates over the corpus (and snippets provided with the task) line by line
@@ -64,10 +96,9 @@ def frequencies(target_string, search_result_list):
         search_result_list: List of titles and snippets provided with the task.
     Returns:
-        node_freq_dict: Dictionary of occurrences of every eligible token
-            within every context the target occurs in.
-        edge_freq_dict: Dictionary of occurrences of every eligible tuple of
-            tokens within every context the target occurs in.
+        Dictionary of occurrences of every eligible token within every context
+        the target occurs in, dictionary of occurrences of every eligible
+        tuple of tokens within every context the target occurs in.
     """
@@ -146,7 +177,8 @@ def frequencies(target_string, search_result_list):
     return node_freq_dict, edge_freq_dict
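
The counting scheme can be illustrated with a toy version. This is a sketch assuming plain whitespace tokenization and one count per context; the committed code presumably filters tokens through the spaCy pipeline and NLTK stopwords loaded above:

from itertools import combinations

def count_frequencies_sketch(target, contexts):
    node_freq, edge_freq = dict(), dict()
    for context in contexts:
        tokens = set(context.lower().split())
        if target not in tokens:
            continue  # only contexts containing the target are counted
        tokens.discard(target)
        for token in tokens:
            node_freq[token] = node_freq.get(token, 0) + 1
        # Every unordered pair of tokens cooccurring in this context.
        for pair in combinations(sorted(tokens), 2):
            edge_freq[pair] = edge_freq.get(pair, 0) + 1
    return node_freq, edge_freq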
-def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
+def process_file(context_list: list, target_string: str,
+                 node_freq_dict: dict, edge_freq_dict: dict) -> (dict, dict):
     """Updates the counts of nodes and edges for a given document and target.
     Amends the input dictionaries with counts from each context within the
@@ -164,8 +196,7 @@ def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
         tokens within every context the target occurs in.
     Returns:
-        node_freq_dict: Updated version of the input node dict.
-        edge_freq_dict: Updated version of the input edge dict.
+        Updated versions of the input node dict and input edge dict.
     """
     spaced_target_string = target_string.replace('_', ' ')
@@ -232,7 +263,7 @@ def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
     return node_freq_dict, edge_freq_dict
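
Since process_file amends existing dictionaries rather than building new ones, the in-place update pattern is the important detail. A sketch using collections.Counter (an illustrative substitution; the committed code appears to use plain dicts):

from collections import Counter
from itertools import combinations

def update_counts_sketch(context_tokens, node_freq, edge_freq):
    # Amend the running totals with one context's tokens and token pairs.
    node_freq.update(context_tokens)
    edge_freq.update(combinations(sorted(context_tokens), 2))
    return node_freq, edge_freq

node_freq, edge_freq = Counter(), Counter()
update_counts_sketch({'grape', 'wine', 'bottle'}, node_freq, edge_freq)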
-def build_graph(node_freq_dict, edge_freq_dict):
+def build_graph(node_freq_dict: dict, edge_freq_dict: dict) -> nx.Graph:
     """Builds undirected weighted graph from dictionaries.
     Creates graph and appends every edge and node in the parameter dictionaries,
@@ -245,8 +276,8 @@ def build_graph(node_freq_dict, edge_freq_dict):
         tokens within every context the target occurs in.
     Returns:
-        cooccurrence_graph: Filtered undirected dice weighted small word
-            cooccurrence graph for a given target entity.
+        Filtered undirected Dice-weighted small world cooccurrence graph for
+        a given target entity.
     """
     min_node_freq = config.min_node_freq
@@ -297,7 +328,7 @@ def build_graph(node_freq_dict, edge_freq_dict):
     return cooccurrence_graph
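
The Dice coefficient for a pair is 2·f(a,b) / (f(a)+f(b)); a common choice is to invert it so that strongly associated pairs get weights near 0 and minimum spanning trees prefer them. A sketch under that assumption, with invented default values for the config thresholds read above:

import networkx as nx

def build_graph_sketch(node_freq, edge_freq,
                       min_node_freq=10, min_edge_freq=5, max_weight=0.9):
    graph = nx.Graph()
    graph.add_nodes_from(n for n, f in node_freq.items() if f >= min_node_freq)
    for (a, b), freq in edge_freq.items():
        if freq < min_edge_freq or a not in graph or b not in graph:
            continue
        dice = 2 * freq / (node_freq[a] + node_freq[b])
        weight = 1 - dice  # low weight = strong association (assumed inversion)
        if weight <= max_weight:  # filter near-random cooccurrences
            graph.add_edge(a, b, weight=weight)
    return graph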
-def root_hubs(graph, edge_freq_dict):
+def root_hubs(graph: nx.Graph, edge_freq_dict: dict) -> list:
     """Identifies senses (root hubs) by choosing nodes with high degrees.
     Selects root hubs according to the algorithm in Véronis (2004). Nodes with
@@ -310,8 +341,8 @@ def root_hubs(graph, edge_freq_dict):
         edge_freq_dict: Dictionary of weights for every tuple in our graph.
     Returns:
-        hub_list: List of root hubs, i.e. strings that are selected using the
-            algorithm explained above.
+        List of root hubs, i.e. strings that are selected using the algorithm
+        explained above.
     """
     min_neighbors = config.min_neighbors
@@ -369,7 +400,7 @@ def root_hubs(graph, edge_freq_dict):
    return hub_list
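
Véronis' hub selection is greedy: repeatedly take the remaining node of highest degree, accept it as a hub if its best neighbors are strongly associated with it, and delete its neighborhood before continuing. A sketch based on the paper; the threshold values and the mean-weight acceptance test are assumptions, not read from this diff:

import networkx as nx

def root_hubs_sketch(graph, min_neighbors=4, theta=0.8):
    work = graph.copy()
    hub_list = []
    while work.number_of_nodes() > 0:
        node = max(work.nodes, key=work.degree)  # highest-degree candidate
        neighbors = list(work[node])
        if len(neighbors) < min_neighbors:
            break  # remaining nodes are too weakly connected to be senses
        best = sorted(neighbors, key=lambda n: work[node][n]['weight'])[:min_neighbors]
        if sum(work[node][n]['weight'] for n in best) / min_neighbors < theta:
            hub_list.append(node)  # tightly knit neighborhood -> new sense
        work.remove_nodes_from([node] + neighbors)  # never reuse this region
    return hub_list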
-def components(graph, root_hub_list, target_string):
+def components(graph: nx.Graph, root_hub_list: list, target_string: str) -> nx.Graph:
     """Builds minimum spanning tree from graph and removes singletons.
     Applies components algorithm from Véronis (2004) and removes singletons.
@@ -380,8 +411,8 @@ def components(graph, root_hub_list, target_string):
         target_string: Root of minimum spanning tree.
     Returns:
-        minimum_spanning_tree: Minimum spanning tree with target as
-            root and root hubs as direct children. Singletons removed.
+        Minimum spanning tree with target as root and root hubs as direct
+        children. Singletons removed.
     """
     graph_copy = deepcopy(graph)
@@ -400,7 +431,7 @@ def components(graph, root_hub_list, target_string):
     return minimum_spanning_tree
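
The usual trick for making the hubs direct children of the target is to join the target to every hub with weight 0 before computing the MST, so those edges are always kept. A sketch of that step (the zero-weight linking is inferred from the docstring, not visible in the hunk):

import networkx as nx

def components_sketch(graph, root_hub_list, target_string):
    work = graph.copy()
    for hub in root_hub_list:
        # Zero weight guarantees these edges survive into the MST.
        work.add_edge(target_string, hub, weight=0)
    mst = nx.minimum_spanning_tree(work)
    # Remove singletons, i.e. nodes that ended up without any edge.
    mst.remove_nodes_from([n for n in list(mst) if mst.degree(n) == 0])
    return mst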
-def score(graph, component, root_hub_list):
+def score(graph: nx.Graph, component: str, root_hub_list: list) -> np.array:
     """Calculates the score for a given component in a minimum spanning tree.
     First the correct root for the component is chosen. If no root hub is
@@ -414,8 +445,7 @@ def score(graph, component, root_hub_list):
         root_hub_list: List of strings of root hubs (senses) of original graph.
     Returns:
-        score_array: Array with one score for the correct root hub and filled
-            with zeroes..
+        Array with one score for the correct root hub and filled with zeroes.
     """
     root_hub_count = len(root_hub_list)
@@ -451,7 +481,7 @@ def score(graph, component, root_hub_list):
     return score_array
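
A common Hyperlex-style score is 1/(1+d), where d is the tree distance from the component to its nearest root hub; only that hub's position in the array receives a non-zero value. A sketch in which the exact distance measure and formula are assumptions taken from Véronis (2004):

import networkx as nx
import numpy as np

def score_sketch(tree, component, root_hub_list):
    score_array = np.zeros(len(root_hub_list))
    distances = [
        nx.shortest_path_length(tree, component, hub, weight='weight')
        if nx.has_path(tree, component, hub) else float('inf')
        for hub in root_hub_list
    ]
    nearest = int(np.argmin(distances))
    if distances[nearest] != float('inf'):
        # Closer hubs score higher; unreachable hubs leave all zeroes.
        score_array[nearest] = 1 / (1 + distances[nearest])
    return score_array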
-def induce(topic_name, result_list):
+def induce(topic_name: str, result_list: list) -> (nx.Graph, list, dict):
     """Induces word senses for a given topic from corpus.
     Counts frequencies from corpus and search result list, builds graph from
@@ -463,8 +493,9 @@ def induce(topic_name, result_list):
         result_list: List of search result (context) strings.
     Returns:
-        root_hub_list: List of root hub strings (senses).
-        stat_dict: Various statistics.
+        A cooccurrence graph,
+        a list of root hub strings (senses),
+        and a dictionary of various statistics.
     """
     stat_dict = dict()
@@ -518,7 +549,8 @@ def induce(topic_name, result_list):
     return graph, root_hub_list, stat_dict
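
Putting the pieces together, induce is essentially a pipeline over the helpers above. A sketch composed from the sketch functions in the previous notes (the statistics keys are invented):

def induce_sketch(topic_name, result_list):
    node_freq, edge_freq = count_frequencies_sketch(topic_name, result_list)
    graph = build_graph_sketch(node_freq, edge_freq)
    root_hub_list = root_hubs_sketch(graph)
    stat_dict = {'nodes': graph.number_of_nodes(),
                 'edges': graph.number_of_edges(),
                 'senses': len(root_hub_list)}  # assumed statistics keys
    return graph, root_hub_list, stat_dict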
-def disambiguate(graph, root_hub_list, context_list, topic_name):
+def disambiguate(graph: nx.Graph, root_hub_list: list,
+                 context_list: list, topic_name: str) -> dict:
     """Matches contexts to senses.
     Builds minimum spanning tree from graph.
@@ -532,8 +564,7 @@ def disambiguate(graph, root_hub_list, context_list, topic_name):
         topic_name: String of target word, also root of MST.
     Returns:
-        mapping_dict: Dictionary of root hubs (senses) as keys and context ids
-            as values.
+        Dictionary of root hubs (senses) as keys and context IDs as values.
     """
     # Performs minimum spanning tree algorithm on graph.
@@ -604,7 +635,7 @@ def disambiguate(graph, root_hub_list, context_list, topic_name):
     return mapping_dict
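
Disambiguation then sums score vectors over the tokens of each context and assigns the context to the hub with the highest total. A sketch reusing the earlier sketch functions; the argmax tie-breaking and the 1-based context IDs are assumptions:

import numpy as np

def disambiguate_sketch(graph, root_hub_list, context_list, topic_name):
    tree = components_sketch(graph, root_hub_list, topic_name)
    mapping_dict = {hub: [] for hub in root_hub_list}
    for context_id, context in enumerate(context_list, start=1):
        total = np.zeros(len(root_hub_list))
        for token in set(context.lower().split()):
            if token in tree and token != topic_name:
                total += score_sketch(tree, token, root_hub_list)
        if total.any():  # contexts with no evidence stay unassigned
            mapping_dict[root_hub_list[int(np.argmax(total))]].append(context_id)
    return mapping_dict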
-def main(topic_id, topic_name, result_dict):
+def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
     """Calls induction and disambiguation functions, performs main task.
     The task is to both induce senses and match search results to them. This
@@ -618,8 +649,6 @@ def main(topic_id, topic_name, result_dict):
         result_dict: Dictionary with topic_id as key and list of search queries
             (from results.txt) as values.
-    Returns:
-        None
     """
     print('[a]', 'Inducing word senses for {}.'.format(topic_name))
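
For completeness, the '-p n' modifier from the module docstring suggests a driver along these lines. This is a sketch: the actual argument parsing and paths live in config.py and are not shown in this diff, and read_dataset_sketch is the hypothetical helper from the first note:

from multiprocessing import Pool

if __name__ == '__main__':
    result_dict, topic_dict = read_dataset_sketch('data/')  # assumed path
    jobs = [(topic_id, topic_name, result_dict)
            for topic_id, topic_name in topic_dict.items()]
    with Pool(processes=2) as pool:  # corresponds to '-p 2'
        pool.starmap(main, jobs)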