#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Word Sense Induction system for SemEval 2013, Task 11

This module performs word sense induction for a given word on a corpus and
matches a list of contexts to each induced sense. The method to achieve this
is a modified reimplementation of Véronis' Hyperlex (2004).

Example:
    The module can be run with the following command:

        $ python3 absinth.py

    The module can be called with a list of modifiers.

    Modifiers:
        '-t': Runs absinth.py on the trial path given in config.py instead
            of the data_path.
        '-p n': Runs absinth.py with n concurrent processes (default: 1).

.. _Association Based Semantic Induction Tools from Heidelberg:
   https://gitlab.cl.uni-heidelberg.de/zimmermann/absinth

"""

import sys

print('[A] Loading ' + sys.argv[0] + '.\n')

import config
import networkx as nx  # for graph handling
import numpy as np
import os  # for reading files
import pprint
import re
import spacy  # for nlp

from multiprocessing import Pool
from copy import deepcopy

nlp = spacy.load('en')  # standard English model


def read_dataset(data_path: str) -> (dict, dict):
    """Collects topics.txt and results.txt.

    Iterates over topics.txt and results.txt in the data path and converts
    them to dictionaries with the ID as key and the target word / title +
    snippet as values.

    Args:
        data_path: File path to the directory containing topics.txt and
            results.txt.

    Returns:
        One dictionary for each file.

    """

    results = dict()

    with open(data_path + 'results.txt', 'r') as results_file:

        for line in results_file.readlines()[1:]:

            l = line.split('\t')
            # The second part of the ID is ignored, as it is identical to the
            # list index.
            id1, _ = l[0].split('.')

            if id1 not in results:
                results[id1] = list()

            # Join title and snippet; the URL is ignored.
            results[id1].append(" ".join(l[2:]).strip())

    # topics.txt is a list of target words.
    topics = dict()

    with open(data_path + 'topics.txt', 'r') as topics_file:

        for line in topics_file.readlines()[1:]:

            l = line.split('\t')
            topics[l[0]] = l[1].strip()

    return results, topics

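# Illustrative sketch of the input layout that read_dataset() assumes, inferred
# from the column indices used above; the exact headers and values of the real
# SemEval data may differ, and the sample rows below are invented:
#
#   topics.txt:   ID<TAB>description
#                 1<TAB>apache
#
#   results.txt:  ID<TAB>url<TAB>title<TAB>snippet
#                 1.1<TAB>http://example.com<TAB>Apache HTTP Server<TAB>Open-source web server ...
#
# read_dataset() drops the header line of each file, keeps only the topic part
# of the result ID, and concatenates title and snippet per result.
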
def frequencies(target_string: str, search_result_list: list) -> (dict, dict):
    """Counts occurrences of nodes and cooccurrences.

    Iterates over the corpus (and the snippets provided with the task) line
    by line and counts every token and every tuple of tokens within a line
    (context). These tokens are filtered by stop words, POS tags and context
    length.

    Args:
        target_string: Contexts are selected if they contain this string. For
            further processing this string is removed from the contexts.
        search_result_list: List of titles and snippets provided with the
            task.

    Returns:
        Dictionary of occurrences of every eligible token within every
        context the target occurs in, dictionary of occurrences of every
        eligible tuple of tokens within every context the target occurs in.

    """

    corpus_path = config.corpus
    max_node_count = config.max_nodes
    max_edge_count = config.max_edges

    bracketed_target_string = '(' + target_string + ')'

    # Remove unnecessary tokens from snippets.
    _search_result_list = list()
    for r in search_result_list:
        r = r.replace('<b>', '')
        r = r.replace('</b>', '')
        r = r.replace(r'\\', '')
        r = r.strip()
        _search_result_list.append(r)

    # Initialise frequencies with counts from the results.
    node_freq_dict, edge_freq_dict = process_file(_search_result_list,
                                                  target_string,
                                                  dict(),
                                                  dict())

    corpus_file_path_list = [corpus_path + f for f in os.listdir(corpus_path)]
    corpus_size = len(corpus_file_path_list)

    processed_file_count = 0
    for corpus_file_path in corpus_file_path_list:

        node_count = len(node_freq_dict)
        edge_count = len(edge_freq_dict)

        # Print an update after roughly every 11th of the corpus is parsed.
        if processed_file_count % max(1, int(corpus_size / 11)) == 0:

            file_ratio = processed_file_count / corpus_size
            max_node_ratio = node_count / max_node_count
            max_edge_ratio = edge_count / max_edge_count

            ratios = [file_ratio, max_node_ratio, max_edge_ratio]

            # Use the ratio closest to 100%.
            highest_ratio = int(max(ratios) * 100)

            print('[a] ~{:02d}%\tNodes: {}\tEdges: {}\t{}.'.format(
                highest_ratio, node_count, edge_count,
                bracketed_target_string))

        # Stop parsing as soon as the node or edge limit is reached.
        if node_count > max_node_count or edge_count > max_edge_count:

            print('[a] 100%\tNodes: {}\tEdges: {}\t{}.'.format(
                node_count, edge_count, bracketed_target_string))

            return node_freq_dict, edge_freq_dict

        with open(corpus_file_path, 'r') as corpus_file:
            node_freq_dict, edge_freq_dict = process_file(corpus_file,
                                                          target_string,
                                                          node_freq_dict,
                                                          edge_freq_dict)

        processed_file_count += 1

    node_count = len(node_freq_dict)
    edge_count = len(edge_freq_dict)
    print('[a] 100%\tNodes: {}\tEdges: {}\t{}.'.format(
        node_count, edge_count, bracketed_target_string))

    return node_freq_dict, edge_freq_dict


def process_file(context_list: list,
                 target_string: str,
                 node_freq_dict: dict,
                 edge_freq_dict: dict) -> (dict, dict):
    """Updates the counts of nodes and edges for a given document and target.

    Amends the input dictionaries with counts from each context within the
    list of contexts. Furthermore filters out small contexts and tokens from
    the stop word list or with wrong POS tags.

    Args:
        context_list: List of contexts (lines, paragraphs) that are to be
            considered for updating the counting dictionaries.
        target_string: Target string for filtering out every context that
            does not contain it.
        node_freq_dict: Dictionary of occurrences of every eligible token
            within every context the target occurs in.
        edge_freq_dict: Dictionary of occurrences of every eligible tuple of
            tokens within every context the target occurs in.

    Returns:
        Updated versions of the input node dict and input edge dict.

    """

    spaced_target_string = target_string.replace('_', ' ')

    stopword_list = config.stop_words
    allowed_tag_list = config.allowed_tags
    min_context_size = config.min_context_size

    try:

        for context in context_list:

            context = context.lower()

            # Greedy pre-selection of lines.
            if spaced_target_string in context:

                token_set = set()

                # Allow the target to be treated as a single entity.
                context = context.replace(spaced_target_string, target_string)
                processed_context = nlp(context)

                if target_string in [token.text for token in processed_context]:

                    for token in processed_context:

                        # Do not add the target word to the nodes.
                        if token.text == target_string:
                            pass

                        # Do not add stop words to the nodes.
                        elif token.is_stop or token.text in stopword_list:
                            pass

                        # Add only tokens with allowed tags to the nodes.
                        elif token.tag_ in allowed_tag_list:
                            if config.lemma:
                                token_set.add(token.lemma_)
                            else:
                                token_set.add(token.text)

                    context_size = len(token_set)

                    if context_size >= min_context_size:

                        for token in token_set:

                            if token in node_freq_dict:
                                node_freq_dict[token] += 1
                            else:
                                node_freq_dict[token] = 1

                        # Set of possible edges (unordered token pairs).
                        for edge in {(x, y) for x in token_set
                                     for y in token_set if x < y}:

                            if edge in edge_freq_dict:
                                edge_freq_dict[edge] += 1
                            else:
                                edge_freq_dict[edge] = 1

    # If a file is corrupted (which cannot always be caught with if-else),
    # ignore it.
    except UnicodeDecodeError:
        pass

    return node_freq_dict, edge_freq_dict

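# A minimal, self-contained illustration (not called anywhere) of the counting
# scheme in process_file(): every eligible token in a context becomes a node
# and every unordered token pair becomes an edge. The token set is invented.
def _example_pair_counts():
    token_set = {'camera', 'film', 'instant'}
    edge_set = {(x, y) for x in token_set for y in token_set if x < y}
    # -> {('camera', 'film'), ('camera', 'instant'), ('film', 'instant')}
    return edge_set
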
def build_graph(node_freq_dict: dict, edge_freq_dict: dict) -> nx.Graph:
    """Builds an undirected weighted graph from dictionaries.

    Creates a graph and appends every edge and node in the parameter
    dictionaries, given they occur frequently enough. For every edge a weight
    is calculated.

    Args:
        node_freq_dict: Dictionary of occurrences of every eligible token
            within every context the target occurs in.
        edge_freq_dict: Dictionary of occurrences of every eligible tuple of
            tokens within every context the target occurs in.

    Returns:
        Filtered undirected weighted small-world cooccurrence graph for a
        given target entity.

    """

    min_node_freq = config.min_node_freq
    min_edge_freq = config.min_edge_freq
    max_weight = config.max_weight

    cooccurrence_graph = nx.Graph()

    for node, frequency in node_freq_dict.items():
        if frequency >= min_node_freq:
            cooccurrence_graph.add_node(node)

    for node_tuple, frequency in edge_freq_dict.items():

        if frequency < min_edge_freq:
            continue
        elif node_tuple[0] not in cooccurrence_graph.nodes:
            continue
        elif node_tuple[1] not in cooccurrence_graph.nodes:
            continue
        else:

            cooccurrence_frequency = edge_freq_dict[node_tuple]
            node0_frequency = node_freq_dict[node_tuple[0]]
            node1_frequency = node_freq_dict[node_tuple[1]]

            prob_0 = cooccurrence_frequency / node0_frequency
            prob_1 = cooccurrence_frequency / node1_frequency

            best_weight = 1 - max(prob_0, prob_1)
            #dice_weight = 1 - ((prob_0 + prob_1) / 2)

            if best_weight <= max_weight:
                cooccurrence_graph.add_edge(*node_tuple, weight=best_weight)
            else:
                pass

    return cooccurrence_graph

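# Toy illustration (unused) of the edge weighting in build_graph(): with
# invented counts, an edge seen 4 times between nodes seen 10 and 20 times
# gets weight 1 - max(4/10, 4/20) = 0.6; the commented-out dice variant would
# give 1 - ((4/10 + 4/20) / 2) = 0.7. Smaller weights mean stronger
# association.
def _example_edge_weight(cooccurrence=4, freq_a=10, freq_b=20):
    return 1 - max(cooccurrence / freq_a, cooccurrence / freq_b)
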
def root_hubs(graph: nx.Graph, edge_freq_dict: dict) -> list:
    """Identifies senses (root hubs) by choosing nodes with high degrees.

    Selects root hubs according to the algorithm in Véronis (2004). Nodes
    with a high degree and neighbors with low weights (high cooccurrence) are
    chosen until there are no more viable candidates. A root hub candidate is
    every node that is not already a hub and is not a neighbor of one.

    Args:
        graph: Weighted undirected graph.
        edge_freq_dict: Dictionary of weights for every tuple in our graph.

    Returns:
        List of root hubs, i.e. strings that are selected using the algorithm
        explained above.

    """

    min_neighbors = config.min_neighbors
    threshold = config.threshold

    # Allow operations on the graph without altering the original one.
    graph_copy = deepcopy(graph)

    # Sort according to degree (number of neighbors).
    candidate_list = sorted(graph_copy.nodes,
                            key=lambda node: graph_copy.degree[node],
                            reverse=True)

    hub_list = list()

    # While there are still candidates, search for root hubs.
    while candidate_list:

        candidate = candidate_list[0]  # Best hub candidate.

        if graph_copy.degree[candidate] >= min_neighbors:

            by_frequency = lambda node: edge_freq_dict[candidate, node] \
                                        if candidate < node \
                                        else edge_freq_dict[node, candidate]

            most_frequent_neighbor_list = sorted(graph_copy.adj[candidate],
                                                 key=by_frequency,
                                                 reverse=True)[:min_neighbors]

            # If the most frequent neighbors cooccur frequently enough with
            # the candidate (low mean weight), the candidate is approved.
            if np.mean([graph_copy.edges[candidate, node]['weight']
                        for node in most_frequent_neighbor_list]) < threshold:

                # Add the candidate as a root hub.
                hub_list.append(candidate)

                # Remove the neighbors of the new hub as hub candidates.
                for neighbor in deepcopy(graph_copy).adj[candidate]:
                    graph_copy.remove_node(neighbor)

            # Remove the hub candidate.
            graph_copy.remove_node(candidate)

            # Reorder the potential hubs after the deletions.
            candidate_list = sorted(graph_copy.nodes,
                                    key=lambda node: graph_copy.degree[node],
                                    reverse=True)

        else:

            return hub_list

    return hub_list


def components(graph: nx.Graph, root_hub_list: list,
               target_string: str) -> nx.Graph:
    """Builds a minimum spanning tree from the graph and removes singletons.

    Applies the components algorithm from Véronis (2004) and removes
    singletons.

    Args:
        graph: Undirected weighted graph.
        root_hub_list: List of strings of root hubs of the graph.
        target_string: Root of the minimum spanning tree.

    Returns:
        Minimum spanning tree with the target as root and the root hubs as
        direct children. Singletons are removed.

    """

    graph_copy = deepcopy(graph)

    graph_copy.add_node(target_string)
    for root_hub in root_hub_list:
        graph_copy.add_edge(target_string, root_hub, weight=0)

    minimum_spanning_tree = nx.minimum_spanning_tree(graph_copy)

    # Remove singletons; deepcopy allows iteration while the tree is altered.
    for node in deepcopy(minimum_spanning_tree).nodes:
        if len(minimum_spanning_tree.adj[node]) == 0:
            minimum_spanning_tree.remove_node(node)

    return minimum_spanning_tree


def score(graph: nx.Graph, component: str, root_hub_list: list) -> np.array:
    """Calculates the score for a given component in a minimum spanning tree.

    First the correct root for the component is chosen. If no root hub is
    reachable, an all-zero array is returned. A score is calculated from the
    distance between the component and its root and returned as part of an
    array that is otherwise filled with zeroes.

    Args:
        graph: Minimum spanning tree.
        component: Node (string) from which the distances are to be
            calculated.
        root_hub_list: List of strings of root hubs (senses) of the original
            graph.

    Returns:
        Array with one score for the chosen root hub, otherwise filled with
        zeroes.

    """

    root_hub_count = len(root_hub_list)

    # Initialise the score array.
    score_array = np.zeros(root_hub_count)

    # Find the root of the component.
    distance_list = list()
    for root_hub in root_hub_list:
        if nx.has_path(graph, component, root_hub):
            distance_list.append(
                1 / (1 + len(nx.shortest_path(graph, component, root_hub))))
        else:
            distance_list.append(0)

    if sum(distance_list) == 0:
        return score_array

    root_idx = np.argmax(distance_list)
    root = root_hub_list[root_idx]

    shortest_path = nx.shortest_path(graph, component, root, 'weight')

    total_weight = 0
    # Add up the weights of every sub-path.
    for i in range(1, len(shortest_path)):
        sub_from, sub_to = shortest_path[i - 1], shortest_path[i]
        total_weight += graph[sub_from][sub_to]['weight']

    score_array[root_idx] = 1 / (1 + total_weight)

    return score_array

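# Small usage sketch (not called) for score(): build a toy tree with two root
# hubs hanging off the target and one leaf, then score the leaf. All node
# names and weights are invented for illustration.
def _example_score():
    toy_tree = nx.Graph()
    toy_tree.add_edge('target', 'hub_a', weight=0)
    toy_tree.add_edge('target', 'hub_b', weight=0)
    toy_tree.add_edge('hub_a', 'leaf', weight=0.4)
    # 'leaf' is closest to hub_a, so only index 0 receives a non-zero score:
    # 1 / (1 + 0.4) = 0.714...
    return score(toy_tree, 'leaf', ['hub_a', 'hub_b'])
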
def induce(topic_name: str, result_list: list) -> (nx.Graph, list, dict):
    """Induces word senses for a given topic from the corpus.

    Counts frequencies from the corpus and the search result list, builds a
    graph from these counts (with some filters). Root hubs (senses) are
    collected from this graph.

    Args:
        topic_name: Target string.
        result_list: List of search result (context) strings.

    Returns:
        A cooccurrence graph, a list of root hub strings (senses) and a
        dictionary of various statistics.

    """

    stat_dict = dict()

    # Skip topics for which an output file already exists.
    if topic_name in [output_file_name.replace('.absinth', '')
                      for output_file_name in os.listdir(config.output)]:
        return None

    stat_dict['target'] = topic_name

    # In topics longer than two words, the leading 'the' can generally be
    # removed without changing the sense.
    if topic_name[:4] == 'the_' and topic_name.count('_') > 1:
        target_string = topic_name[4:]
    else:
        target_string = topic_name

    print('[a]', 'Counting nodes and edges.\t(' + topic_name + ')')
    node_freq_dict, edge_freq_dict = frequencies(target_string, result_list)

    # Build the graph from these dictionaries; this also applies multiple
    # filters.
    print('[a]', 'Building graph.\t(' + topic_name + ')')
    graph = build_graph(node_freq_dict, edge_freq_dict)

    stat_dict['node count'] = len(graph.nodes)
    stat_dict['edge count'] = len(graph.edges)

    # Find root hubs (senses) within the graph (with more filters for these).
    print('[a]', 'Collecting root hubs.\t(' + topic_name + ')')
    root_hub_list = root_hubs(graph, edge_freq_dict)

    # Add the sense inventory to the statistics with some common neighbors
    # for context.
    stat_dict['hubs'] = dict()
    for root_hub in root_hub_list:

        by_frequency = lambda node: edge_freq_dict[root_hub, node] \
                                    if root_hub < node \
                                    else edge_freq_dict[node, root_hub]

        most_frequent_neighbor_list = sorted(graph.adj[root_hub],
                                             key=by_frequency,
                                             reverse=True)

        stat_dict['hubs'][root_hub] = most_frequent_neighbor_list[:6]

    return graph, root_hub_list, stat_dict


def colour_graph(graph: nx.Graph, root_hub_list: list) -> nx.Graph:
    """Colours the graph according to the root hubs.

    Evolving network that colours neighboring nodes iteratively. See
    sentiment propagation.

    Args:
        graph: Weighted undirected graph.
        root_hub_list: List of senses.

    Returns:
        Coloured graph.

    """

    for node in graph.nodes:

        graph.node[node]['dist'] = [0] * len(root_hub_list)

        if node in root_hub_list:
            root_idx = root_hub_list.index(node)
            graph.node[node]['sense'] = root_idx
            graph.node[node]['dist'][root_idx] = 1
        else:
            graph.node[node]['sense'] = None

    max_iteration_count = config.max_colour_iteration_count

    iteration_count = 0
    stable = False
    while not stable and iteration_count <= max_iteration_count:

        graph_copy = deepcopy(graph)
        iteration_count += 1

        stable = True

        for node in graph.nodes:

            neighbor_weight_list = [0] * len(root_hub_list)

            for neighbor in graph_copy[node]:

                if graph_copy.node[neighbor]['sense'] is None:
                    pass
                else:
                    neighbor_weight_list[graph_copy.node[neighbor]['sense']] \
                        += 1 - graph_copy[node][neighbor]['weight']

            if any(neighbor_weight_list):

                graph.node[node]['dist'] = np.mean(
                    [graph.node[node]['dist'], neighbor_weight_list], axis=0)

                old_colour = graph_copy.node[node]['sense']
                new_colour = np.argmax(graph.node[node]['dist'])

                if old_colour != new_colour:
                    stable = False
                    graph.node[node]['sense'] = new_colour
                else:
                    pass

            else:
                pass

    return graph

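# Toy illustration (unused) of one colouring update in colour_graph(): a
# node's sense distribution is averaged with the support it receives from
# already coloured neighbours, where support is 1 - edge weight. The numbers
# are invented.
def _example_colour_update():
    old_dist = [0.2, 0.0]              # current support for two senses
    neighbor_weight_list = [0.0, 0.8]  # one neighbour of sense 1, edge weight 0.2
    new_dist = np.mean([old_dist, neighbor_weight_list], axis=0)
    return new_dist                    # array([0.1, 0.4]) -> node flips to sense 1
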
def disambiguate_colour(graph: nx.Graph, root_hub_list: list,
                        context_list: list) -> dict:
    """Clusters senses to root hubs using a coloured graph.

    This algorithm colours the graph using evolutionary graph theory and
    calculates scores for each root hub given a context based on this graph.

    Args:
        graph: Undirected weighted graph.
        root_hub_list: List of root hubs (senses).
        context_list: List of search result strings to be clustered.

    Returns:
        A dictionary with root hub IDs as keys and context indices as values.

    """

    coloured_graph = colour_graph(graph, root_hub_list)

    mapping_dict = {i: list() for i in range(1, len(root_hub_list) + 1)}

    # If no sense was induced, cluster every context into one sense.
    if len(root_hub_list) == 0:
        mapping_dict = {0: [i for i in range(1, len(context_list) + 1)]}
        return mapping_dict

    context_id = 0
    for context in context_list:

        context_id += 1
        score = [0] * len(root_hub_list)

        parsed_context = nlp(context)
        for token in parsed_context:

            if config.lemma:
                text = token.lemma_
            else:
                text = token.text

            if text in coloured_graph.nodes:

                text_colour_dist = coloured_graph.node[text]['dist']

                if not any(text_colour_dist):
                    pass

                else:

                    for root_hub in root_hub_list:

                        root_hub_idx = root_hub_list.index(root_hub)

                        if nx.has_path(coloured_graph, text, root_hub):

                            shortest_path = nx.shortest_path(coloured_graph,
                                                             text,
                                                             root_hub,
                                                             'weight')
                            total_weight = 0

                            # Add up the weights of every sub-path.
                            for i in range(1, len(shortest_path)):
                                sub_from = shortest_path[i - 1]
                                sub_to = shortest_path[i]
                                total_weight += \
                                    coloured_graph[sub_from][sub_to]['weight']

                            score[root_hub_idx] += \
                                (1 / (1 + total_weight)) \
                                * coloured_graph.node[text]['dist'][root_hub_idx]

                        else:
                            pass

            else:
                pass

        if any(score):
            mapping_dict[np.argmax(score) + 1].append(context_id)
        else:
            pass

    return mapping_dict


def disambiguate_mst(graph: nx.Graph, root_hub_list: list,
                     context_list: list, topic_name: str) -> dict:
    """Matches contexts to senses.

    Builds a minimum spanning tree from the graph. Adds up scores based on
    tree node distance for each token in a context string and matches the
    context to the root hub with the highest score.

    Args:
        graph: Weighted undirected graph.
        root_hub_list: List of strings of root hubs (senses).
        context_list: List of sentence strings that are to be clustered.
        topic_name: String of the target word, also the root of the MST.

    Returns:
        Dictionary of root hubs (senses) as keys and context IDs as values.

    """

    # Perform the minimum spanning tree algorithm on the graph.
    minimum_spanning_tree = components(graph, root_hub_list, topic_name)

    spaced_topic_name = topic_name.replace('_', ' ')
    context_list = [context.lower().strip().replace(spaced_topic_name, '')
                    for context in context_list]

    score_dict = dict()  # Memoisation for scores.
    mapping_dict = {topic: [] for topic in range(1, len(root_hub_list) + 1)}

    # If no sense is found for a target word, we should assume that there
    # only is one sense.
    if len(root_hub_list) == 0:
        mapping_dict = {0: [i for i in range(1, len(context_list) + 1)]}
        return mapping_dict

    idx = 0
    for context in context_list:

        idx += 1  # Index based on the position in the list.

        processed_context = nlp(context)
        if config.lemma:
            token_list = [token.lemma_ for token in processed_context]
        else:
            token_list = [token.text for token in processed_context]

        # Initialise with zeros for every sense.
        score_array = np.zeros(len(root_hub_list))

        for token in token_list:

            # If the word was not filtered out of the graph.
            if token in minimum_spanning_tree.nodes:

                if token in score_dict:  # Memoisation.
                    new_score = score_dict[token]
                else:
                    new_score = score(minimum_spanning_tree, token,
                                      root_hub_list)
                    score_dict[token] = new_score  # Memoisation.

                score_array += new_score

            else:
                pass

        # If the disambiguator does not detect a sense, leave the context
        # unassigned.
        if not any(score_array):
            pass
        else:
            # Apply the sense with the highest score to the context.
            argmax_score = np.argmax(score_array)
            # Clusters begin at 1.
            mapping_dict[argmax_score + 1].append(idx)

    return mapping_dict

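# Both disambiguation functions return a mapping of the form
# {sense_id: [result_id, ...]}, with sense IDs starting at 1 (0 is only used
# as a fallback when no sense was induced), e.g. {1: [2, 5, 9], 2: [1, 3]}.
# main() below merges several such mappings, giving precedence to the lower
# rank configured in config.py. The example values are invented.
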
def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
    """Calls induction and disambiguation functions, performs the main task.

    The task is to both induce senses and match search results to them. To
    perform these sub-tasks, this function calls induce() and the
    disambiguation functions selected in config.py. The result is then
    written to the output directory specified in config.py.

    Args:
        topic_id: Index of the topic in topics.txt.
        topic_name: Target string.
        result_dict: Dictionary with topic_id as key and a list of search
            results (from results.txt) as values.

    """

    print('[a]', 'Inducing word senses for {}.'.format(topic_name))
    graph, root_hub_list, stat_dict = induce(topic_name,
                                             result_dict[topic_id])

    colour_rank = config.colour_rank
    mst_rank = config.mst_rank

    # Merge the mappings according to the pipeline.
    mapping_dict = dict()

    # Match senses to clusters.
    print('[a]', 'Disambiguating results.\t(' + topic_name + ')')

    if colour_rank != 0:
        print('[a]', 'Colouring graph.\t(' + topic_name + ')')
        mapping_dict[colour_rank] = disambiguate_colour(graph, root_hub_list,
                                                        result_dict[topic_id])

    if mst_rank != 0:
        print('[a]', 'Building minimum spanning tree.\t(' + topic_name + ')')
        mapping_dict[mst_rank] = disambiguate_mst(graph, root_hub_list,
                                                  result_dict[topic_id],
                                                  topic_name)

    mapping_list = [item[1] for item in sorted(mapping_dict.items())]
    mapping_count = len(mapping_list)

    merged_mapping_dict = mapping_list[0]
    merged_entry_count = 0

    for i in range(1, mapping_count):

        # Results that are already covered by the merged mapping.
        result_list = [result for result_list in merged_mapping_dict.values()
                       for result in result_list]

        # Individual (sense, result) pairs of the next mapping.
        relation_list = [(topic, result)
                         for topic in mapping_list[i].keys()
                         for result in mapping_list[i][topic]]

        for topic, result in relation_list:

            if result not in result_list:

                merged_entry_count += 1

                if topic in merged_mapping_dict:
                    merged_mapping_dict[topic].append(result)
                else:
                    merged_mapping_dict[topic] = [result]

    stat_dict['merge_gain'] = merged_entry_count

    # Collect statistics from the result.
    cluster_count = 0
    cluster_length_list = list()

    for cluster, result_list in merged_mapping_dict.items():

        cluster_length = len(result_list)

        if cluster_length != 0:
            cluster_count += 1
            cluster_length_list.append(cluster_length)

    stat_dict['mean_cluster_length'] = np.mean(cluster_length_list)
    stat_dict['cluster_count'] = cluster_count

    print('[a]', 'Writing to file.\t(' + topic_name + ')')

    output_path = config.output
    output_file_name = output_path + topic_name + '.absinth'

    with open(output_file_name, 'w') as output_file:

        output_file.write('subTopicID\tresultID\n')

        for cluster_id, result_list in merged_mapping_dict.items():
            for result_id in result_list:
                output_line = '{}.{}\t{}.{}\n'.format(topic_id, cluster_id,
                                                      topic_id, result_id)
                output_file.write(output_line)

    pprint.pprint(stat_dict)

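# main() writes one tab-separated .absinth file per topic. Based on the
# format string above, its layout looks like this (IDs invented):
#
#   subTopicID<TAB>resultID
#   5.1<TAB>5.3
#   5.1<TAB>5.17
#   5.2<TAB>5.40
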
if __name__ == '__main__':
    """Checks for modifiers and calls main().

    Only called when absinth.py is started manually. Checks for various
    modifiers, i.e. the test environment and the number of processes to run
    simultaneously.

    """

    # If absinth.py is run in the test environment.
    if '-t' in sys.argv:
        data_path = config.test
    else:
        data_path = config.dataset

    result_dict, topic_dict = read_dataset(data_path)

    # Enable manual setting of the process count.
    if '-p' in sys.argv:
        process_count = int(sys.argv[sys.argv.index('-p') + 1])
    else:
        process_count = 1

    with Pool(process_count) as pool:
        parameter_list = [(topic_id, topic_name, result_dict)
                          for topic_id, topic_name in topic_dict.items()]
        pool.starmap(main, sorted(parameter_list))
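
# The module relies on a local config.py. A minimal sketch of the attributes
# referenced above (attribute names are taken from the code; all values are
# illustrative guesses, not the project's actual settings):
#
#   corpus = '/path/to/corpus/'          # directory of plain-text corpus files
#   dataset = '/path/to/semeval/data/'   # contains topics.txt and results.txt
#   test = '/path/to/trial/data/'
#   output = '/path/to/output/'
#   stop_words = []                      # extra stop words besides spaCy's
#   allowed_tags = ['NN', 'NNS', 'JJ']   # POS tags kept as nodes
#   min_context_size = 4
#   min_node_freq = 10
#   min_edge_freq = 5
#   max_weight = 0.9
#   max_nodes = 10000
#   max_edges = 10000000
#   min_neighbors = 4
#   threshold = 0.8
#   max_colour_iteration_count = 50
#   lemma = True
#   colour_rank = 2                      # 0 disables the colour disambiguator
#   mst_rank = 1                         # 0 disables the MST disambiguator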