diff --git a/src/abstinent.py b/src/abstinent.py new file mode 100644 index 0000000000000000000000000000000000000000..724eeec19e08e377650c61307f46a7ebb8f417ff --- /dev/null +++ b/src/abstinent.py @@ -0,0 +1,679 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +"""Artificially Basic System Trying to Induce Numerous ENTities + +This module performs word sense induction for a given word on a corpus and +matches a list of contexts to each. The method to achieve this is just a +baseline, working with a random selection of senses and a simple Lesk +implementation. + +Example: + The function can be called with the following command: + + $ python3 abstinent.py + + The function can be called with a list of modifiers. + +Modifiers: + '-t': Runs abstinent.py on the trial path given in the config.py instead of + the data_path. + '-p n': Runs abstinent.py with n concurrent processes (standard: 1). + +.. _Association Based Semantic Induction Tools from Heidelberg + https://gitlab.cl.uni-heidelberg.de/zimmermann/absinth + +""" + +import sys +print('[a] Loading ' + sys.argv[0] + '.\n') +import config +import networkx as nx # for visualisation +import numpy as np +import os # for reading files +import pprint +import random +import re +import scipy.special +import spacy # for nlp +import time + +from copy import deepcopy +from multiprocessing import Pool +from scipy import stats + +random.seed(325) +nlp = spacy.load('en') # standard english nlp + + +def read_dataset(data_path: str) -> (dict, dict): + """Collects topics.txt and results.txt. + + Iterates over topics.txt and results.txt in the data path and converts them + to dictionaries with the ID as key and the target word / title + snippet as + values. + + Args: + data_path: File path to directory containing topics.txt and results.txt. + + Returns: + One dictionary for each file. + """ + + results = dict() + + with open(data_path+'results.txt', 'r') as results_file: + + for line in results_file.readlines()[1:]: + + l = line.split('\t') + id1, _ = l[0].split('.') #the second part of the id is ignored, as it is identical to the list index + + if id1 not in results: + results[id1]=list() + + results[id1].append(" ".join(l[2:]).strip()) # here I join title and snippet, the URL is ignored + + + # topics.txt is a list of target words + topics = dict() + + with open(data_path+'topics.txt', 'r') as topics_file: + + for line in topics_file.readlines()[1:]: + + l = line.split('\t') + topics[l[0]] = l[1].strip() + + return results, topics + + +def frequencies(target_string: str, search_result_list: list) -> (dict, dict): + """Counts occurrences of nodes and cooccurrences. + + Iterates over the corpus (and snippets provided with the task) line by line + and counts every token and tuple of tokens within a line (context). These + tokens is filtered by stop words, pos tags and context length. + + Args: + target_string: contexts are selected if they contain this string. For + further processing this string is removed from the contexts. + search_result_list: List of titles and snippets provided with the task. + + Returns: + Dictionary of occurrences of every eligible token within every context + the target occurs in, dictionary of occurrences of every eligible + tuple of tokens within every context the target occurs in. + + """ + + corpus_path = config.corpus + max_node_count = config.max_nodes + max_edge_count = config.max_edges + + bracketed_target_string = '('+target_string+')' + + # Remove unnecessary tokens from snippets. 
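+    # The titles and snippets from results.txt still carry HTML bold tags
+    # (<b>, </b>) and escaped backslashes; they are stripped from each
+    # snippet below before the cleaned list is passed to process_file().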
+ _search_result_list = list() + for r in search_result_list: + r = r.replace('<b>', '') + r = r.replace('</b>', '') + r = r.replace(r'\\', '') + r = r.strip() + _search_result_list.append(r) + + # Initialise frequencies with counts from results. + node_freq_dict, edge_freq_dict = process_file(_search_result_list, + target_string, + dict(), + dict()) + + corpus_file_path_list = [corpus_path + f for f in os.listdir(corpus_path)] + corpus_size = len(corpus_file_path_list) + + processed_file_count = 0 + for corpus_file_path in corpus_file_path_list: + + node_count = len(node_freq_dict) + edge_count = len(edge_freq_dict) + + # Print update after every 11th of the corpus is parsed. + if processed_file_count % int(corpus_size/11) == 0: + + file_ratio = processed_file_count / corpus_size + max_node_ratio = node_count / max_node_count + max_edge_ratio = edge_count / max_edge_count + + ratios = [file_ratio, max_node_ratio, max_edge_ratio] + + # Use ratio closest to 100%. + highest_ratio = int((max(ratios))*100) + + print('[a] ~{:02d}%\tNodes: {}\tEdges: {}\t{}.'.format(highest_ratio, + node_count, + edge_count, + bracketed_target_string)) + + if node_count > max_node_count: + print('[a] 100%\tNodes: {}\tEdges: {}\t{}.'.format(node_count, + edge_count, + bracketed_target_string)) + return node_freq_dict, edge_freq_dict + + if edge_count > max_edge_count: + print('[a] 100%\tNodes: {}\tEdges: {}\t{}.'.format(node_count, + edge_count, + bracketed_target_string)) + return node_freq_dict, edge_freq_dict + + with open(corpus_file_path, 'r') as corpus_file: + + node_freq_dict, edge_freq_dict = process_file(corpus_file, + target_string, + node_freq_dict, + edge_freq_dict) + + processed_file_count += 1 + + print('[a] 100%\tNodes: {}\tEdges: {}\t{}.'.format(node_count, + edge_count, + bracketed_target_string)) + + return node_freq_dict, edge_freq_dict + + +def process_file(context_list: list, target_string: str, + node_freq_dict: dict, edge_freq_dict: dict) -> (dict, dict): + """Updates the counts of nodes and edges for a given document and target. + + Ammends the input dictionaries with counts from each context withing the + list of contexts. Furthermore filters out small contexts and tokens from + the stopword list or with wrong pos tags. + + Args: + context_list: List of contexts (lines, paragraphs) that are to be + considered for updating the counting dictionaries. + target_string: Target string for filtering out every context that does + not contain it. + node_freq_dict: Dictionary of occurrences of every eligible token + within every context the target occurs in. + edge_freq_dict: Dictionary of occurrences of every eligible tuple of + tokens within every context the target occurs in. + + Returns: + Updated versions of the input node dict and input edge dict. + """ + + spaced_target_string = target_string.replace('_', ' ') + + stopword_list = config.stop_words + allowed_tag_list = config.allowed_tags + min_context_size = config.min_context_size + max_context_size = config.max_context_size + + try: + + for context in context_list: + + context = context.lower() + if spaced_target_string in context: # Pre-select lines greedy. + + token_set = set() + + # Allow target to be treated as single entity. + context = context.replace(spaced_target_string, target_string) + processed_context = nlp(context) + + if target_string in [token.text for token in processed_context]: + + for token in processed_context: + + # Do not add target word to nodes. 
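+                        # (Every selected context contains the target by
+                        # construction, so it would carry no information;
+                        # induce() also removes it from the finished graph.)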
+ if token.text == target_string: + pass + + # Do not add stop words to nodes. + elif token.is_stop or token.text in stopword_list: + pass + + # Add only tokens with allowed tags to nodes. + elif token.tag_ in allowed_tag_list: + if config.lemma == True: + token_set.add(token.lemma_) + else: + token_set.add(token.text) + + context_size = len(token_set) + + if context_size >= min_context_size and context_size <= max_context_size: + for token in token_set: + + if token in node_freq_dict: + node_freq_dict[token] += 1 + else: + node_freq_dict[token] = 1 + + #set of possible edges + for edge in {(x,y) for x in token_set for y in token_set if x < y}: + + if edge in edge_freq_dict: + edge_freq_dict[edge] += 1 + else: + edge_freq_dict[edge] = 1 + + # If file is corrupted (can't always be catched with if-else), ignore file. + except UnicodeDecodeError: + + pass + + return node_freq_dict, edge_freq_dict + + +def build_graph(node_freq_dict: dict, edge_freq_dict: dict) -> nx.Graph: + """Builds undirected weighted graph from dictionaries. + + Creates graph and appends every edge and node in the parameter dictionaries, + given they occur frequently enough. For every edge a weight is calculated. + + Args: + node_freq_dict: Dictionary of occurrences of every eligible token + within every context the target occurs in. + edge_freq_dict: Dictionary of occurrences of every eligible tuple of + tokens within every context the target occurs in. + + Returns: + Filtered undirected dice weighted small word cooccurrence graph for a + given target entity. + """ + + min_node_freq = config.min_node_freq + min_edge_freq = config.min_edge_freq + max_weight = config.max_weight + + cooccurrence_graph = nx.Graph() + + for node, frequency in node_freq_dict.items(): + + if frequency >= min_node_freq: + cooccurrence_graph.add_node(node) + + for node_tuple, frequency in edge_freq_dict.items(): + + if frequency < min_edge_freq: + + continue + + elif node_tuple[0] not in cooccurrence_graph.nodes: + + continue + + elif node_tuple[1] not in cooccurrence_graph.nodes: + + continue + + else: + + cooccurrence_frequency = edge_freq_dict[node_tuple] + node0_frequency = node_freq_dict[node_tuple[0]] + node1_frequency = node_freq_dict[node_tuple[1]] + + prob_0 = cooccurrence_frequency / node0_frequency + prob_1 = cooccurrence_frequency / node1_frequency + + best_weight = 1 - max(prob_0, prob_1) + #dice_weight = 1 - ((prob_0 + prob_1) / 2) + + if best_weight <= max_weight: + + cooccurrence_graph.add_edge(*node_tuple, weight=best_weight) + + else: + + pass + + # Remove singletons, deepcopy for iteration while being altered. + for node in deepcopy(cooccurrence_graph).nodes: + if len(cooccurrence_graph.adj[node]) == 0: + cooccurrence_graph.remove_node(node) + + return cooccurrence_graph + + +def induce(topic_name: str, result_list: list) -> (nx.Graph, list, dict): + """ + Use n random nodes as root hubs. 
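+    (In the current implementation the hubs are in fact the sense_count
+    most frequent nodes, see config.py, that survive the graph filtering,
+    not a random sample; the target word itself is removed from the graph
+    before the hubs are chosen.)
+
+    Args:
+        topic_name: Target string, with multi-word targets joined by '_'.
+        result_list: List of titles and snippets for this target, used to
+            seed the frequency counts.
+
+    Returns:
+        The cooccurrence graph, the list of root hubs and a dictionary of
+        statistics collected along the way.
+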
+ """ + + stat_dict = dict() + + stat_dict['target'] = topic_name + + print('[a]', 'Counting nodes and edges.\t('+topic_name+')') + node_freq_dict, edge_freq_dict = frequencies(topic_name, result_list) + + #builds graph from these dictionaries, also applies multiple filters + print('[a]', 'Building graph.\t('+topic_name+')') + graph = build_graph(node_freq_dict, edge_freq_dict) + + for string in topic_name.split('_'): + if string in graph.nodes: + graph.remove_node(string) + + stat_dict['nodes'] = len(graph.nodes) + stat_dict['edges'] = len(graph.edges) + + #finds root hubs (senses) within the graph + more filters for these + print('[a]', 'Collecting root hubs.\t('+topic_name+')') + + sense_count = min(config.sense_count, len(graph.nodes)) + + root_hub_list = sorted([(value,key) for key,value in node_freq_dict.items() if key in graph.nodes], + reverse=True)[:sense_count] + root_hub_list = [hub[1] for hub in root_hub_list] + + #adds sense inventory to buffer with some common neighbors for context + stat_dict['hubs'] = dict() + + for root_hub in root_hub_list: + + by_frequency = lambda node: edge_freq_dict[root_hub,node] \ + if root_hub < node \ + else edge_freq_dict[node, root_hub] + + most_frequent_neighbor_list = sorted(graph.adj[root_hub], + key=by_frequency, reverse=True) + + stat_dict['hubs'][root_hub] = most_frequent_neighbor_list[:6] + + return graph, root_hub_list, stat_dict + + +def bag_of_senses(graph: nx.Graph, root_hub_list:list) -> dict: + """ + Matches each node to the root hub it is closest to. + """ + + root_hub_count = len(root_hub_list) + + bag = {i:[] for i in range(root_hub_count)} + + for node in graph.nodes: + + score = [0] * root_hub_count + + for i in range(root_hub_count): + + root = root_hub_list[i] + + if nx.has_path(graph, node, root): + path = nx.shortest_path(graph, node, root, 'weight') + score[i] = 1/(1+len(path)) + + bag[np.argmax(score)].append(node) + + return bag + + +def disambiguate(bag_of_senses: dict, context_list: list) -> dict: + """ + Lesk. + """ + + context_idx = 0 + + mapping_dict = dict() + + for context in context_list: + + context_idx += 1 + score = [0] * len(bag_of_senses) + + processed_context = nlp(context) + + text_list = [token.text for token in processed_context] + + for text in text_list: + + for sense, words in bag_of_senses.items(): + + if text in words: + + score[sense] += 1 + + sense = np.argmax(score) + + if sense in mapping_dict: + mapping_dict[sense].append(context_idx) + else: + mapping_dict[sense] = [context_idx] + + return mapping_dict + + +def print_stats(stat_dict: dict) -> None: + """Prints various statistics and logs them to file. + + Args: + stat_dict: Dictionary with various statistics. 
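+            The keys written to the tab-separated log are 'target', 'nodes',
+            'edges', 'L', 'C', 'L_rand', 'C_rand', 'clusters', 'a_mean_size',
+            'h_mean_size' and 'pipe_gain'; 'hubs' maps each root hub to a
+            list of its most frequent neighbours and is only printed.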
+ + """ + + stat_string = [] + + ts = time.gmtime() + + key_list= ['target','nodes','edges','L','C','L_rand','C_rand','clusters','a_mean_size','h_mean_size','pipe_gain'] + + stat_string.append('Topic: {}.'.format(stat_dict['target'])) + stat_string.append('Processed {} at {}.'.format(time.strftime("%Y-%m-%d", ts),time.strftime("%H:%M:%S", ts))) + stat_string.append('Nodes: {}\tEdges: {}.'.format(stat_dict['nodes'],stat_dict['edges'])) + stat_string.append('Characteristic path length: {}.'.format(stat_dict['L'])) + stat_string.append('Global clustering coefficient: {}.'.format(stat_dict['C'])) + stat_string.append('Mean cluster length (arithmetic): {}.'.format(stat_dict['a_mean_size'])) + stat_string.append('Mean cluster length (harmonic): {}.'.format(stat_dict['h_mean_size'])) + stat_string.append('Number of clusters: {}.'.format(stat_dict['clusters'])) + stat_string.append('Tuples gained through merging: {}.'.format(stat_dict['pipe_gain'])) + stat_string.append('Sense inventory:') + for hub in stat_dict['hubs'].keys(): + stat_string.append(' -> {}: {}.'.format(hub, ", ".join(stat_dict['hubs'][hub]))) + + print('\n[A] '+'\n[A] '.join(stat_string)+'\n') + + with open('../baseline/statistics.txt', 'a') as stat_file: + + stat_file.write('\n '.join(stat_string)+'\n\n') + + write_header = not os.path.exists('.statistics.tsv') + + with open('../baseline/.statistics.tsv', 'a') as stat_file: + + if write_header: + + stat_file.write('\t'.join(key_list)+'\n') + + stat_file.write('\t'.join([str(stat_dict[key]) for key in key_list])+'\n') + + + + + + +def global_clustering_coefficient(graph: nx.Graph) -> float: + """Calculates global clustering coefficient from graph. + + Iterates over every node and calculates the global coefficient as a mean + of every local clustering coefficient. + + Args: + graph: Undirected graph. + + Returns: + Global coefficient. + """ + + local_coefficient_list = list() + + for node in graph.nodes: + + neighbor_list = graph.adj[node] + + neighbor_edge_list = [(x,y) for x in neighbor_list + for y in neighbor_list if x<y] + + if len(neighbor_edge_list) == 0: + + local_coefficient_list.append(0) + + else: + + edge_count = 0 + for x,y in neighbor_edge_list: + if graph.has_edge(x,y): + edge_count += 1 + + local_coefficient_list.append(edge_count/len(neighbor_edge_list)) + + return np.mean(local_coefficient_list) + + +def characteristic_path_length(graph: nx.Graph) -> float: + """Calculates characteristic path length from graph. + + Iterates over every node tuple and calculates the shortest path between them. + The average path length is returned. Tuples without path are ignored. + + Args: + graph: Undirected graph. + + Returns: + Global coefficient. + """ + + path_length_list = list() + + path_list = [(x,y) for x in graph.nodes for y in graph.nodes if x<y] + + for path in path_list: + + if nx.has_path(graph,*path): + + shortest_path = nx.shortest_path(graph,*path) + + path_length_list.append(len(shortest_path)) + + return np.mean(path_length_list) + + +def main(topic_id: int, topic_name: str, result_dict: dict) -> None: + """Calls induction and disambiguation functions, performs main task. + + The task is to both induce senses and match search results to them. This + function calls in much the same way induce() and disambiguate_mst() to + perform these sub tasks. The result is then written to the output directory + specified in config.py. + + Args: + topic_id: Index of topic in topics.txt. + topic_name: Target string. 
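+            Multi-word targets are treated as underscore-joined
+            (cf. frequencies() and induce()).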
+ result_dict: Dictionary with topic_id as key and list of search queries + (from results.txt) as values. + + """ + + if topic_name in [output_file_name.replace('.absinth', '') + for output_file_name in os.listdir(config.base_out)]: + return None + + else: + + print('[a]', 'Inducing word senses for {}.'.format(topic_name)) + + graph, root_hub_list, stat_dict = induce(topic_name, result_dict[topic_id]) + + stat_dict['L'] = characteristic_path_length(graph) + stat_dict['C'] = global_clustering_coefficient(graph) + + edge_count = len(graph.edges) + node_count = len(graph.nodes) + mean_degree = edge_count/node_count + + stat_dict['L_rand'] = np.log(node_count)/np.log(mean_degree) + stat_dict['C_rand'] = 2 * mean_degree/node_count + + print('[a]', 'Disambiguating results.\t('+topic_name+')') + + bag = bag_of_senses(graph, root_hub_list) + mapping_dict = disambiguate(bag, result_dict[topic_id]) + + mapping_list = [item[1] for item in sorted(mapping_dict.items())] + mapping_count = len(mapping_list) + + stat_dict['pipe_gain'] = None + + #collect statistics from result. + cluster_count = 0 + cluster_length_list = list() + + for cluster,result_list in mapping_dict.items(): + + cluster_length = len(result_list) + + if cluster_length != 0: + + cluster_count += 1 + cluster_length_list.append(cluster_length) + + stat_dict['h_mean_size'] = stats.hmean(cluster_length_list) + stat_dict['a_mean_size'] = np.mean(cluster_length_list) + stat_dict['clusters'] = cluster_count + + print('[a]', 'Writing to file.\t('+topic_name+')') + + output_path = config.base_out + output_file_name = output_path+topic_name+'.absinth' + + with open(output_file_name, 'w') as output_file: + + output_file.write('subTopicID\tresultID\n') + + for cluster_id,result_list in mapping_dict.items(): + for result_id in result_list: + output_line = '{}.{}\t{}.{}\n'.format(topic_id, cluster_id, + topic_id, result_id) + output_file.write(output_line) + + print_stats(stat_dict) + + + + +if __name__ == '__main__': + """Check for modifiers and call main(). + + Only called when absinth.py is started manually. Checks for various + modifiers, i.e. test environment and number of processes to run + simultaneously. + """ + + # If absinth.py is run in test environment. + if '-t' in sys.argv: + data_path = config.test + else: + data_path = config.dataset + + result_dict, topic_dict = read_dataset(data_path) + + # Enables manual setting of process count. + if '-p' in sys.argv: + + process_count = int(sys.argv[sys.argv.index('-p') + 1]) + + with Pool(process_count) as pool: + + parameter_list = [(topic_id, topic_name, result_dict) + for topic_id,topic_name in topic_dict.items()] + pool.starmap(main, sorted(parameter_list)) #determineate function + + else: + + for topic_id, topic_name in sorted(topic_dict.items()): + main(topic_id, topic_name, result_dict)