From a9b0962ebbb4688af93e6d400ff0420db96fe09c Mon Sep 17 00:00:00 2001
From: Victor Zimmermann <zimmermann@cl.uni-heidelberg.de>
Date: Mon, 19 Mar 2018 19:02:09 +0100
Subject: [PATCH] Further commenting.

---
 src/absinth.py | 281 +++++++++++++++++++++++++++++--------------------
 1 file changed, 164 insertions(+), 117 deletions(-)

diff --git a/src/absinth.py b/src/absinth.py
index a082b43..9a33af7 100644
--- a/src/absinth.py
+++ b/src/absinth.py
@@ -1,23 +1,56 @@
 #!/usr/bin/env python3

 import sys
-import matplotlib
-matplotlib.use("Agg")

 print('[A] Loading ' + sys.argv[0] + '.\n')

-import os # for reading files
+import config
 import networkx as nx # for visualisation
-from copy import deepcopy
-from nltk.corpus import stopwords
-import numpy as np # for calculations
+import numpy as np
+import os # for reading files
+import pprint
+import random
 import re
 import spacy # for nlp
 from multiprocessing import Pool
-import random
-import matplotlib.pyplot as plt
-import config
+from nltk.corpus import stopwords
+from copy import deepcopy

 nlp = spacy.load('en') # standard english nlp
+
+
+def read_dataset(data_path):
+    """Collects topics.txt and results.txt.
+
+    Args:
+        data_path: Path of the directory that holds results.txt and
+            topics.txt.
+
+    Returns:
+        results: Dictionary mapping each topic id to a list of search result
+            strings (title and snippet joined).
+        topics: Dictionary mapping each topic id to its target word.
+    """
+
+    # results.txt includes the search results for a given target word
+    results = dict()
+
+    with open(data_path+'results.txt', 'r') as results_file:
+
+        for line in results_file.readlines()[1:]:
+
+            l = line.split('\t')
+            id1, _ = l[0].split('.') #the second part of the id is ignored, as it is identical to the list index
+
+            if id1 not in results:
+                results[id1] = list()
+
+            results[id1].append(" ".join(l[2:]).strip()) # here I join title and snippet, the URL is ignored
+
+
+    # topics.txt is a list of target words
+    topics = dict()
+
+    with open(data_path+'topics.txt', 'r') as topics_file:
+
+        for line in topics_file.readlines()[1:]:
+
+            l = line.split('\t')
+            topics[l[0]] = l[1].strip()
+
+    return results, topics
+

 def frequencies(target_string, search_result_list):
     """Counts occurrences of nodes and cooccurrences.
@@ -168,7 +201,10 @@ def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):

             # Add only tokens with allowed tags to nodes.
             elif token.tag_ in allowed_tag_list:
-                token_set.add(token.lemma_)
+                if config.lemma:
+                    token_set.add(token.lemma_)
+                else:
+                    token_set.add(token.text)

     context_size = len(token_set)
@@ -416,78 +452,96 @@ def score(graph, component, root_hub_list):

 def induce(topic_name, result_list):
-    """
-    
+    """Induces word senses for a given topic from the corpus.
+
+    Counts frequencies from the corpus and the search result list, builds a
+    graph from these counts (with some filters). Root hubs (senses) are
+    collected from this graph.
+
+    Args:
+        topic_name: Target string.
+        result_list: List of search result (context) strings.
+
+    Returns:
+        graph: Weighted undirected graph built from the counts.
+        root_hub_list: List of root hub strings (senses).
+        stat_dict: Dictionary of various statistics.
+        Returns None instead if an output file for the topic already exists.
""" - statistics = dict() - - #removes trailing new_lines - old_target_string = topic_name.strip() #original target + stat_dict = dict() - if old_target_string.strip() in [f.replace('.absinth', '') for f in os.listdir(config.output)]: + if topic_name in [output_file_name.replace('.absinth', '') + for output_file_name in os.listdir(config.output)]: + return None - statistics['target'] = old_target_string + else: - #in topics longer than two words, the leading 'the' can generally be removed without changing the sense - if old_target_string[:4] == 'the_' and old_target_string.count('_') >= 2: + stat_dict['target'] = topic_name - target_string = old_target_string[4:] + #in topics longer than two words, the leading 'the' can generally be removed without changing the sense + if topic_name[:4] == 'the_' and topic_name.count('_') > 1: + + target_string = topic_name[4:] + + else: + + target_string = topic_name - else: + print('[a]', 'Counting nodes and edges.\t('+topic_name+')') + node_freq_dict, edge_freq_dict = frequencies(target_string, result_list) - target_string = old_target_string - - #counts occurences of single words, as well as cooccurrences, saves it in dictionary - print('[a]', 'Counting nodes and edges.\t('+old_target_string+')') - node_freq_dict, edge_freq_dict = frequencies(target_string, result_list[topic_id]) - - #builds graph from these dictionaries, also applies multiple filters - print('[a]', 'Building graph.\t('+old_target_string+')') - G = build_graph(node_freq_dict, edge_freq_dict) - - statistics['node count'] = len(G.nodes) - statistics['edge count'] = len(G.edges) + #builds graph from these dictionaries, also applies multiple filters + print('[a]', 'Building graph.\t('+topic_name+')') + graph = build_graph(node_freq_dict, edge_freq_dict) + + stat_dict['node count'] = len(graph.nodes) + stat_dict['edge count'] = len(graph.edges) - #finds root hubs (senses) within the graph + more filters for these - print('[a]', 'Collecting root hubs.\t('+old_target_string+')') - H = root_hubs(G, edge_freq_dict) - - #adds sense inventory to buffer with some common neighbors for context - statistics['hubs'] = dict() - for h in H: - mfn = sorted(G.adj[h], key=lambda x: edge_freq_dict[h,x] if h < x else edge_freq_dict[x, h], reverse=True)[:6] - statistics['hubs'][h] = mfn - - #performs minimum_spanning_tree algorithm on graph - print('[a]', 'Building minimum spanning tree.\t('+old_target_string+')') - T = components(G, H, target_string) + #finds root hubs (senses) within the graph + more filters for these + print('[a]', 'Collecting root hubs.\t('+topic_name+')') + root_hub_list = root_hubs(graph, edge_freq_dict) + + #adds sense inventory to buffer with some common neighbors for context + stat_dict['hubs'] = dict() + + for root_hub in root_hub_list: + + by_frequency = lambda node: edge_freq_dict[root_hub,node] \ + if root_hub < node \ + else edge_freq_dict[node, root_hub] + + most_frequent_neighbor_list = sorted(graph.adj[root_hub], + key=by_frequency, reverse=True) + + stat_dict['hubs'][root_hub] = most_frequent_neighbor_list[:6] - return T, H, statistics + return graph, root_hub_list, stat_dict -def disambiguate(minimum_spanning_tree, root_hub_list, - context_list, target_string): +def disambiguate(graph, root_hub_list, context_list, topic_name): """Matches contexts to senses. - Adds up scores for each token in a context string and matches the context - to the root hub with the highest score. + Builds minimum spanning tree from graph. 
+    Adds up scores based on tree node distance for each token in a context
+    string and matches the context to the root hub with the highest score.

     Args:
-        minimum_spanning_tree: Minimum spanning tree with target as root.
+        graph: Weighted undirected graph.
         root_hub_list: List of strings of root hubs (senses).
         context_list: List of sentence strings that are to be clustered.
-        target_string: String of target word, also root of MST.
+        topic_name: String of target word, also root of MST.

     Returns:
         mapping_dict: Dictionary of root hubs (senses) as keys and context
             ids as values.
     """

-    target_string = target_string.replace('_', ' ')
-    context_list = [context.lower().strip().replace(target_string, '')
+    #performs minimum_spanning_tree algorithm on graph
+    print('[a]', 'Building minimum spanning tree.\t('+topic_name+')')
+    minimum_spanning_tree = components(graph, root_hub_list, topic_name)
+
+    spaced_topic_name = topic_name.replace('_', ' ')
+    context_list = [context.lower().strip().replace(spaced_topic_name, '')
                     for context in context_list]

     score_dict = dict() #memoisation for scores
@@ -505,23 +559,27 @@ def disambiguate(minimum_spanning_tree, root_hub_list,
         idx += 1 #index based on position in list

         processed_context = nlp(context)
-        text_list = [token.text for token in processed_context] #tokens
+
+        if config.lemma:
+            token_list = [token.lemma_ for token in processed_context] #tokens
+        else:
+            token_list = [token.text for token in processed_context] #tokens

         score_array = np.zeros(len(root_hub_list)) #initialise with zeros for every sense

-        for text in text_list:
+        for token in token_list:

-            if text in minimum_spanning_tree.nodes: #if word wasn't filtered out
+            if token in minimum_spanning_tree.nodes: #if word wasn't filtered out

-                if text in score_dict: #memoisation
+                if token in score_dict: #memoisation

-                    new_scores = score_dict[text]
+                    new_score = score_dict[token]

                 else:

                     new_score = score(minimum_spanning_tree,
-                                       text, root_hub_list)
-                    score_dict[text] = new_score #memoisation
+                                      token, root_hub_list)
+                    score_dict[token] = new_score #memoisation

                 score_array += new_score
@@ -546,77 +604,66 @@ def disambiguate(minimum_spanning_tree, root_hub_list,
     return mapping_dict


-def main(topic_id, topic_name, result_list):
-    """
-    
+def main(topic_id, topic_name, result_dict):
+    """Calls induction and disambiguation functions, performs main task.
+
+    The task is to both induce senses and match search results to them. To
+    perform these subtasks, this function calls induce() and disambiguate()
+    in turn. The result is then written to the output directory specified in
+    config.py.
+
+    Args:
+        topic_id: Index of topic in topics.txt.
+        topic_name: Target string.
+        result_dict: Dictionary with topic_id as key and list of search
+            results (from results.txt) as values.
+
+    Returns:
+        None
     """

     print('[a]', 'Inducing word senses for {}.'.format(topic_name))
-    T, H, statistics = induce(topic_name, result_list)
+    induction_result = induce(topic_name, result_dict[topic_id])
+
+    if induction_result is None: #induce() skips topics that already have an output file
+        return None
+
+    graph, root_hub_list, stat_dict = induction_result

     #matches senses to clusters
-    print('[a]', 'Disambiguating result_list.\t('+old_target_string+')')
-    D = disambiguate(T, H, result_list[topic_id], target_string)
+    print('[a]', 'Disambiguating result_list.\t('+topic_name+')')
+    mapping_dict = disambiguate(graph, root_hub_list,
+                                result_dict[topic_id], topic_name)

     #collect statistics from result.
     cluster_count = 0
     cluster_length_list = list()
-    for cluster,result_list in D.items():
+
+    for cluster, result_list in mapping_dict.items():
+
         cluster_length = len(result_list)
+
         if cluster_length != 0:
+
             cluster_count += 1
             cluster_length_list.append(cluster_length)
-    statistics['mean_cluster_length'] = np.mean(cluster_length_list)
-    statistics['cluster_count'] = cluster_count
+
+    stat_dict['mean_cluster_length'] = np.mean(cluster_length_list)
+    stat_dict['cluster_count'] = cluster_count

-    #prints buffer
-    print('[a]', 'Writing to file.\t('+old_target_string+')')
+    print('[a]', 'Writing to file.\t('+topic_name+')')

+    output_path = config.output
+    output_file_name = output_path+topic_name+'.absinth'
+
+    with open(output_file_name, 'w') as output_file:

-    f = open(output_path+old_target_string+'.absinth', 'w')
-
-    f.write('subTopicID\tresultID\n')
+        output_file.write('subTopicID\tresultID\n')

-    #writes clustering to file
-    for cluster,result_list in D.items():
-        for result in result_list:
-            f.write(topic_id+'.'+str(cluster)+'\t'+topic_id+'.'+str(result)+'\n')
-
-    f.close()
-
-
-def read_dataset(data_path):
-
-    # results.txt includes the queries for a given target word
-    results = dict()
-
-    with open(data_path+'results.txt', 'r') as results_file:
-
-        for line in results_file.readlines()[1:]:
-
-            l = line.split('\t')
-            id1, _ = l[0].split('.') #the second part of the id is ignored, as it is identical to the list index
-
-            if id1 not in results:
-                results[id1]=list()
+        for cluster_id, result_list in mapping_dict.items():
+            for result_id in result_list:
+                output_line = '{}.{}\t{}.{}\n'.format(topic_id, cluster_id,
+                                                      topic_id, result_id)
+                output_file.write(output_line)

-            results[id1].append(" ".join(l[2:])) # here I join title and snippet, the URL is ignored
-
-
-    # topics.txt is a list of target words
-    topics = dict()
-
-    with open(data_path+'topics.txt', 'r') as topics_file:
-
-        for line in topics_file.readlines()[1:]:
-
-            l = line.split('\t')
-            topics[l[0]] = l[1]
+    pprint.pprint(stat_dict)

-    return results, topics
-
-
 if __name__ == '__main__':

@@ -626,7 +673,7 @@ if __name__ == '__main__':
     else:
         data_path = config.dataset

-    results, topics = read_dataset(data_path)
+    result_dict, topic_dict = read_dataset(data_path)

     # Enables manual setting of process count.
     if '-p' in sys.argv:
@@ -635,8 +682,8 @@ if __name__ == '__main__':
         process_count = 1

     with Pool(process_count) as pool:
-        parameter_list = [(topic_id, topic_name, results)
-                          for topic_id,topic_name in topics.items()]
+        parameter_list = [(topic_id, topic_name, result_dict)
+                          for topic_id, topic_name in topic_dict.items()]
         pool.starmap(main, parameter_list)

     #for topic_id,topic_name in topics.items():
-- 
GitLab
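
Note for reviewers: the refactored module now takes its dataset path, output path, and lemmatisation switch from config, but config.py itself is not part of this patch. The sketch below is only an assumption about what such a module could contain, inferred from the three attributes the code accesses (config.dataset, config.output, config.lemma); the paths are placeholders, not the project's real settings.

    # config.py -- hypothetical sketch, not part of this commit.
    # absinth.py only reads these three names; the values are placeholders.
    dataset = '/path/to/dataset/'   # directory containing topics.txt and results.txt
    output = '/path/to/output/'     # directory where the <topic>.absinth files are written
    lemma = True                    # True: graph nodes use token.lemma_, False: token.text

With a module like this on the import path, the script is started as plain 'python3 absinth.py', optionally with '-p' to change the number of worker processes; the exact command-line handling is only partially visible in the hunks above.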
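Note for reviewers: read_dataset() slices each tab-separated line of results.txt into an id of the form '<topic>.<result>', a URL, and the remaining title/snippet columns, and each line of topics.txt into an id and a target word, skipping one header line in both files. The snippet below replays that parsing on two invented rows so the resulting dictionaries are easy to see; the column layout and the example values are assumptions inferred from the slicing in this patch, not taken from the real topics.txt/results.txt.

    # Hypothetical miniature of the two input files, parsed the same way as read_dataset().
    results_txt = ("ID\tURL\ttitle\tsnippet\n"                                  # header row, skipped
                   "1.1\thttp://example.org/a\tJaguar cars\tBritish maker of luxury cars.\n"
                   "1.2\thttp://example.org/b\tJaguar (animal)\tLargest cat of the Americas.\n")
    topics_txt = ("ID\ttopic\n"                                                 # header row, skipped
                  "1\tjaguar\n")

    results = dict()
    for line in results_txt.splitlines()[1:]:
        l = line.split('\t')
        id1, _ = l[0].split('.')                     # '1.2' -> topic id '1', result index dropped
        if id1 not in results:
            results[id1] = list()
        results[id1].append(" ".join(l[2:]).strip()) # title and snippet joined, URL ignored

    topics = dict()
    for line in topics_txt.splitlines()[1:]:
        l = line.split('\t')
        topics[l[0]] = l[1].strip()

    print(results) # {'1': ['Jaguar cars British maker of luxury cars.', 'Jaguar (animal) Largest cat of the Americas.']}
    print(topics)  # {'1': 'jaguar'}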