From 657eb8e569d0c5b9d0005db574de7c2453cb5475 Mon Sep 17 00:00:00 2001
From: Victor Zimmermann <zimmermann@cl.uni-heidelberg.de>
Date: Thu, 15 Mar 2018 17:45:26 +0100
Subject: [PATCH] Started renaming and commenting code.

---
 src/absinth.py | 376 +++++++++++++++++++++++++++++++------------------
 1 file changed, 242 insertions(+), 134 deletions(-)

diff --git a/src/absinth.py b/src/absinth.py
index 4c1b106..7dcf530 100644
--- a/src/absinth.py
+++ b/src/absinth.py
@@ -1,168 +1,272 @@
+#!/usr/bin/env python3
+
 import sys
+import matplotlib
+matplotlib.use("Agg")
 
 print('[A] Loading ' + sys.argv[0] + '.\n')
 
 import os # for reading files
 import networkx as nx # for visualisation
 from copy import deepcopy
 from nltk.corpus import stopwords
 import numpy as np # for calculations
-import config
+import re
 import spacy # for nlp
 from multiprocessing import Pool
 import random
 import matplotlib.pyplot as plt
+import config
 
 nlp = spacy.load('en') # standard english nlp
 
-#counts occurences of nodes and cooccurrences
-def frequencies(corpus_path, target, results):
-
-    max_nodes = config.max_nodes
-    max_edges = config.max_edges
+def frequencies(target_string, search_result_list):
+    """Counts occurrences of nodes and cooccurrences.
 
-    results = [r.replace('<b>', '').replace('</b>', '').replace(r'\\', '').strip() for r in results]
-    node_freq, edge_freq = process_file(results, target) #initialises frequencies with counts from results
+    Iterates over the corpus (and snippets provided with the task) line by line
+    and counts every token and tuple of tokens within a line (context). These
+    tokens are filtered by stop words, POS tags and context length.
 
-    files = [corpus_path + f for f in os.listdir(corpus_path)] #file names of corpus files
-
-    i = 0 #for update print statements
-    for f in files:
+    Args:
+        target_string: Contexts are selected if they contain this string. For
+            further processing this string is removed from the contexts.
+        search_result_list: List of titles and snippets provided with the task.
 
-        if i % int(len(files)/11) == 0: #prints update after every 10th of the corpus is parsed
+    Returns:
+        node_freq_dict: Dictionary of occurrences of every eligible token
+            within every context the target occurs in.
+        edge_freq_dict: Dictionary of occurrences of every eligible tuple of
+            tokens within every context the target occurs in.
+
+    """
+
+    corpus_path = config.corpus
+    max_node_count = config.max_nodes
+    max_edge_count = config.max_edges
+
+    bracketed_target_string = '('+target_string+')'
+
+    # Remove unnecessary tokens from snippets
+    _search_result_list = list()
+    for r in search_result_list:
+        r = r.replace('<b>', '')
+        r = r.replace('</b>', '')
+        r = r.replace(r'\\', '')
+        r = r.strip()
+        _search_result_list.append(r)
+
+    #initialises frequencies with counts from results
+    node_freq_dict, edge_freq_dict = process_file(_search_result_list,
+                                                  target_string,
+                                                  dict(),
+                                                  dict())
+
+    #names of corpus files
+    corpus_file_path_list = [corpus_path + f for f in os.listdir(corpus_path)]
+    corpus_size = len(corpus_file_path_list)
+
+    processed_file_count = 0
+    for corpus_file_path in corpus_file_path_list:
+
+        node_count = len(node_freq_dict)
+        edge_count = len(edge_freq_dict)
+
+        #prints update after every 11th of the corpus is parsed
+        if processed_file_count % int(corpus_size/11) == 0:
 
-            file_ratio = i/len(files[:])
-            max_node_ratio = len(node_freq)/max_nodes
-            max_edge_ratio = len(edge_freq)/max_edges
+            file_ratio = processed_file_count / corpus_size
+            max_node_ratio = node_count / max_node_count
+            max_edge_ratio = edge_count / max_edge_count
 
             ratios = [file_ratio, max_node_ratio, max_edge_ratio]
 
             #uses the ratio closest to 100%.
-            percentage = int((max(ratios))*100)
+            highest_ratio = int((max(ratios))*100)
 
-            print('[a] ~{:02d}%\tNodes: {}\tEdges: {}.'.format(percentage, len(node_freq), len(edge_freq))+'\t('+target+')')
+            print('[a] ~{:02d}%\tNodes: {}\tEdges: {}\t{}.'.format(highest_ratio,
+                                                                   node_count,
+                                                                   edge_count,
+                                                                   bracketed_target_string))
 
         #checks maximum node values
-        if len(node_freq) > max_nodes:
-            print('[a] 100%\tNodes: {}\tEdges: {}.'.format(len(node_freq), len(edge_freq))+'\t('+target+')')
-            return node_freq, edge_freq
-
-        #checks maximum edge values
-        if len(edge_freq) > max_edges:
-            print('[a] 100%\tNodes: {}\tEdges: {}.'.format(len(node_freq), len(edge_freq))+'\t('+target+')')
-            return node_freq, edge_freq
-
-        with open(f, 'r') as lines: #parses single file
+        if node_count > max_node_count:
+            print('[a] 100%\tNodes: {}\tEdges: {}\t{}.'.format(node_count,
+                                                               edge_count,
+                                                               bracketed_target_string))
+            return node_freq_dict, edge_freq_dict
+
+        if edge_count > max_edge_count:
+            print('[a] 100%\tNodes: {}\tEdges: {}\t{}.'.format(node_count,
+                                                               edge_count,
+                                                               bracketed_target_string))
+            return node_freq_dict, edge_freq_dict
+
+        with open(corpus_file_path, 'r') as corpus_file:
 
-            node_freq, edge_freq = process_file(lines, target, node_freq, edge_freq)
+            node_freq_dict, edge_freq_dict = process_file(corpus_file,
+                                                          target_string,
+                                                          node_freq_dict,
+                                                          edge_freq_dict)
 
-        i += 1
+        processed_file_count += 1
 
-    #update print
-    print('[a] 100%\tNodes: {}\tEdges: {}.'.format(len(node_freq), len(edge_freq))+'\t('+target+')')
+    print('[a] 100%\tNodes: {}\tEdges: {}\t{}.'.format(len(node_freq_dict),
+                                                       len(edge_freq_dict),
+                                                       bracketed_target_string))
 
-    return node_freq, edge_freq
+    return node_freq_dict, edge_freq_dict
 
 
-def process_file(lines, target, node_freq=None, edge_freq=None):
-    if node_freq is None:
-        node_freq = dict()
-    if edge_freq is None:
-        edge_freq = dict()
-
-    s_target = target.replace('_', ' ') #target word with spaces
-
-    stop_words = set(stopwords.words('english') + config.stop_words)
-    allowed_tags = config.allowed_tags
+def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
+    """Updates the counts of nodes and edges for a given document and target.
+
+    Amends the input dictionaries with counts from each context within the
+    list of contexts. Furthermore, it filters out small contexts and tokens
+    from the stopword list or with wrong POS tags.
+
+    Args:
+        context_list: List of contexts (lines, paragraphs) that are to be
+            considered for updating the counting dictionaries.
+        target_string: Target string for filtering out every context that does
+            not contain it.
+        node_freq_dict: Dictionary of occurrences of every eligible token
+            within every context the target occurs in.
+        edge_freq_dict: Dictionary of occurrences of every eligible tuple of
+            tokens within every context the target occurs in.
+
+    Returns:
+        node_freq_dict: Updated version of the input node dict.
+        edge_freq_dict: Updated version of the input edge dict.
+    """
+
+    spaced_target_string = target_string.replace('_', ' ')
+
+    stopword_list = set(stopwords.words('english') + config.stop_words)
+    allowed_tag_list = config.allowed_tags
     min_context_size = config.min_context_size
 
     try:
 
-        for line in lines: #parses single paragraph
-
-            line = line.lower()
+        for context in context_list:
+            context = context.lower()
 
-            if s_target in line: #greedy pre selection, not perfect
+            if spaced_target_string in context: #greedy pre selection, not perfect
 
-                tokens = set() #set of node candidates
-                doc = nlp(line.replace(s_target, target)) #nlp processing
+                token_set = set() #set of node candidates
 
-                if target in [t.text for t in doc]: #better selection
+                #This replacement allows target to be treated as single entity.
+                context = context.replace(spaced_target_string, target_string)
+                processed_context = nlp(context)
+
+                if target_string in [token.text for token in processed_context]:
 
-                    for tok in doc:
-
-                        text = tok.text #string value
-                        tag = tok.tag_ #pos tag
+                    for token in processed_context:
 
                         #doesn't add target word to nodes
-                        if text == target:
+                        if token.text == target_string:
                             pass
 
                         #doesn't add stop words to nodes
-                        elif text in stop_words:
+                        elif token.text in stopword_list:
                             pass
 
                         #only adds tokens with allowed tags to nodes
-                        elif tag in allowed_tags:
-                            tokens.add(tok.text)
+                        elif token.tag_ in allowed_tag_list:
+                            token_set.add(token.text)
 
-                    #if there are enough (good) tokens in paragraph
-                    if len(tokens) >= min_context_size:
-                        for token in tokens:
+                    context_size = len(token_set)
+
+                    if context_size >= min_context_size:
+                        for token in token_set:
 
-                            #updates counts for nodes
-                            if token in node_freq:
-                                node_freq[token] += 1
+                            if token in node_freq_dict:
+                                node_freq_dict[token] += 1
                             else:
-                                node_freq[token] = 1
+                                node_freq_dict[token] = 1
 
-                        for edge in {(x,y) for x in tokens for y in tokens if x < y}:
+                        #set of possible edges
+                        for edge in {(x,y) for x in token_set for y in token_set if x < y}:
 
-                            #updates counts for edges
-                            if edge in edge_freq:
-                                edge_freq[edge] += 1
+                            if edge in edge_freq_dict:
+                                edge_freq_dict[edge] += 1
                             else:
-                                edge_freq[edge] = 1
+                                edge_freq_dict[edge] = 1
 
     #if a file is corrupted (can't always be catched with if-else)
     except UnicodeDecodeError:
-        pass
-        #print('Failed to decode:', f)
+        pass
 
-    return node_freq, edge_freq
+    return node_freq_dict, edge_freq_dict
 
 
 #build graph from frequency dictionaries
-def build_graph(node_freq, edge_freq):
+def build_graph(node_freq_dict, edge_freq_dict):
+    """Builds undirected weighted graph from dictionaries.
+
+    Creates graph and appends every edge and node in the parameter dictionaries,
+    given they occur frequently enough. For every edge a weight is calculated.
+
+    Args:
+        node_freq_dict: Dictionary of occurrences of every eligible token
+            within every context the target occurs in.
+        edge_freq_dict: Dictionary of occurrences of every eligible tuple of
+            tokens within every context the target occurs in.
+
+    Returns:
+        cooccurence_graph: Filtered, undirected, dice-weighted small-world
+            cooccurrence graph for a given target entity.
+    """
 
     min_node_freq = config.min_node_freq
     min_edge_freq = config.min_edge_freq
     max_weight = config.max_weight
 
-    G = nx.Graph()
+    cooccurence_graph = nx.Graph()
 
     #node : node frequency
-    for key, value in node_freq.items():
+    for node, frequency in node_freq_dict.items():
 
-        if value >= min_node_freq:
-            G.add_node(key)
+        if frequency >= min_node_freq:
+            cooccurence_graph.add_node(node)
 
     #edge : edge frequency
-    for key, value in edge_freq.items():
+    for node_tuple, frequency in edge_freq_dict.items():
+
+        if frequency < min_edge_freq:
+
+            continue
 
-        if value < min_edge_freq:
+        elif node_tuple[0] not in cooccurence_graph.nodes:
+            continue
 
-        if key[0] not in G.nodes or key[1] not in G.nodes:
+        elif node_tuple[1] not in cooccurence_graph.nodes:
+            continue
 
-        weight = 1 - max(edge_freq[key]/node_freq[key[0]], edge_freq[key]/node_freq[key[1]])
-        if weight <= max_weight:
-            G.add_edge(*key, weight=weight)
+        else:
+
+            cooccurrence_frequency = edge_freq_dict[node_tuple]
+            node0_frequency = node_freq_dict[node_tuple[0]]
+            node1_frequency = node_freq_dict[node_tuple[1]]
+
+            prob_0 = cooccurrence_frequency / node0_frequency
+            prob_1 = cooccurrence_frequency / node1_frequency
+
+            #best_weight = 1 - max(prob_0, prob_1)
+            dice_weight = 1 - ((prob_0 + prob_1) / 2)
+
+            if dice_weight <= max_weight:
+
+                cooccurence_graph.add_edge(*node_tuple, weight=dice_weight)
+
+            else:
+
+                pass
 
-    return G
+    return cooccurence_graph
 
 
 #Identifies senses by choosing nodes with high degrees
-def root_hubs(graph, edge_freq, min_neighbors=4, theshold=0.8):
+def root_hubs(graph, edge_freq_dict, min_neighbors=4, theshold=0.8):
 
     min_neighbors = config.min_neighbors
     threshold = config.threshold
@@ -177,7 +281,7 @@ def root_hubs(graph, edge_freq, min_neighbors=4, theshold=0.8):
 
         if G.degree[v] >= min_neighbors:
 
-            mfn = sorted(G.adj[v], key=lambda key: edge_freq[v,key] if v < key else edge_freq[key, v], reverse=True)[:min_neighbors] #most frequent neighbors
+            mfn = sorted(G.adj[v], key=lambda key: edge_freq_dict[v,key] if v < key else edge_freq_dict[key, v], reverse=True)[:min_neighbors] #most frequent neighbors
 
             if np.mean([G.edges[v,n]['weight'] for n in mfn]) < theshold: #if the median weight of the most frequent neighbors is under threshold
 
@@ -202,11 +306,11 @@ def root_hubs(graph, edge_freq, min_neighbors=4, theshold=0.8):
 
 
 #Components algorithm from Véronis (2004), converts graph for target into a MST
-def components(graph, hubs, target):
+def components(graph, hubs, target_string):
 
     G = deepcopy(graph)
     H = hubs #root hubs
-    t = target
+    t = target_string
 
     #G.add_node(t)
    #for h in H:
@@ -246,12 +350,12 @@ def score(graph, from_node, to_node):
 
 
 # Basically Word Sense Disambiguation, matches context to sense
-def disambiguate(mst, hubs, contexts, target):
+def disambiguate(mst, hubs, contexts, target_string):
 
-    target = target.replace('_', ' ')
+    target_string = target_string.replace('_', ' ')
     T = mst #minimum spanning tree
     H = hubs #root hubs
-    C = [c.lower().strip().replace(target, '') for c in contexts] #cleaned up contexts
+    C = [c.lower().strip().replace(target_string, '') for c in contexts] #cleaned up contexts
 
     score_dict = dict() #memoisation for scores
     mapping_dict = {topic:[] for topic in range(1,len(H)+1)} #output of function
@@ -312,80 +416,73 @@ def disambiguate(mst, hubs, contexts, target):
 
     return mapping_dict
 
 
-def draw_graph(G, name):
-    nx.draw_networkx(G,pos=nx.spring_layout(G), with_labels=True, node_size=40, font_size=9, node_color='#2D98DA')
-    plt.savefig('../figures/'+name+'.png', dpi=200, bbox_inches='tight')
-    plt.clf()
 
 # our main function, here the main stepps for word sense induction are called
-def WSI(topic_id, topic_name, results):
+def word_sense_induction(topic_id, topic_name, results):
 
     #buffer for useful information
     out_buffer = '\n'
 
-    #paths for input (corpus) and output(directory)
-    corpus_path = config.corpus
-    output_path = config.output
+    #path for output(directory)
+    output_path = './test/'#config.output
 
     #removes trailing new_lines
-    old_target = topic_name.strip() #original target
+    old_target_string = topic_name.strip() #original target
 
-    if old_target.strip() in [f.replace('.absinth', '') for f in os.listdir(config.output)]:
+    if old_target_string.strip() in [f.replace('.absinth', '') for f in os.listdir(config.output)]:
 
         return None
 
-    out_buffer += ("[A] Word sense induction for '"+old_target+"':\n")
+    out_buffer += ("[A] Word sense induction for '"+old_target_string+"':\n")
 
    #in topics longer than two words, the leading 'the' can generally be removed without changing the sense
-    if old_target[:4] == 'the_' and old_target.count('_') >= 2:
+    if old_target_string[:4] == 'the_' and old_target_string.count('_') >= 2:
 
-        target = old_target[4:]
+        target_string = old_target_string[4:]
 
     else:
 
-        target = old_target
+        target_string = old_target_string
 
     #writes headline for output files
-    f = open(output_path+target+'.absinth', 'w')
+    f = open(output_path+target_string+'.absinth', 'w')
     f.write('subTopicID\tresultID\n')
 
    #counts occurences of single words, as well as cooccurrences, saves it in dictionary
-    print('[a]', 'Counting nodes and edges.\t('+old_target+')')
-    node_freq, edge_freq = frequencies(corpus_path, target, results[topic_id])
+    print('[a]', 'Counting nodes and edges.\t('+old_target_string+')')
+    node_freq_dict, edge_freq_dict = frequencies(target_string, results[topic_id])
 
    #builds graph from these dictionaries, also applies multiple filters
-    print('[a]', 'Building graph.\t('+old_target+')')
-    G = build_graph(node_freq, edge_freq)
-    draw_graph(G, topic_name.strip()+'_g')
+    print('[a]', 'Building graph.\t('+old_target_string+')')
+    G = build_graph(node_freq_dict, edge_freq_dict)
 
     out_buffer += '[A] Nodes: {}\tEdges: {}\n'.format(str(len(G.nodes)), str(len(G.edges)))
 
    #finds root hubs (senses) within the graph + more filters for these
-    print('[a]', 'Collecting root hubs.\t('+old_target+')')
-    H = root_hubs(G, edge_freq)
+    print('[a]', 'Collecting root hubs.\t('+old_target_string+')')
+    H = root_hubs(G, edge_freq_dict)
 
     out_buffer += '[A] Root hubs:\n'
 
    #adds sense inventory to buffer with some common neighbors for context
    i = 1 #sense index
    for h in H:
 
-        mfn = sorted(G.adj[h], key=lambda x: edge_freq[h,x] if h < x else edge_freq[x, h], reverse=True)[:6]
+        mfn = sorted(G.adj[h], key=lambda x: edge_freq_dict[h,x] if h < x else edge_freq_dict[x, h], reverse=True)[:6]
 
        out_buffer += ('    {}. {}: {}\n'.format(i, h, ', '.join(mfn)))
        i += 1
 
    #performs minimum_spanning_tree algorithm on graph
-    print('[a]', 'Building minimum spanning tree.\t('+old_target+')')
-    T = components(G, H, target)
-    draw_graph(T, topic_name.strip()+'_t')
+    print('[a]', 'Building minimum spanning tree.\t('+old_target_string+')')
+    T = components(G, H, target_string)
 
    #matches senses to clusters
-    print('[a]', 'Disambiguating results.\t('+old_target+')')
-    D = disambiguate(T, H, results[topic_id], target)
+    print('[a]', 'Disambiguating results.\t('+old_target_string+')')
+    D = disambiguate(T, H, results[topic_id], target_string)
 
    out_buffer += ('[A] Mapping: \n')
 
    for cluster,results in D.items():
        out_buffer += ('    {}. : {}\n'.format(cluster, ', '.join([str(r) for r in results])))
 
    #prints buffer
-    print('[a]', 'Writing to file.\t('+old_target+')')
+    print('[a]', 'Writing to file.\t('+old_target_string+')')
    print(out_buffer)
 
    #writes clustering to file
@@ -395,14 +492,8 @@ def WSI(topic_id, topic_name, results):
 
     f.close()
 
 
-
-if __name__ == '__main__':
-    # If absinth.py is run in test environment
-    if '-t' in sys.argv:
-        data_path = config.test
-    else:
-        data_path = config.dataset
+def read_dataset(data_path):
 
    # results.txt includes the queries for a given target word
    results = dict()
@@ -430,10 +521,27 @@ if __name__ == '__main__':
 
            l = line.split('\t')
            topics[l[0]] = l[1]
 
-    # multiprocessing
-    with Pool(5) as pool:
-        # calls WSI() for for topics at a time
-        pool.starmap(WSI, [(key, value, results) for key,value in topics.items()])
+    return results, topics
+
+
+def main():
+
+    # If absinth.py is run in test environment
+    if '-t' in sys.argv:
+        data_path = config.test
+    else:
+        data_path = config.dataset
+
+    results, topics = read_dataset(data_path)
+
+    with Pool(2) as pool:
+        parameter_list = [(topic_id, topic_name, results)
+                          for topic_id,topic_name in topics.items()]
+        pool.starmap(word_sense_induction, parameter_list)
 
-    #for key, value in topics.items():
-        #WSI(key, value, results)
+    #for topic_id,topic_name in topics.items():
+        #word_sense_induction(topic_id,topic_name, results)
+
+
+if __name__ == '__main__':
+    main()
-- 
GitLab
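
Among the many renames above, one algorithmic change is easy to miss: the edge weight in build_graph moves from 1 - max(f_ab/f_a, f_ab/f_b) to the averaged 1 - (f_ab/f_a + f_ab/f_b)/2, which the code calls a dice weight. Below is a minimal standalone sketch of both formulas; the toy counts and the node names are made up for illustration, only the formulas mirror the patch.

    # hypothetical frequencies for one candidate edge
    node_freq_dict = {'piano': 40, 'keyboard': 25}
    edge_freq_dict = {('keyboard', 'piano'): 10}

    pair = ('keyboard', 'piano')
    cooccurrence_frequency = edge_freq_dict[pair]
    prob_0 = cooccurrence_frequency / node_freq_dict[pair[0]]  # 10/25 = 0.40
    prob_1 = cooccurrence_frequency / node_freq_dict[pair[1]]  # 10/40 = 0.25
    old_weight = 1 - max(prob_0, prob_1)                       # 0.60
    dice_weight = 1 - ((prob_0 + prob_1) / 2)                  # 0.675

Lower weights mean stronger association, so the averaged variant only assigns a low weight when both conditional probabilities are high, whereas the old max-based weight already favoured a pair when a single direction was strong. For the same max_weight threshold, the new scheme therefore admits fewer edges into the graph.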