diff --git a/src/absinth.py b/src/absinth.py
index 7dcf5309b8aaddc065d8dba92da1a15d2cd2eeca..a347bd1e8864e5e34d4328b818fc547aed7fea1a 100644
--- a/src/absinth.py
+++ b/src/absinth.py
@@ -44,7 +44,7 @@ def frequencies(target_string, search_result_list):
 
     bracketed_target_string = '('+target_string+')'
 
-    # Remove unnecessary tokens from snippets
+    # Remove unnecessary tokens from snippets.
     _search_result_list = list()
     for r in search_result_list:
         r = r.replace('<b>', '')
@@ -53,13 +53,12 @@ def frequencies(target_string, search_result_list):
         r = r.strip()
         _search_result_list.append(r)
 
-    #initialises frequencies with counts from results
+    # Initialise frequencies with counts from results.
     node_freq_dict, edge_freq_dict = process_file(_search_result_list,
                                                   target_string,
                                                   dict(),
                                                   dict())
 
-    #names of corpus files
     corpus_file_path_list = [corpus_path + f for f in os.listdir(corpus_path)]
     corpus_size = len(corpus_file_path_list)
 
@@ -69,7 +68,7 @@ def frequencies(target_string, search_result_list):
         node_count = len(node_freq_dict)
         edge_count = len(edge_freq_dict)
 
-        #prints update after every 11th of the corpus is parsed
+        # Print update after every 11th of the corpus is parsed.
         if processed_file_count % int(corpus_size/11) == 0:
 
             file_ratio = processed_file_count / corpus_size
@@ -78,7 +77,7 @@ def frequencies(target_string, search_result_list):
 
             ratios = [file_ratio, max_node_ratio, max_edge_ratio]
 
-            #uses the ratio closest to 100%.
+            # Use ratio closest to 100%.
             highest_ratio = int((max(ratios))*100)
 
             print('[a] ~{:02d}%\tNodes: {}\tEdges: {}\t{}.'.format(highest_ratio,
@@ -86,7 +85,6 @@ def frequencies(target_string, search_result_list):
                                                                    edge_count,
                                                                    bracketed_target_string))
 
-        #checks maximum node values
         if node_count > max_node_count:
             print('[a] 100%\tNodes: {}\tEdges: {}\t{}.'.format(node_count,
                                                                edge_count,
@@ -148,11 +146,11 @@ def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
 
         for context in context_list:
             context = context.lower()
 
-            if spaced_target_string in context: #greedy pre selection, not perfect
+            if spaced_target_string in context: # Greedy pre-selection, not perfect.
 
-                token_set = set() #set of node candidates
+                token_set = set()
 
-                #This replacement allows target to be treated as single entity.
+                # Allow target to be treated as single entity.
                 context = context.replace(spaced_target_string, target_string)
                 processed_context = nlp(context)
@@ -160,15 +158,15 @@ def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
 
                 for token in processed_context:
 
-                    #doesn't add target word to nodes
+                    # Do not add target word to nodes.
                     if token.text == target_string:
                         pass
 
-                    #doesn't add stop words to nodes
+                    # Do not add stop words to nodes.
                     elif token.text in stopword_list:
                         pass
 
-                    #only adds tokens with allowed tags to nodes
+                    # Add only tokens with allowed tags to nodes.
                     elif token.tag_ in allowed_tag_list:
                         token_set.add(token.text)
 
@@ -190,14 +188,14 @@ def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
 
                         else:
                             edge_freq_dict[edge] = 1
 
-    #if a file is corrupted (can't always be catched with if-else)
+    # If a file is corrupted (cannot always be caught with if-else), ignore it.
     except UnicodeDecodeError:
         pass
 
 
     return node_freq_dict, edge_freq_dict
 
-#build graph from frequency dictionaries
+
 def build_graph(node_freq_dict, edge_freq_dict):
     """Builds undirected weighted graph from dictionaries.
@@ -221,13 +219,11 @@ def build_graph(node_freq_dict, edge_freq_dict):
 
     cooccurence_graph = nx.Graph()
 
-    #node : node frequency
     for node, frequency in node_freq_dict.items():
 
         if frequency >= min_node_freq:
             cooccurence_graph.add_node(node)
 
-    #edge : edge frequency
     for node_tuple, frequency in edge_freq_dict.items():
 
         if frequency < min_edge_freq:
@@ -265,166 +261,246 @@ def build_graph(node_freq_dict, edge_freq_dict):
 
     return cooccurence_graph
 
-#Identifies senses by choosing nodes with high degrees
-def root_hubs(graph, edge_freq_dict, min_neighbors=4, theshold=0.8):
+def root_hubs(graph, edge_freq_dict):
+    """Identifies senses (root hubs) by choosing nodes with high degrees.
+
+    Selects root hubs according to the algorithm in Véronis (2004). Nodes with
+    high degree and neighbors with low weights (high cooccurrence) are chosen
+    until there are no more viable candidates. A root hub candidate is every
+    node that is not already a hub and is not a neighbor of one.
+
+    Args:
+        graph: Weighted undirected graph.
+        edge_freq_dict: Dictionary of cooccurrence frequencies for every edge
+            tuple in the graph.
+
+    Returns:
+        hub_list: List of root hubs, i.e. strings that are selected using the
+            algorithm explained above.
+    """
 
     min_neighbors = config.min_neighbors
     threshold = config.threshold
 
-    G = deepcopy(graph)
-    V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True) # sorts according to degree
-    H = list() #output list
+    # Allow operations on graph without altering original one.
+    graph_copy = deepcopy(graph)
+
+    # Sort according to degree (number of neighbors).
+    candidate_list = sorted(graph_copy.nodes,
+                            key=lambda node: graph_copy.degree[node],
+                            reverse=True)
+
+    hub_list = list()
 
-    while V:
+    # While there are still candidates, search for root hubs.
+    while candidate_list:
 
-        v = V[0] #best hub candidate
+        candidate = candidate_list[0]  # Best hub candidate.
 
-        if G.degree[v] >= min_neighbors:
+        if graph_copy.degree[candidate] >= min_neighbors:
 
-            mfn = sorted(G.adj[v], key=lambda key: edge_freq_dict[v,key] if v < key else edge_freq_dict[key, v], reverse=True)[:min_neighbors] #most frequent neighbors
+            by_frequency = lambda node: edge_freq_dict[candidate,node] \
                                         if candidate < node \
                                         else edge_freq_dict[node,candidate]
+
+            most_frequent_neighbor_list = sorted(graph_copy.adj[candidate],
                                                  key=by_frequency,
                                                  reverse=True)[:min_neighbors]
 
-            if np.mean([G.edges[v,n]['weight'] for n in mfn]) < theshold: #if the median weight of the most frequent neighbors is under threshold
+            # If the most frequent neighbors cooccur frequently enough with
+            # the candidate (mean weight below threshold), approve it.
+            if np.mean([graph_copy.edges[candidate,node]['weight']
                         for node in most_frequent_neighbor_list]) < threshold:
 
-                H.append(v)
+                # Add candidate as root hub.
+                hub_list.append(candidate)
 
-                #removes neighbors of new hub as hub candidates
-                for nbr in deepcopy(G).adj[v]:
+                # Remove neighbors of new hub as hub candidates.
+                for neighbor in deepcopy(graph_copy).adj[candidate]:
+                    graph_copy.remove_node(neighbor)
 
-                    G.remove_node(nbr)
-
-            #removes hub candidate
-            G.remove_node(v)
+            # Remove hub candidate.
+            graph_copy.remove_node(candidate)
 
-            #reorderd potential hubs after deletions
-            V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True)
+            # Reorder potential hubs after deletions.
+            candidate_list = sorted(graph_copy.nodes,
                                     key=lambda node: graph_copy.degree[node],
                                     reverse=True)
 
         else:
 
-            return H
+            return hub_list
 
-    return H
+    return hub_list
 
-#Components algorithm from Véronis (2004), converts graph for target into a MST
-def components(graph, hubs, target_string):
+def components(graph, root_hub_list, target_string):
+    """Builds minimum spanning tree from graph and removes singletons.
 
-    G = deepcopy(graph)
-    H = hubs #root hubs
-    t = target_string
+    Applies the components algorithm from Véronis (2004) and removes singletons.
 
-    #G.add_node(t)
-    #for h in H:
-        #G.add_edge(t,h,weight=0)
+    Args:
+        graph: Undirected weighted graph.
+        root_hub_list: List of strings of root hubs of graph.
+        target_string: Root of minimum spanning tree.
 
-    T = nx.minimum_spanning_tree(G)
+    Returns:
+        minimum_spanning_tree: Minimum spanning tree with target as
+            root and root hubs as direct children. Singletons removed.
+    """
+
+    graph_copy = deepcopy(graph)
+
+    graph_copy.add_node(target_string)
+    for root_hub in root_hub_list:
+        graph_copy.add_edge(target_string,root_hub,weight=0)
+
+    minimum_spanning_tree = nx.minimum_spanning_tree(graph_copy)
 
-    #removes singletons
-    for node in deepcopy(T).nodes:
-        if len(T.adj[node]) == 0:
-            T.remove_node(node)
+    # Remove singletons; deepcopy allows iteration while the tree is altered.
+    for node in deepcopy(minimum_spanning_tree).nodes:
+        if len(minimum_spanning_tree.adj[node]) == 0:
+            minimum_spanning_tree.remove_node(node)
 
-    return T
+    return minimum_spanning_tree
 
-#Calculates score for a given path in a minimum spanning tree
-def score(graph, from_node, to_node):
+def score(graph, component, root_hub_list):
+    """Calculates score for a given component in a minimum spanning tree.
 
-    #if correct tree
-    if nx.has_path(graph, from_node, to_node):
-
-        # calculates shortest path (approximation for path with lowest total weight)
-        path = nx.shortest_path(graph, from_node, to_node, 'weight')
-        total_weight = 0
+    First the closest root hub for the component is chosen. If no root hub is
+    reachable, an all-zero array is returned. A score is calculated from the
+    distance between the component and its root hub and returned in an array
+    that is zero everywhere else.
 
-        #adds weights of every sub-path
-        for i in range(1, len(path)):
-            sub_from, sub_to = path[i-1], path[i]
-            total_weight += graph[sub_from][sub_to]['weight']
+    Args:
+        graph: Minimum spanning tree.
+        component: Node (string) from which the distances are to be calculated.
+        root_hub_list: List of strings of root hubs (senses) of original graph.
 
-        #the further the path, the lower the score
-        return 1/(1+total_weight)
-
-    else:
-
-        return 0
+    Returns:
+        score_array: Array with one score at the index of the chosen root hub
+            and zeroes elsewhere.
+    """
+
+    root_hub_count = len(root_hub_list)
+
+    # Initialise score array.
+    score_array = np.zeros(root_hub_count)
+
+    # Find root of component.
+    distance_list = list()
+    for root_hub in root_hub_list:
+        if nx.has_path(graph, component, root_hub):
+            distance_list.append(1/(1+len(nx.shortest_path(graph, component, root_hub))))
+        else:
+            distance_list.append(0)
+
+    if sum(distance_list) == 0:
+        return score_array
+
+    root_idx = np.argmax(distance_list)
+    root = root_hub_list[root_idx]
+
+    shortest_path = nx.shortest_path(graph, component, root, 'weight')
+    total_weight = 0
+
+    # Add weights of every sub-path.
+    for i in range(1, len(shortest_path)):
+        sub_from, sub_to = shortest_path[i-1], shortest_path[i]
+        total_weight += graph[sub_from][sub_to]['weight']
+    score_array = np.zeros(root_hub_count)
+    score_array[root_idx] = 1/(1+total_weight)
+
+    return score_array
 
-# Basically Word Sense Disambiguation, matches context to sense
-def disambiguate(mst, hubs, contexts, target_string):
+
+def disambiguate(minimum_spanning_tree, root_hub_list,
+                 context_list, target_string):
+    """Matches contexts to senses.
+
+    Adds up scores for each token in a context string and matches the context
+    to the root hub with the highest score.
+
+    Args:
+        minimum_spanning_tree: Minimum spanning tree with target as root.
+        root_hub_list: List of strings of root hubs (senses).
+        context_list: List of sentence strings that are to be clustered.
+        target_string: String of target word, also root of MST.
+
+    Returns:
+        mapping_dict: Dictionary with cluster indices (senses) as keys and
+            lists of context ids as values.
+    """
 
     target_string = target_string.replace('_', ' ')
 
-    T = mst #minimum spanning tree
-    H = hubs #root hubs
-    C = [c.lower().strip().replace(target_string, '') for c in contexts] #cleaned up contexts
+    context_list = [context.lower().strip().replace(target_string, '')
                     for context in context_list]
 
     score_dict = dict() #memoisation for scores
-    mapping_dict = {topic:[] for topic in range(1,len(H)+1)} #output of function
+    mapping_dict = {topic:[] for topic in range(1,len(root_hub_list)+1)}
 
     #if no sense is found for a target word, we should assume that there only is one sense
-    if len(H) == 0:
+    if len(root_hub_list) == 0:
 
-        return {0:[i for i in range(1, len(C)+1)]}
+        return {0:[i for i in range(1, len(context_list)+1)]}
 
     idx = 0
 
-    for c in C:
+    for context in context_list:
 
         idx += 1 #index based on position in list
 
-        doc = nlp(c) #parsed context
-        texts = [tok.text for tok in doc] #tokens
+        processed_context = nlp(context)
+        text_list = [token.text for token in processed_context]  # Tokens.
 
-        scores = np.zeros(len(H)) #initialise with zeros for every sense
+        score_array = np.zeros(len(root_hub_list))  # One entry per sense.
 
-        for text in texts:
+        for text in text_list:
 
-            if text in T.nodes: #if word wasn't filtered out
-
-                new_scores = list() #scores to be added to total scores
+            if text in minimum_spanning_tree.nodes:  # Word was not filtered out.
 
-                for h in H: #for each hub
-
-                    if (text, h) in score_dict: #memoisation
+                if text in score_dict:  # Memoisation.
 
-                        new_scores.append(score_dict[(text,h)])
+                    new_score = score_dict[text]
+
+                else:
 
-                    else:
-
-                        new_score = score(T, text, h)
-                        new_scores.append(new_score)
-                        score_dict[(text,h)] = new_score #memoisation
+                    new_score = score(minimum_spanning_tree,
                                       text, root_hub_list)
+                    score_dict[text] = new_score  # Memoisation.
 
-                scores = scores + np.array(new_scores)
+                score_array += new_score
 
             else:
 
                 pass
 
-        #if the disambiguator could not detect a sense, it should return a singleton, ie. nothing
-        if np.max(scores) == 0:
+        # If disambiguator does not detect a sense, return singleton.
+        if np.max(score_array) == 0:
 
             pass
 
         else:
 
-            #applies sense with the highest score to context
-            max_score = np.max(scores)
-            argmax_score = np.argmax(scores)
+            # Apply sense with the highest score to context.
+            max_score = np.max(score_array)
+            argmax_score = np.argmax(score_array)
 
-            #clusters begin at 1
+            # Clusters begin at 1.
             mapping_dict[argmax_score + 1].append(idx)
 
     return mapping_dict
 
 
 # our main function, here the main stepps for word sense induction are called
-def word_sense_induction(topic_id, topic_name, results):
+def word_sense_induction(topic_id, topic_name, result_list):
 
     #buffer for useful information
     out_buffer = '\n'
 
     #path for output(directory)
-    output_path = './test/'#config.output
+    output_path = config.output
 
     #removes trailing new_lines
     old_target_string = topic_name.strip() #original target
@@ -449,7 +525,7 @@ def word_sense_induction(topic_id, topic_name, results):
 
     #counts occurences of single words, as well as cooccurrences, saves it in dictionary
     print('[a]', 'Counting nodes and edges.\t('+old_target_string+')')
-    node_freq_dict, edge_freq_dict = frequencies(target_string, results[topic_id])
+    node_freq_dict, edge_freq_dict = frequencies(target_string, result_list[topic_id])
 
     #builds graph from these dictionaries, also applies multiple filters
     print('[a]', 'Building graph.\t('+old_target_string+')')
@@ -474,20 +550,20 @@ def word_sense_induction(topic_id, topic_name, results):
     T = components(G, H, target_string)
 
     #matches senses to clusters
     print('[a]', 'Disambiguating results.\t('+old_target_string+')')
-    D = disambiguate(T, H, results[topic_id], target_string)
+    D = disambiguate(T, H, result_list[topic_id], target_string)
 
     out_buffer += ('[A] Mapping: \n')
-    for cluster,results in D.items():
-        out_buffer += (' {}. : {}\n'.format(cluster, ', '.join([str(r) for r in results])))
+    for cluster, result_id_list in D.items():
+        out_buffer += (' {}. : {}\n'.format(cluster, ', '.join([str(r) for r in result_id_list])))
 
     #prints buffer
     print('[a]', 'Writing to file.\t('+old_target_string+')')
     print(out_buffer)
 
     #writes clustering to file
-    for cluster,results in D.items():
-        for result in results:
+    for cluster, result_id_list in D.items():
+        for result in result_id_list:
             f.write(topic_id+'.'+str(cluster)+'\t'+topic_id+'.'+str(result)+'\n')
 
     f.close()
@@ -526,7 +602,7 @@ def read_dataset(data_path):
 
 
 def main():
-    # If absinth.py is run in test environment
+    # If absinth.py is run in test environment.
    if '-t' in sys.argv:
        data_path = config.test
    else:
@@ -534,7 +610,13 @@ def main():
 
    results, topics = read_dataset(data_path)
 
-    with Pool(2) as pool:
+    # Enable manual setting of process count.
+    if '-p' in sys.argv:
+        process_count = int(sys.argv[sys.argv.index('-p') + 1])
+    else:
+        process_count = 1
+
+    with Pool(process_count) as pool:
        parameter_list = [(topic_id, topic_name, results)
                          for topic_id,topic_name in topics.items()]
        pool.starmap(word_sense_induction, parameter_list)
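
Note (not part of the patch): a minimal sketch of the new score() contract on a hand-built toy tree. It assumes the patched module can be imported as absinth (src/ on PYTHONPATH, module-level spaCy/config setup succeeding); the graph, weights and words below are invented purely for illustration.

    import networkx as nx

    from absinth import score

    # Toy minimum spanning tree; edge weights are made up.
    toy_tree = nx.Graph()
    toy_tree.add_edge('jaguar', 'cat', weight=0.2)
    toy_tree.add_edge('jaguar', 'car', weight=0.3)
    toy_tree.add_edge('cat', 'feline', weight=0.1)

    root_hub_list = ['cat', 'car']

    # score() now returns one entry per root hub instead of a single float:
    # 'feline' is closest to the 'cat' hub, so only index 0 is non-zero,
    # namely 1/(1 + 0.1) for the weight of the path feline-cat.
    print(score(toy_tree, 'feline', root_hub_list))
    # expected: array([0.90909091, 0.        ])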
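
Usage sketch for the new command-line switches (exact data locations depend on the local config module): -t selects config.test as the data path, and -p reads the value immediately following it in sys.argv, so the worker count must come directly after the flag.

    python src/absinth.py -t -p 4

With -p omitted, the script now falls back to a single worker process instead of the previously hard-coded Pool(2).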