diff --git a/src/absinth.py b/src/absinth.py
index 12484a8fd3a7aa7fdbaeed69887ca9543c00e72e..144cb203c83fe9312b104c16cf6a57c867cf7b1c 100644
--- a/src/absinth.py
+++ b/src/absinth.py
@@ -12,6 +12,7 @@ from multiprocessing import Pool
 
 nlp = spacy.load('en') # standard english nlp
 
+#counts occurrences of nodes and co-occurrences
 def frequencies(corpus_path, target):
     
     stop_words = set(stopwords.words('english') + config.stop_words)
@@ -20,16 +21,16 @@ def frequencies(corpus_path, target):
     max_nodes = config.max_nodes
     max_edges = config.max_edges
     
-    node_freq = dict()
-    edge_freq = dict()
+    node_freq = dict() #counts (potential) nodes
+    edge_freq = dict() #counts (potential) edges
     
-    files = [corpus_path + f for f in os.listdir(corpus_path)]
+    files = [corpus_path + f for f in os.listdir(corpus_path)] #paths of the corpus files
     s_target = target.replace('_', ' ') #target word with spaces
     
-    i = 0
+    i = 0 #for update print statements
     for f in files:
         
-        if i % int(len(files)/10) == 0:
+        if i % int(len(files)/10) == 0: #prints an update after every tenth of the corpus has been parsed
             
             file_ratio = i/len(files[:])
             max_node_ratio = len(node_freq)/max_nodes
@@ -37,48 +38,56 @@ def frequencies(corpus_path, target):
             
             ratios = [file_ratio, max_node_ratio, max_edge_ratio]
             
+            #uses the ratio that is closest to 100%
            percentage = int((max(ratios))*100)
             
             print('[a] ~{:02d}%\tNodes: {}\tEdges: {}.'.format(percentage, len(node_freq), len(edge_freq)), target)
         
+        #stops when the maximum node count is exceeded
         if len(node_freq) > max_nodes:
             return node_freq, edge_freq
         
+        #stops when the maximum edge count is exceeded
         if len(edge_freq) > max_edges:
             return node_freq, edge_freq
         
-        with open(f, 'r') as lines:
+        with open(f, 'r') as lines: #parses a single file
             
             try:
                 
-                for line in lines:
+                for line in lines: #parses a single paragraph
                     
                     line = line.lower()
                     
-                    if s_target in line:
+                    if s_target in line: #greedy pre-selection, not perfect
                         
-                        tokens = set()
-                        doc = nlp(line.replace(s_target, target))
+                        tokens = set() #set of node candidates
+                        doc = nlp(line.replace(s_target, target)) #nlp processing
                         
-                        if target in [t.text for t in doc]:
+                        if target in [t.text for t in doc]: #stricter selection on the tokenised text
                             
                             for tok in doc:
                                 
-                                text = tok.text
-                                tag = tok.tag_
+                                text = tok.text #string value
+                                tag = tok.tag_ #pos tag
                                 
+                                #doesn't add the target word to the nodes
                                 if text == target:
                                     pass
                                 
+                                #doesn't add stop words to the nodes
                                 elif text in stop_words:
                                     pass
                                 
+                                #only adds tokens with allowed tags to the nodes
                                 elif tag in allowed_tags:
                                     tokens.add(tok.text)
                             
+                            #if there are enough (good) tokens in the paragraph
                             if len(tokens) >= min_context_size:
                                 
                                 for token in tokens:
                                     
+                                    #updates counts for nodes
                                     if token in node_freq:
                                         node_freq[token] += 1
                                     else:
@@ -86,11 +95,13 @@ def frequencies(corpus_path, target):
                                 
                                 for edge in {(x,y) for x in tokens for y in tokens if x < y}:
                                     
+                                    #updates counts for edges
                                     if edge in edge_freq:
                                         edge_freq[edge] += 1
                                     else:
                                         edge_freq[edge] = 1
             
             
+            #if a file is corrupted (can't always be caught with if-else)
             except UnicodeDecodeError:
                 pass
@@ -98,10 +109,13 @@ def frequencies(corpus_path, target):
         
         i += 1
     
+    #final update print
     print('[a] 100%\tNodes: {}\tEdges: {}.'.format(len(node_freq), len(edge_freq)), target)
+    
     return node_freq, edge_freq
 
 
+#builds a graph from the frequency dictionaries
 def build_graph(node_freq, edge_freq):
     
     min_node_freq = config.min_node_freq
@@ -110,11 +124,13 @@ def build_graph(node_freq, edge_freq):
     
     G = nx.Graph()
     
+    #node : node frequency
     for key, value in node_freq.items():
         
         if value >= min_node_freq:
             G.add_node(key)
     
+    #edge : edge frequency
     for key, value in edge_freq.items():
         
         if value < min_edge_freq:
@@ -130,33 +146,37 @@ def build_graph(node_freq, edge_freq):
 
     return G
 
 
+#Identifies senses by choosing nodes with high degrees
 def root_hubs(graph, edge_freq, min_neighbors=4, theshold=0.8):
     
     min_neighbors = config.min_neighbors
     threshold = config.threshold
     
     G = deepcopy(graph)
-    V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True) # -1 to sort descending (...3 -> 2 -> 1...)
-    H = list()
+    V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True) # sorts nodes by degree, descending
+    H = list() #output list
     
     while V:
         
-        v = V[0]
+        v = V[0] #best hub candidate
         
         if G.degree[v] >= min_neighbors:
             
-            mfn = sorted(G.adj[v], key=lambda key: edge_freq[v,key] if v < key else edge_freq[key, v], reverse=True)[:min_neighbors] #mfn: most frequent neighbors
+            mfn = sorted(G.adj[v], key=lambda key: edge_freq[v,key] if v < key else edge_freq[key, v], reverse=True)[:min_neighbors] #most frequent neighbors
             
-            if np.mean([G.edges[v,n]['weight'] for n in mfn]) < theshold:
+            if np.mean([G.edges[v,n]['weight'] for n in mfn]) < theshold: #if the mean weight of the most frequent neighbors is under the threshold
                 
                 H.append(v)
             
+            #removes the neighbors of the new hub from the hub candidates
             for nbr in deepcopy(G).adj[v]:
                 G.remove_node(nbr)
             
+            #removes the hub candidate itself
             G.remove_node(v)
             
+            #reorders the remaining hub candidates after the deletions
             V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True)
         
         else:
@@ -170,7 +190,7 @@ def root_hubs(graph, edge_freq, min_neighbors=4, theshold=0.8):
 def components(graph, hubs, target):
     
     G = deepcopy(graph)
-    H = hubs
+    H = hubs #root hubs
     t = target
     
     #G.add_node(t)
@@ -179,6 +199,7 @@ def components(graph, hubs, target):
     
     T = nx.minimum_spanning_tree(G)
     
+    #removes singletons
     for node in deepcopy(T).nodes:
         if len(T.adj[node]) == 0:
             T.remove_node(node)
@@ -186,17 +207,22 @@ def components(graph, hubs, target):
     
     return T
 
 
+#Calculates the score for a given path in a minimum spanning tree
 def score(graph, from_node, to_node):
     
+    #only if a path exists in the tree
     if nx.has_path(graph, from_node, to_node):
         
+        #finds the weighted shortest path, i.e. the path with the lowest total weight
         path = nx.shortest_path(graph, from_node, to_node, 'weight')
         total_weight = 0
         
+        #sums the weights of every edge along the path
        for i in range(1, len(path)):
             sub_from, sub_to = path[i-1], path[i]
             total_weight += graph[sub_from][sub_to]['weight']
         
+        #the longer the path, the lower the score
         return 1/(1+total_weight)
     else:
@@ -204,47 +230,52 @@ def score(graph, from_node, to_node):
         return 0
 
 
+# word sense disambiguation: matches each context to an induced sense
 def disambiguate(mst, hubs, contexts, target=""):
     
     target = target.replace('_', ' ')
     
-    T = mst
-    H = hubs
-    C = [c.lower().strip().replace(target, '') for c in contexts]
+    T = mst #minimum spanning tree
+    H = hubs #root hubs
+    C = [c.lower().strip().replace(target, '') for c in contexts] #cleaned up contexts
     
-    score_dict = dict()
-    result = list()
+    score_dict = dict() #memoisation for scores
+    result = list() #output of the function
     
     for c in C:
         
-        idx = C.index(c) + 1
+        idx = C.index(c) + 1 #1-based index of the context in the list
         
         #if no sense is found for a target word, we should assume that there only is one sense
-        if len(H) == 0:
+        if len(H) == 0:
            
-            result.append((1, idx))
+            result.append((1, idx, 0))
        
         else:
            
-            doc = nlp(c)
-            texts = [tok.text for tok in doc]
+            doc = nlp(c) #parsed context
+            texts = [tok.text for tok in doc] #tokens
            
             scores = np.zeros(len(H)) #initialise with zeros for every sense
            
             for text in texts:
                
-                if text in T.nodes:
+                if text in T.nodes: #if the word wasn't filtered out
                    
-                    new_scores = list()
+                    new_scores = list() #scores to be added to the total scores
 
-                    for h in H:
-                        if (text, h) in score_dict:
+                    for h in H: #for each hub
+                        
+                        if (text, h) in score_dict: #memoisation
+                            
                             new_scores.append(score_dict[(text,h)])
+                        
                         else:
+                            
                             new_score = score(T, text, h)
                             new_scores.append(new_score)
-                            score_dict[(text,h)] = new_scores
+                            score_dict[(text,h)] = new_score #memoisation
                     
-                    scores = np.add(scores, new_scores)
+                    scores = scores + np.array(new_scores)
                 
                 else:
@@ -257,22 +288,32 @@ def disambiguate(mst, hubs, contexts, target=""):
             
             else:
                 
-                result.append((np.argmax(scores)+1, idx))
+                #assigns the sense with the highest score to the context
+                max_score = np.max(scores)
+                argmax_score = np.argmax(scores)
+                
+                #clusters begin at 1
+                result.append((argmax_score + 1, idx))
     
     return result
 
 
+# main function: calls the main steps of word sense induction
 def WSI(topic_id, topic_name, results):
     
+    #buffer for useful information
     out_buffer = '\n'
     
+    #paths for input (corpus) and output (directory)
     corpus_path = config.corpus
     output_path = config.output
     
+    #removes trailing newlines
     old_target = topic_name.strip() #original target
     out_buffer += ("[A] Word sense induction for '"+old_target+"':\n")
     
-    if old_target[:4] == 'the_' and old_target.count('_') >= 2: #hard coded 'the'-protection
+    #in topics of three or more words, the leading 'the' can generally be removed without changing the sense
+    if old_target[:4] == 'the_' and old_target.count('_') >= 2:
         
         target = old_target[4:]
     
@@ -280,37 +321,46 @@ def WSI(topic_id, topic_name, results):
     
         target = old_target
     
+    #writes the header line for the output file
     f = open(output_path+target+'.absinth', 'w')
     f.write('subTopicID\tresultID\n')
     
+    #counts occurrences of single words as well as co-occurrences and saves them in dictionaries
     print('[a]', 'Counting nodes and edges.', old_target)
     node_freq, edge_freq = frequencies(corpus_path, target)
-    out_buffer += '[A] Nodes: {}\tEdges:{}\n'.format(str(len(node_freq)), str(len(edge_freq)))
+    out_buffer += '[A] Nodes: {}\tEdges: {}\n'.format(str(len(node_freq)), str(len(edge_freq)))
     
+    #builds a graph from these dictionaries and applies multiple filters
     print('[a]', 'Building graph.', old_target)
     G = build_graph(node_freq, edge_freq)
     
+    #finds root hubs (senses) within the graph and applies further filters to them
     print('[a]', 'Collecting root hubs.', old_target)
     H = root_hubs(G, edge_freq)
     out_buffer += '[A] Root hubs:\n'
     
-    i = 1
+    #adds the sense inventory to the buffer with some of the most frequent neighbors for context
+    i = 1 #sense index
     for h in H:
         
         mfn = sorted(G.adj[h], key=lambda x: edge_freq[h,x] if h < x else edge_freq[x, h], reverse=True)[:6]
         out_buffer += (' {}. {}: {}\n'.format(i, h, mfn))
 
         i += 1
     
+    #performs the minimum_spanning_tree algorithm on the graph
     print('[a]', 'Building minimum spanning tree.', old_target)
     T = components(G, H, target)
     
+    #matches senses to clusters
     print('[a]', 'Disambiguating results.', old_target)
     D = disambiguate(T, H, results[topic_id], target)
     out_buffer += ('[A] Mapping: '+ str(D) + '\n')
     
+    #prints the buffer
     print('[a]', 'Writing to file.', old_target)
     print(out_buffer)
     
+    #writes the clustering to the output file
     for d in D:
         f.write(topic_id+'.'+str(d[0])+'\t'+topic_id+'.'+str(d[1])+'\n')
@@ -320,8 +370,13 @@ def WSI(topic_id, topic_name, results):
 
 if __name__ == '__main__':
     
-    data_path = config.dataset
+    # if absinth.py is run in a test environment
+    if '-t' in sys.argv:
+        data_path = config.test
+    else:
+        data_path = config.dataset
     
+    # results.txt includes the queries for a given target word
     results = dict()
     
     with open(data_path+'results.txt', 'r') as results_file:
@@ -329,14 +384,15 @@ if __name__ == '__main__':
         
         for line in results_file.readlines()[1:]:
             
             l = line.split('\t')
-            id1, _ = l[0].split('.')
+            id1, _ = l[0].split('.') #the second part of the id is ignored, as it is identical to the list index
             
             if id1 not in results:
                 results[id1]=list()
             
-            results[id1].append(" ".join(l[2:]))
-            
+            results[id1].append(" ".join(l[2:])) #joins title and snippet; the URL is ignored
+    
+    # topics.txt is a list of target words
     topics = dict()
     
     with open(data_path+'topics.txt', 'r') as topics_file:
@@ -346,7 +402,10 @@ if __name__ == '__main__':
             l = line.split('\t')
             topics[l[0]] = l[1]
     
+    # multiprocessing
     with Pool(4) as pool:
+        # calls WSI() for four topics at a time
         pool.starmap(WSI, [(key, value, results) for key,value in topics.items()])
+    
     #for key, value in topics.items():
     #    WSI(key, value, results)
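
Reviewer note (not part of the patch): the path scoring that the new comments describe can be checked in isolation. The snippet below is a minimal, self-contained sketch; it assumes networkx is installed, and the toy graph, node names and weights are invented purely for illustration. It mirrors what score() in this diff computes: 1 / (1 + total weight of the weighted shortest path), and 0 if no path exists in the spanning tree.

# minimal sketch of the path scoring used by score(); toy data is invented
import networkx as nx

def score(graph, from_node, to_node):
    # 1 / (1 + total weight of the weighted shortest path), 0 if unreachable
    if nx.has_path(graph, from_node, to_node):
        path = nx.shortest_path(graph, from_node, to_node, weight='weight')
        total_weight = sum(graph[a][b]['weight'] for a, b in zip(path, path[1:]))
        return 1 / (1 + total_weight)
    return 0

G = nx.Graph()
G.add_edge('drink', 'glass', weight=0.2)
G.add_edge('glass', 'bottle', weight=0.4)
G.add_edge('wormwood', 'plant', weight=0.3)

T = nx.minimum_spanning_tree(G)     # the diff scores paths on a spanning tree
print(score(T, 'drink', 'bottle'))  # connected: 1 / (1 + 0.6) = 0.625
print(score(T, 'drink', 'plant'))   # no path between the components -> 0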