diff --git a/src/absinth.py b/src/absinth.py
index 144cb203c83fe9312b104c16cf6a57c867cf7b1c..00f49f78582abd3aaf12d73ff78d29742a823eff 100644
--- a/src/absinth.py
+++ b/src/absinth.py
@@ -8,13 +8,15 @@ import numpy as np # for calculations
 import config
 import spacy # for nlp
 from multiprocessing import Pool
+import random
 
 nlp = spacy.load('en') # standard english nlp
 
-
 #counts occurences of nodes and cooccurrences
 def frequencies(corpus_path, target):
 
+    random.seed(1)
+
     stop_words = set(stopwords.words('english') + config.stop_words)
     allowed_tags = config.allowed_tags
     min_context_size = config.min_context_size
@@ -24,13 +26,15 @@ def frequencies(corpus_path, target):
     node_freq = dict() #counts (potential) nodes
     edge_freq = dict() #counts (potential) edges
 
-    files = [corpus_path + f for f in os.listdir(corpus_path)] #file names of corpus files
     s_target = target.replace('_', ' ') #target word with spaces
+    files = [corpus_path + f for f in os.listdir(corpus_path)] #file names of corpus files
+
+    random.shuffle(files)
 
     i = 0 #for update print statements
     for f in files:
 
-        if i % int(len(files)/10) == 0: #prints update after every 10th of the corpus is parsed
+        if i % int(len(files)/11) == 0: #prints update after every 11th of the corpus is parsed
 
             file_ratio = i/len(files[:])
             max_node_ratio = len(node_freq)/max_nodes
@@ -41,7 +45,7 @@ def frequencies(corpus_path, target):
             #uses the ratio closest to 100%.
             percentage = int((max(ratios))*100)
 
-            print('[a] ~{:02d}%\tNodes: {}\tEdges: {}.'.format(percentage, len(node_freq), len(edge_freq)), target)
+            print('[a] ~{:02d}%\tNodes: {}\tEdges: {}.'.format(percentage, len(node_freq), len(edge_freq))+'\t('+target+')')
 
         #checks maximum node values
         if len(node_freq) > max_nodes:
@@ -239,63 +243,61 @@ def disambiguate(mst, hubs, contexts, target=""):
     C = [c.lower().strip().replace(target, '') for c in contexts] #cleaned up contexts
 
     score_dict = dict() #memoisation for scores
-    result = list() #output of function
-
+    mapping_dict = {topic:[] for topic in range(1,len(H)+1)} #output of function
+
+    #if no sense is found for a target word, we should assume that there only is one sense
+    if len(H) == 0:
+
+        return {0:[i for i in range(1, len(C)+1)]}
+
     for c in C:
 
         idx = C.index(c) + 1 #index based on position in list
+
+        doc = nlp(c) #parsed context
+        texts = [tok.text for tok in doc] #tokens
 
-        #if no sense is found for a target word, we should assume that there only is one sense
-        if len(H) == 0:
-
-            result.append((1, idx, 0))
+        scores = np.zeros(len(H)) #initialise with zeros for every sense
 
-        else:
-
-            doc = nlp(c) #parsed context
-            texts = [tok.text for tok in doc] #tokens
+        for text in texts:
 
-            scores = np.zeros(len(H)) #initialise with zeros for every sense
-
-            for text in texts:
+            if text in T.nodes: #if word wasn't filtered out
 
-                if text in T.nodes: #if word wasn't filtered out
-
-                    new_scores = list() #scores to be added to total scores
+                new_scores = list() #scores to be added to total scores
+
+                for h in H: #for each hub
 
-                    for h in H: #for each hub
-
-                        if (text, h) in score_dict: #memoisation
-
-                            new_scores.append(score_dict[(text,h)])
+                    if (text, h) in score_dict: #memoisation
 
-                        else:
-
-                            new_score = score(T, text, h)
-                            new_scores.append(new_score)
-                            score_dict[(text,h)] = new_score #memoisation
+                        new_scores.append(score_dict[(text,h)])
+
+                    else:
 
-                    scores = scores + np.array(new_scores)
-
-                else:
-
-                    pass
+                        new_score = score(T, text, h)
+                        new_scores.append(new_score)
+                        score_dict[(text,h)] = new_score #memoisation
+
+                scores = scores + np.array(new_scores)
 
-            #if the disambiguator could not detect a sense, it should return a singleton, ie. nothing
-            if np.max(scores) == 0:
+            else:
 
                 pass
+
+        #if the disambiguator could not detect a sense, it should return a singleton, ie. nothing
+        if np.max(scores) == 0:
 
-            else:
-
-                #applies sense with the highest score to context
-                max_score = np.max(scores)
-                argmax_score = np.argmax(scores)
+            pass
+
+        else:
 
-                #clusters begin at 1
-                result.append((argmax_score + 1, idx))
+            #applies sense with the highest score to context
+            max_score = np.max(scores)
+            argmax_score = np.argmax(scores)
+
+            #clusters begin at 1
+            mapping_dict[argmax_score + 1].append(idx)
 
-    return result
+    return mapping_dict
 
 
 # our main function, here the main stepps for word sense induction are called
@@ -326,16 +328,16 @@ def WSI(topic_id, topic_name, results):
         f.write('subTopicID\tresultID\n')
 
     #counts occurences of single words, as well as cooccurrences, saves it in dictionary
-    print('[a]', 'Counting nodes and edges.', old_target)
+    print('[a]', 'Counting nodes and edges.\t('+old_target+')')
     node_freq, edge_freq = frequencies(corpus_path, target)
     out_buffer += '[A] Nodes: {}\tEdges: {}\n'.format(str(len(node_freq)), str(len(edge_freq)))
 
     #builds graph from these dictionaries, also applies multiple filters
-    print('[a]', 'Building graph.', old_target)
+    print('[a]', 'Building graph.\t('+old_target+')')
     G = build_graph(node_freq, edge_freq)
 
     #finds root hubs (senses) within the graph + more filters for these
-    print('[a]', 'Collecting root hubs.', old_target)
+    print('[a]', 'Collecting root hubs.\t('+old_target+')')
     H = root_hubs(G, edge_freq)
     out_buffer += '[A] Root hubs:\n'
 
@@ -344,26 +346,29 @@ def WSI(topic_id, topic_name, results):
     for h in H:
 
         mfn = sorted(G.adj[h], key=lambda x: edge_freq[h,x] if h < x else edge_freq[x, h], reverse=True)[:6]
-        out_buffer += (' {}. {}: {}\n'.format(i, h, mfn))
+        out_buffer += (' {}. {}: {}\n'.format(i, h, ', '.join(mfn)))
         i += 1
 
     #performs minimum_spanning_tree algorithm on graph
-    print('[a]', 'Building minimum spanning tree.', old_target)
+    print('[a]', 'Building minimum spanning tree.\t('+old_target+')')
     T = components(G, H, target)
 
     #matches senses to clusters
-    print('[a]', 'Disambiguating results.', old_target)
+    print('[a]', 'Disambiguating results.\t('+old_target+')')
     D = disambiguate(T, H, results[topic_id], target)
-    out_buffer += ('[A] Mapping: '+ str(D) + '\n')
+
+    out_buffer += ('[A] Mapping: \n')
+    for cluster, result_ids in D.items():
+        out_buffer += (' {}. : {}\n'.format(cluster, ', '.join([str(r) for r in result_ids])))
 
     #prints buffer
-    print('[a]', 'Writing to file.', old_target)
+    print('[a]', 'Writing to file.\t('+old_target+')')
     print(out_buffer)
 
     #writes clustering to file
-    for d in D:
-
-        f.write(topic_id+'.'+str(d[0])+'\t'+topic_id+'.'+str(d[1])+'\n')
+    for cluster, result_ids in D.items():
+        for result in result_ids:
+            f.write(topic_id+'.'+str(cluster)+'\t'+topic_id+'.'+str(result)+'\n')
 
     f.close()
 
@@ -394,13 +399,15 @@ if __name__ == '__main__':
 
     # topics.txt is a list of target words
     topics = dict()
+    processed_topics = [f.replace('.absinth', '') for f in os.listdir(config.output)]
 
     with open(data_path+'topics.txt', 'r') as topics_file:
 
        for line in topics_file.readlines()[1:]:
 
            l = line.split('\t')
-            topics[l[0]] = l[1]
+            if l[1].strip() not in processed_topics:
+                topics[l[0]] = l[1]
 
    # multiprocessing
    with Pool(4) as pool:
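
Note on the reworked disambiguate() contract: instead of a list of (cluster, result) tuples it now returns a dict mapping cluster IDs (starting at 1, with {0: [...]} as the single-sense fallback when no root hubs are found) to lists of result indices. Below is a minimal sketch of how such a mapping is flattened into the subTopicID/resultID rows that WSI() writes; write_clustering and the toy mapping are illustrative stand-ins, not functions or data from absinth.py.

# Sketch only: 'write_clustering' and 'toy_mapping' are hypothetical helpers used
# to illustrate the new return format of disambiguate().
def write_clustering(topic_id, mapping, out_path):
    # writes one "<topic>.<cluster>\t<topic>.<result>" row per result,
    # mirroring the writing loop in WSI()
    with open(out_path, 'w') as f:
        f.write('subTopicID\tresultID\n')
        for cluster, result_ids in mapping.items():
            for result in result_ids:
                f.write(topic_id + '.' + str(cluster) + '\t' + topic_id + '.' + str(result) + '\n')

if __name__ == '__main__':
    toy_mapping = {1: [2, 5, 9], 2: [1, 3]}  # hypothetical disambiguate() output
    write_clustering('42', toy_mapping, '42.absinth')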