diff --git a/src/absinth.py b/src/absinth.py index 75734d73c0d13d4e13f4be3a377ea94ecac7b281..12484a8fd3a7aa7fdbaeed69887ca9543c00e72e 100644 --- a/src/absinth.py +++ b/src/absinth.py @@ -7,6 +7,7 @@ from nltk.corpus import stopwords import numpy as np # for calculations import config import spacy # for nlp +from multiprocessing import Pool nlp = spacy.load('en') # standard english nlp @@ -22,13 +23,13 @@ def frequencies(corpus_path, target): node_freq = dict() edge_freq = dict() - files = [corpus_path+'/'+f for f in os.listdir(corpus_path)] + files = [corpus_path + f for f in os.listdir(corpus_path)] s_target = target.replace('_', ' ') #target word with spaces i = 0 for f in files: - if i % int(len(files)/23) == 0: + if i % int(len(files)/10) == 0: file_ratio = i/len(files[:]) max_node_ratio = len(node_freq)/max_nodes @@ -36,7 +37,9 @@ def frequencies(corpus_path, target): ratios = [file_ratio, max_node_ratio, max_edge_ratio] - print(' ~{}%\tNodes: {}\tEdges: {}.'.format(int((max(ratios))*100), len(node_freq), len(edge_freq))) + percentage = int((max(ratios))*100) + + print('[a] ~{:02d}%\tNodes: {}\tEdges: {}.'.format(percentage, len(node_freq), len(edge_freq)), target) if len(node_freq) > max_nodes: return node_freq, edge_freq @@ -95,7 +98,7 @@ def frequencies(corpus_path, target): i += 1 - print(' 100%\tNodes: {}\tEdges: {}.'.format(len(node_freq), len(edge_freq))) + print('[a] 100%\tNodes: {}\tEdges: {}.'.format(len(node_freq), len(edge_freq)), target) return node_freq, edge_freq @@ -201,11 +204,12 @@ def score(graph, from_node, to_node): return 0 -def disambiguate(mst, hubs, contexts): +def disambiguate(mst, hubs, contexts, target=""): + target = target.replace('_', ' ') T = mst H = hubs - C = [c.lower().strip() for c in contexts] + C = [c.lower().strip().replace(target, '') for c in contexts] score_dict = dict() result = list() @@ -258,11 +262,65 @@ def disambiguate(mst, hubs, contexts): return result -if __name__ == '__main__': +def WSI(topic_id, topic_name, results): + + out_buffer = '\n' corpus_path = config.corpus - data_path = config.dataset output_path = config.output + + old_target = topic_name.strip() #original target + out_buffer += ("[A] Word sense induction for '"+old_target+"':\n") + + if old_target[:4] == 'the_' and old_target.count('_') >= 2: #hard coded 'the'-protection + + target = old_target[4:] + + else: + + target = old_target + + f = open(output_path+target+'.absinth', 'w') + f.write('subTopicID\tresultID\n') + + print('[a]', 'Counting nodes and edges.', old_target) + node_freq, edge_freq = frequencies(corpus_path, target) + out_buffer += '[A] Nodes: {}\tEdges:{}\n'.format(str(len(node_freq)), str(len(edge_freq))) + + print('[a]', 'Building graph.', old_target) + G = build_graph(node_freq, edge_freq) + + print('[a]', 'Collecting root hubs.', old_target) + H = root_hubs(G, edge_freq) + out_buffer += '[A] Root hubs:\n' + + i = 1 + for h in H: + + mfn = sorted(G.adj[h], key=lambda x: edge_freq[h,x] if h < x else edge_freq[x, h], reverse=True)[:6] + out_buffer += (' {}. {}: {}\n'.format(i, h, mfn)) + i += 1 + + print('[a]', 'Building minimum spanning tree.', old_target) + T = components(G, H, target) + + print('[a]', 'Disambiguating results.', old_target) + D = disambiguate(T, H, results[topic_id], target) + out_buffer += ('[A] Mapping: '+ str(D) + '\n') + + print('[a]', 'Writing to file.', old_target) + print(out_buffer) + + for d in D: + + f.write(topic_id+'.'+str(d[0])+'\t'+topic_id+'.'+str(d[1])+'\n') + + f.close() + + +if __name__ == '__main__': + + data_path = config.dataset results = dict() @@ -287,48 +345,8 @@ if __name__ == '__main__': l = line.split('\t') topics[l[0]] = l[1] - - for key, value in topics.items(): - - o_target = value.strip() #original target - print("[A] Processing '"+o_target+"'.\n") - - if o_target[:4] == 'the_' and o_target.count('_') >= 2: #hard coded 'the'-protection - - target = o_target[4:] - - else: - - target = o_target - - f = open(output_path+target+'.absinth', 'w') - f.write('subTopicID\tresultID\n') - - print('[A] Counting Tokens...') - node_freq, edge_freq = frequencies(corpus_path, target) - - print('\n[A] Building Graph.\n') - G = build_graph(node_freq, edge_freq) - - print('[A] Collecting Root Hubs...') - H = root_hubs(G, edge_freq) - - for h in H: - - mfn = sorted(G.adj[h], key=lambda key: edge_freq[h,key] if h < key else edge_freq[key, h], reverse=True)[:6] - print(' {}: {}'.format(h, mfn)) - - print('\n[A] Building Minimum Spanning Tree.\n') - T = components(G, H, target) - print('[A] Disambiguating Results...') - D = disambiguate(T, H, results[key]) - print(' Mapping:', D, '\n') - - print('[A] Writing to file '+o_target+'.absinth.\n\n') - - for d in D: - - f.write(key+'.'+str(d[0])+'\t'+key+'.'+str(d[1])+'\n') - - f.close() + with Pool(4) as pool: + pool.starmap(WSI, [(key, value, results) for key,value in topics.items()]) + #for key, value in topics.items(): + # WSI(key, value, results)