Commit 2de73dc7 authored by Victor Zimmermann

Added multiprocessing.

parent 2a14d364
@@ -7,6 +7,7 @@ from nltk.corpus import stopwords
 import numpy as np # for calculations
 import config
 import spacy # for nlp
+from multiprocessing import Pool
 nlp = spacy.load('en') # standard english nlp
@@ -22,13 +23,13 @@ def frequencies(corpus_path, target):
     node_freq = dict()
     edge_freq = dict()
-    files = [corpus_path+'/'+f for f in os.listdir(corpus_path)]
+    files = [corpus_path + f for f in os.listdir(corpus_path)]
     s_target = target.replace('_', ' ') #target word with spaces
     i = 0
     for f in files:
-        if i % int(len(files)/23) == 0:
+        if i % int(len(files)/10) == 0:
             file_ratio = i/len(files[:])
             max_node_ratio = len(node_freq)/max_nodes
@@ -36,7 +37,9 @@ def frequencies(corpus_path, target):
             ratios = [file_ratio, max_node_ratio, max_edge_ratio]
-            print(' ~{}%\tNodes: {}\tEdges: {}.'.format(int((max(ratios))*100), len(node_freq), len(edge_freq)))
+            percentage = int((max(ratios))*100)
+            print('[a] ~{:02d}%\tNodes: {}\tEdges: {}.'.format(percentage, len(node_freq), len(edge_freq)), target)
             if len(node_freq) > max_nodes:
                 return node_freq, edge_freq
@@ -95,7 +98,7 @@ def frequencies(corpus_path, target):
         i += 1
-    print(' 100%\tNodes: {}\tEdges: {}.'.format(len(node_freq), len(edge_freq)))
+    print('[a] 100%\tNodes: {}\tEdges: {}.'.format(len(node_freq), len(edge_freq)), target)
     return node_freq, edge_freq
@@ -201,11 +204,12 @@ def score(graph, from_node, to_node):
         return 0
-def disambiguate(mst, hubs, contexts):
+def disambiguate(mst, hubs, contexts, target=""):
+    target = target.replace('_', ' ')
     T = mst
     H = hubs
-    C = [c.lower().strip() for c in contexts]
+    C = [c.lower().strip().replace(target, '') for c in contexts]
     score_dict = dict()
     result = list()
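For intuition on the new context cleaning in disambiguate: the function now receives the target and strips it from every context string before scoring, presumably so the target word itself does not dominate the sense assignment. A small illustrative REPL session (the example strings below are made up, not taken from the dataset):

>>> target = 'new_york'.replace('_', ' ')
>>> 'New York is a large city'.lower().strip().replace(target, '')
' is a large city'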
@@ -258,11 +262,65 @@ def disambiguate(mst, hubs, contexts):
     return result
-if __name__ == '__main__':
+def WSI(topic_id, topic_name, results):
+    out_buffer = '\n'
     corpus_path = config.corpus
     data_path = config.dataset
     output_path = config.output
+    old_target = topic_name.strip() #original target
+    out_buffer += ("[A] Word sense induction for '"+old_target+"':\n")
+    if old_target[:4] == 'the_' and old_target.count('_') >= 2: #hard coded 'the'-protection
+        target = old_target[4:]
+    else:
+        target = old_target
+    f = open(output_path+target+'.absinth', 'w')
+    f.write('subTopicID\tresultID\n')
+    print('[a]', 'Counting nodes and edges.', old_target)
+    node_freq, edge_freq = frequencies(corpus_path, target)
+    out_buffer += '[A] Nodes: {}\tEdges:{}\n'.format(str(len(node_freq)), str(len(edge_freq)))
+    print('[a]', 'Building graph.', old_target)
+    G = build_graph(node_freq, edge_freq)
+    print('[a]', 'Collecting root hubs.', old_target)
+    H = root_hubs(G, edge_freq)
+    out_buffer += '[A] Root hubs:\n'
+    i = 1
+    for h in H:
+        mfn = sorted(G.adj[h], key=lambda x: edge_freq[h,x] if h < x else edge_freq[x, h], reverse=True)[:6]
+        out_buffer += ('  {}. {}: {}\n'.format(i, h, mfn))
+        i += 1
+    print('[a]', 'Building minimum spanning tree.', old_target)
+    T = components(G, H, target)
+    print('[a]', 'Disambiguating results.', old_target)
+    D = disambiguate(T, H, results[topic_id], target)
+    out_buffer += ('[A] Mapping: '+ str(D) + '\n')
+    print('[a]', 'Writing to file.', old_target)
+    print(out_buffer)
+    for d in D:
+        f.write(topic_id+'.'+str(d[0])+'\t'+topic_id+'.'+str(d[1])+'\n')
+    f.close()
+if __name__ == '__main__':
+    data_path = config.dataset
     results = dict()
@@ -287,48 +345,8 @@ if __name__ == '__main__':
         l = line.split('\t')
         topics[l[0]] = l[1]
-    for key, value in topics.items():
-        o_target = value.strip() #original target
-        print("[A] Processing '"+o_target+"'.\n")
-        if o_target[:4] == 'the_' and o_target.count('_') >= 2: #hard coded 'the'-protection
-            target = o_target[4:]
-        else:
-            target = o_target
-        f = open(output_path+target+'.absinth', 'w')
-        f.write('subTopicID\tresultID\n')
-        print('[A] Counting Tokens...')
-        node_freq, edge_freq = frequencies(corpus_path, target)
-        print('\n[A] Building Graph.\n')
-        G = build_graph(node_freq, edge_freq)
-        print('[A] Collecting Root Hubs...')
-        H = root_hubs(G, edge_freq)
-        for h in H:
-            mfn = sorted(G.adj[h], key=lambda key: edge_freq[h,key] if h < key else edge_freq[key, h], reverse=True)[:6]
-            print('  {}: {}'.format(h, mfn))
-        print('\n[A] Building Minimum Spanning Tree.\n')
-        T = components(G, H, target)
-        print('[A] Disambiguating Results...')
-        D = disambiguate(T, H, results[key])
-        print('  Mapping:', D, '\n')
-        print('[A] Writing to file '+o_target+'.absinth.\n\n')
-        for d in D:
-            f.write(key+'.'+str(d[0])+'\t'+key+'.'+str(d[1])+'\n')
-        f.close()
+    with Pool(4) as pool:
+        pool.starmap(WSI, [(key, value, results) for key,value in topics.items()])
+    #for key, value in topics.items():
+    #    WSI(key, value, results)
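A note on the pattern introduced above: multiprocessing.Pool.starmap fans a function out over an iterable of argument tuples, one worker task per tuple, which is how the new __main__ block now dispatches one WSI call per topic. The sketch below is only an illustration of that pattern using the standard library; process_topic and the toy topics/results dictionaries are made-up stand-ins, not code from this repository.

from multiprocessing import Pool

def process_topic(topic_id, topic_name, results):
    # stand-in for WSI(topic_id, topic_name, results); must live at module
    # level so worker processes can locate it when the task is unpickled
    return topic_id, topic_name.strip(), len(results.get(topic_id, []))

if __name__ == '__main__':
    topics = {'1': 'dog', '2': 'cat'}                        # toy input
    results = {'1': ['a dog barks'], '2': ['a cat purrs']}
    with Pool(4) as pool:
        # starmap unpacks each tuple into positional arguments of process_topic
        out = pool.starmap(process_topic,
                           [(key, value, results) for key, value in topics.items()])
    print(out)

starmap blocks until all workers have finished and returns their results in input order; keeping the dispatch under the if __name__ == '__main__': guard matters because platforms that spawn rather than fork re-import the module in every worker process.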