Skip to content
Snippets Groups Projects
Commit 2de73dc7 authored by Victor Zimmermann's avatar Victor Zimmermann
Browse files

Added multiprocessing.

parent 2a14d364
No related branches found
No related tags found
No related merge requests found
...@@ -7,6 +7,7 @@ from nltk.corpus import stopwords ...@@ -7,6 +7,7 @@ from nltk.corpus import stopwords
import numpy as np # for calculations import numpy as np # for calculations
import config import config
import spacy # for nlp import spacy # for nlp
from multiprocessing import Pool
nlp = spacy.load('en') # standard english nlp nlp = spacy.load('en') # standard english nlp
...@@ -22,13 +23,13 @@ def frequencies(corpus_path, target): ...@@ -22,13 +23,13 @@ def frequencies(corpus_path, target):
node_freq = dict() node_freq = dict()
edge_freq = dict() edge_freq = dict()
files = [corpus_path+'/'+f for f in os.listdir(corpus_path)] files = [corpus_path + f for f in os.listdir(corpus_path)]
s_target = target.replace('_', ' ') #target word with spaces s_target = target.replace('_', ' ') #target word with spaces
i = 0 i = 0
for f in files: for f in files:
if i % int(len(files)/23) == 0: if i % int(len(files)/10) == 0:
file_ratio = i/len(files[:]) file_ratio = i/len(files[:])
max_node_ratio = len(node_freq)/max_nodes max_node_ratio = len(node_freq)/max_nodes
...@@ -36,7 +37,9 @@ def frequencies(corpus_path, target): ...@@ -36,7 +37,9 @@ def frequencies(corpus_path, target):
ratios = [file_ratio, max_node_ratio, max_edge_ratio] ratios = [file_ratio, max_node_ratio, max_edge_ratio]
print(' ~{}%\tNodes: {}\tEdges: {}.'.format(int((max(ratios))*100), len(node_freq), len(edge_freq))) percentage = int((max(ratios))*100)
print('[a] ~{:02d}%\tNodes: {}\tEdges: {}.'.format(percentage, len(node_freq), len(edge_freq)), target)
if len(node_freq) > max_nodes: if len(node_freq) > max_nodes:
return node_freq, edge_freq return node_freq, edge_freq
...@@ -95,7 +98,7 @@ def frequencies(corpus_path, target): ...@@ -95,7 +98,7 @@ def frequencies(corpus_path, target):
i += 1 i += 1
print(' 100%\tNodes: {}\tEdges: {}.'.format(len(node_freq), len(edge_freq))) print('[a] 100%\tNodes: {}\tEdges: {}.'.format(len(node_freq), len(edge_freq)), target)
return node_freq, edge_freq return node_freq, edge_freq
...@@ -201,11 +204,12 @@ def score(graph, from_node, to_node): ...@@ -201,11 +204,12 @@ def score(graph, from_node, to_node):
return 0 return 0
def disambiguate(mst, hubs, contexts): def disambiguate(mst, hubs, contexts, target=""):
target = target.replace('_', ' ')
T = mst T = mst
H = hubs H = hubs
C = [c.lower().strip() for c in contexts] C = [c.lower().strip().replace(target, '') for c in contexts]
score_dict = dict() score_dict = dict()
result = list() result = list()
...@@ -258,11 +262,65 @@ def disambiguate(mst, hubs, contexts): ...@@ -258,11 +262,65 @@ def disambiguate(mst, hubs, contexts):
return result return result
def WSI(topic_id, topic_name, results):
    """Run word sense induction for one topic and write the mapping to disk.

    Reconstructed from the new side of the diff. Pipeline: count co-occurrence
    frequencies -> build graph -> find root hubs -> minimum spanning tree ->
    disambiguate result snippets -> write '<target>.absinth' mapping file.

    Args:
        topic_id: string id of the topic; used as prefix in the output rows
            and as key into `results`.
        topic_name: raw topic string (underscores for spaces, may carry a
            leading 'the_').
        results: dict mapping topic ids to their result-snippet lists
            (passed through to `disambiguate`).

    Side effects: writes `config.output + target + '.absinth'` and prints
    progress to stdout. Returns None.
    """
    out_buffer = '\n'

    corpus_path = config.corpus
    output_path = config.output

    old_target = topic_name.strip() #original target
    out_buffer += ("[A] Word sense induction for '"+old_target+"':\n")

    # Hard-coded 'the'-protection: strip a leading 'the_' only when at least
    # one more underscore remains, so single-word 'the_X' targets survive.
    if old_target[:4] == 'the_' and old_target.count('_') >= 2:
        target = old_target[4:]
    else:
        target = old_target

    # 'with' guarantees the handle is closed (and buffers flushed) even if
    # any pipeline stage below raises — the original open()/close() pair
    # leaked the handle on error.
    with open(output_path+target+'.absinth', 'w') as f:
        f.write('subTopicID\tresultID\n')

        print('[a]', 'Counting nodes and edges.', old_target)
        node_freq, edge_freq = frequencies(corpus_path, target)
        out_buffer += '[A] Nodes: {}\tEdges:{}\n'.format(str(len(node_freq)), str(len(edge_freq)))

        print('[a]', 'Building graph.', old_target)
        G = build_graph(node_freq, edge_freq)

        print('[a]', 'Collecting root hubs.', old_target)
        H = root_hubs(G, edge_freq)
        out_buffer += '[A] Root hubs:\n'

        # For each hub report its six most frequent neighbours; edge_freq is
        # keyed with the lexicographically smaller node first.
        for i, h in enumerate(H, start=1):
            mfn = sorted(G.adj[h], key=lambda x: edge_freq[h,x] if h < x else edge_freq[x, h], reverse=True)[:6]
            out_buffer += (' {}. {}: {}\n'.format(i, h, mfn))

        print('[a]', 'Building minimum spanning tree.', old_target)
        T = components(G, H, target)

        print('[a]', 'Disambiguating results.', old_target)
        D = disambiguate(T, H, results[topic_id], target)
        out_buffer += ('[A] Mapping: '+ str(D) + '\n')

        print('[a]', 'Writing to file.', old_target)
        print(out_buffer)

        for d in D:
            f.write(topic_id+'.'+str(d[0])+'\t'+topic_id+'.'+str(d[1])+'\n')
if __name__ == '__main__':
data_path = config.dataset
results = dict() results = dict()
...@@ -287,48 +345,8 @@ if __name__ == '__main__': ...@@ -287,48 +345,8 @@ if __name__ == '__main__':
l = line.split('\t') l = line.split('\t')
topics[l[0]] = l[1] topics[l[0]] = l[1]
for key, value in topics.items():
o_target = value.strip() #original target
print("[A] Processing '"+o_target+"'.\n")
if o_target[:4] == 'the_' and o_target.count('_') >= 2: #hard coded 'the'-protection
target = o_target[4:]
else:
target = o_target
f = open(output_path+target+'.absinth', 'w')
f.write('subTopicID\tresultID\n')
print('[A] Counting Tokens...')
node_freq, edge_freq = frequencies(corpus_path, target)
print('\n[A] Building Graph.\n')
G = build_graph(node_freq, edge_freq)
print('[A] Collecting Root Hubs...')
H = root_hubs(G, edge_freq)
for h in H:
mfn = sorted(G.adj[h], key=lambda key: edge_freq[h,key] if h < key else edge_freq[key, h], reverse=True)[:6]
print(' {}: {}'.format(h, mfn))
print('\n[A] Building Minimum Spanning Tree.\n')
T = components(G, H, target)
print('[A] Disambiguating Results...') with Pool(4) as pool:
D = disambiguate(T, H, results[key]) pool.starmap(WSI, [(key, value, results) for key,value in topics.items()])
print(' Mapping:', D, '\n') #for key, value in topics.items():
# WSI(key, value, results)
print('[A] Writing to file '+o_target+'.absinth.\n\n')
for d in D:
f.write(key+'.'+str(d[0])+'\t'+key+'.'+str(d[1])+'\n')
f.close()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment