Added multiprocessing.

2de73dc7 · Victor Zimmermann · 2a14d364 · 2de73dc7
Commit 2de73dc7 authored 7 years ago by Victor Zimmermann
--- a/src/absinth.py
+++ b/src/absinth.py
@@ -7,6 +7,7 @@ from nltk.corpus import stopwords
 import numpy as np # for calculations
 import config
 import spacy # for nlp
+from multiprocessing import Pool

 nlp = spacy.load('en') # standard english nlp

@@ -22,13 +23,13 @@ def frequencies(corpus_path, target):
    node_freq = dict()
    edge_freq = dict()
    
-    files = [corpus_path+'/'+f for f in os.listdir(corpus_path)]
+    files = [corpus_path + f for f in os.listdir(corpus_path)]
    s_target = target.replace('_', ' ') #target word with spaces
    
    i = 0
    for f in files:
        
-        if i % int(len(files)/23) == 0:
+        if i % int(len(files)/10) == 0:
            
            file_ratio = i/len(files[:])
            max_node_ratio = len(node_freq)/max_nodes
@@ -36,7 +37,9 @@ def frequencies(corpus_path, target):
            
            ratios = [file_ratio, max_node_ratio, max_edge_ratio]
            
-            print(' ~{}%\tNodes: {}\tEdges: {}.'.format(int((max(ratios))*100), len(node_freq), len(edge_freq)))
+            percentage = int((max(ratios))*100)
+            
+            print('[a] ~{:02d}%\tNodes: {}\tEdges: {}.'.format(percentage, len(node_freq), len(edge_freq)), target)
        
        if len(node_freq) > max_nodes:
            return node_freq, edge_freq
@@ -95,7 +98,7 @@ def frequencies(corpus_path, target):
        
        i += 1
    
-    print(' 100%\tNodes: {}\tEdges: {}.'.format(len(node_freq), len(edge_freq)))
+    print('[a] 100%\tNodes: {}\tEdges: {}.'.format(len(node_freq), len(edge_freq)), target)
    return node_freq, edge_freq


@@ -201,11 +204,12 @@ def score(graph, from_node, to_node):
        return 0


-def disambiguate(mst, hubs, contexts):
+def disambiguate(mst, hubs, contexts, target=""):
    
+    target = target.replace('_', ' ')
    T = mst
    H = hubs
-    C = [c.lower().strip() for c in contexts]
+    C = [c.lower().strip().replace(target, '') for c in contexts]
    
    score_dict = dict()
    result = list()
@@ -258,11 +262,65 @@ def disambiguate(mst, hubs, contexts):
    return result


-if __name__ == '__main__':
+def WSI(topic_id, topic_name, results):
+    """Run word sense induction for one topic and write its mapping to disk.
+    
+    Counts nodes and edges for the target (frequencies), builds the graph
+    (build_graph), collects root hubs (root_hubs), builds the tree
+    (components) and maps the topic's results onto it (disambiguate).
+    The mapping is written to '<config.output><target>.absinth' as
+    tab-separated 'subTopicID / resultID' lines.
+    
+    Short '[a]' progress messages are printed immediately; the longer '[A]'
+    report is collected in out_buffer and printed in one call at the end.
+    NOTE(review): presumably buffered so that reports of workers started
+    through pool.starmap in the __main__ block do not interleave -- confirm.
+    
+    Args:
+        topic_id: Key into results; also concatenated with '.' as the prefix
+            of every ID written to the output file, so it must be a str.
+        topic_name: Target name; surrounding whitespace is stripped.
+        results: Mapping indexed by topic_id; results[topic_id] is passed to
+            disambiguate as its contexts argument.
+    
+    Returns:
+        None. The function has no return statement; its output is the file
+        written and the text printed.
+    """
+    
+    out_buffer = '\n'
    
    corpus_path = config.corpus
-    data_path = config.dataset
    output_path = config.output
+            
+    old_target = topic_name.strip() #original target
+    out_buffer += ("[A] Word sense induction for '"+old_target+"':\n")
+    
+    # Drop a leading 'the_' only if the name has at least two underscores,
+    # e.g. 'the_foo_bar' -> 'foo_bar', while 'the_foo' is left unchanged.
+    if old_target[:4] == 'the_' and old_target.count('_') >= 2: #hard coded 'the'-protection
+        
+        target = old_target[4:]
+        
+    else:
+        
+        target = old_target
+    
+    # output_path and target are concatenated without a separator.
+    # NOTE(review): f is closed only by f.close() at the end of the function;
+    # if any step in between raises, the file is not closed explicitly.
+    f = open(output_path+target+'.absinth', 'w')
+    f.write('subTopicID\tresultID\n')
+    
+    print('[a]', 'Counting nodes and edges.', old_target)
+    node_freq, edge_freq = frequencies(corpus_path, target)
+    out_buffer += '[A] Nodes: {}\tEdges:{}\n'.format(str(len(node_freq)), str(len(edge_freq)))
+    
+    print('[a]', 'Building graph.', old_target)
+    G = build_graph(node_freq, edge_freq)
+    
+    print('[a]', 'Collecting root hubs.', old_target)
+    H = root_hubs(G, edge_freq)
+    out_buffer += '[A] Root hubs:\n'
+    
+    i = 1
+    for h in H:
+        
+        # edge_freq is looked up with the smaller node name first in the key
+        # tuple; keep the six neighbours of h with the highest edge frequency.
+        mfn = sorted(G.adj[h], key=lambda x: edge_freq[h,x] if h < x else edge_freq[x, h], reverse=True)[:6]
+        out_buffer += (' {}. {}: {}\n'.format(i, h, mfn))
+        i += 1
+    
+    print('[a]', 'Building minimum spanning tree.', old_target)
+    T = components(G, H, target)
+
+    print('[a]', 'Disambiguating results.', old_target)
+    D = disambiguate(T, H, results[topic_id], target)
+    out_buffer += ('[A] Mapping: '+ str(D) + '\n')
+    
+    print('[a]', 'Writing to file.', old_target)
+    print(out_buffer)
+    
+    # Each entry d of D is indexed as d[0] / d[1]; both are written with the
+    # topic_id prefix, matching the 'subTopicID\tresultID' header above.
+    for d in D:
+        
+        f.write(topic_id+'.'+str(d[0])+'\t'+topic_id+'.'+str(d[1])+'\n')
+        
+    f.close()
+    
+
+if __name__ == '__main__':
+    
+    data_path = config.dataset
    
    results = dict()
    
@@ -287,48 +345,8 @@ if __name__ == '__main__':
            
            l = line.split('\t')
            topics[l[0]] = l[1]
-        
-    for key, value in topics.items():
-            
-        o_target = value.strip() #original target
-        print("[A] Processing '"+o_target+"'.\n")
-        
-        if o_target[:4] == 'the_' and o_target.count('_') >= 2: #hard coded 'the'-protection
-            
-            target = o_target[4:]
-            
-        else:
-            
-            target = o_target
-        
-        f = open(output_path+target+'.absinth', 'w')
-        f.write('subTopicID\tresultID\n')
-        
-        print('[A] Counting Tokens...')
-        node_freq, edge_freq = frequencies(corpus_path, target)
-        
-        print('\n[A] Building Graph.\n')
-        G = build_graph(node_freq, edge_freq)
-        
-        print('[A] Collecting Root Hubs...')
-        H = root_hubs(G, edge_freq)
-        
-        for h in H:
-            
-            mfn = sorted(G.adj[h], key=lambda key: edge_freq[h,key] if h < key else edge_freq[key, h], reverse=True)[:6]
-            print(' {}: {}'.format(h, mfn))
-        
-        print('\n[A] Building Minimum Spanning Tree.\n')
-        T = components(G, H, target)
    
-        print('[A] Disambiguating Results...')
-        D = disambiguate(T, H, results[key])
-        print(' Mapping:', D, '\n')
-        
-        print('[A] Writing to file '+o_target+'.absinth.\n\n')
-        
-        for d in D:
-            
-            f.write(key+'.'+str(d[0])+'\t'+key+'.'+str(d[1])+'\n')
-            
-        f.close()
+    with Pool(4) as pool:
+        pool.starmap(WSI, [(key, value, results) for key,value in topics.items()])
+    #for key, value in topics.items():
+    #    WSI(key, value, results)