From b5581eb648aaabce6c53d73462b00389f554fdf7 Mon Sep 17 00:00:00 2001
From: Victor Zimmermann <zimmermann@cl.uni-heidelberg.de>
Date: Mon, 12 Mar 2018 18:08:48 +0100
Subject: [PATCH] Redone output, shuffle corpus, disambiguation output is now a
 dict

---
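disambiguate() now returns a dict that maps each sense number (clusters begin at 1) to the list of context ids assigned to it, and WSI() flattens that mapping into one tab-separated line per context. A minimal sketch of the new shape and the resulting file contents, with hypothetical topic id, mapping values and file name:

    topic_id = '1'                           # hypothetical topic id
    D = {1: [2, 5], 2: [1, 3, 4], 3: []}     # sense number -> context ids, as built in mapping_dict

    with open('example.absinth', 'w') as f:  # illustrative file name; the real path comes from config
        f.write('subTopicID\tresultID\n')
        for cluster, result_ids in D.items():
            for result in result_ids:
                # e.g. "1.2\t1.5": context 5 of topic 1 is assigned to sense 2 of topic 1
                f.write(topic_id+'.'+str(cluster)+'\t'+topic_id+'.'+str(result)+'\n')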
 src/absinth.py | 121 ++++++++++++++++++++++++++-----------------------
 1 file changed, 64 insertions(+), 57 deletions(-)
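frequencies() now seeds the RNG and shuffles the corpus file list, so files are read in a random but reproducible order instead of directory order. A tiny self-contained sketch of that behaviour (the file names are made up):

    import random

    files = ['doc_%d.txt' % i for i in range(5)]  # hypothetical corpus file names
    random.seed(1)                                # same seed as in frequencies()
    random.shuffle(files)                         # in-place shuffle
    print(files)                                  # identical order on every run because of the fixed seed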

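The __main__ block now skips topics whose output file already exists (output files appear to be named <topic_name>.absinth), so an interrupted run can be resumed without redoing finished targets. A minimal sketch with made-up directory contents and topics.txt rows:

    existing = ['dog.absinth']                    # pretend contents of the output directory (config.output)
    processed = [f.replace('.absinth', '') for f in existing]

    topics = {}
    for line in ['1\tdog\n', '2\tcat\n']:         # hypothetical topics.txt rows: id<TAB>name
        l = line.split('\t')
        if l[1].strip() not in processed:         # 'dog' already has output and is skipped
            topics[l[0]] = l[1]
    print(topics)                                 # {'2': 'cat\n'}
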
diff --git a/src/absinth.py b/src/absinth.py
index 144cb20..00f49f7 100644
--- a/src/absinth.py
+++ b/src/absinth.py
@@ -8,13 +8,15 @@ import numpy as np # for calculations
 import config
 import spacy # for nlp
 from multiprocessing import Pool
+import random
 
 nlp = spacy.load('en') # standard english nlp
 
-
 #counts occurences of nodes and cooccurrences
 def frequencies(corpus_path, target):
     
+    random.seed(1) #fixed seed so the shuffled file order is reproducible across runs
+    
     stop_words = set(stopwords.words('english') + config.stop_words)
     allowed_tags = config.allowed_tags
     min_context_size = config.min_context_size
@@ -24,13 +26,15 @@ def frequencies(corpus_path, target):
     node_freq = dict() #counts (potential) nodes
     edge_freq = dict() #counts (potential) edges
     
-    files = [corpus_path + f for f in os.listdir(corpus_path)] #file names of corpus files
     s_target = target.replace('_', ' ') #target word with spaces
+    files = [corpus_path + f for f in os.listdir(corpus_path)] #file names of corpus files
+    
+    random.shuffle(files) #process the corpus in (seeded) random order instead of directory order
     
     i = 0 #for update print statements
     for f in files:
         
-        if i % int(len(files)/10) == 0: #prints update after every 10th of the corpus is parsed
+        if i % max(len(files)//11, 1) == 0: #prints an update after roughly every 11th of the corpus is parsed; max() avoids modulo by zero
             
             file_ratio = i/len(files[:])
             max_node_ratio = len(node_freq)/max_nodes
@@ -41,7 +45,7 @@ def frequencies(corpus_path, target):
             #uses the ratio closest to 100%.
             percentage = int((max(ratios))*100)
             
-            print('[a] ~{:02d}%\tNodes: {}\tEdges: {}.'.format(percentage, len(node_freq), len(edge_freq)), target)
+            print('[a] ~{:02d}%\tNodes: {}\tEdges: {}.'.format(percentage, len(node_freq), len(edge_freq))+'\t('+target+')')
         
         #checks maximum node values
         if len(node_freq) > max_nodes:
@@ -239,63 +243,61 @@ def disambiguate(mst, hubs, contexts, target=""):
     C = [c.lower().strip().replace(target, '') for c in contexts] #cleaned up contexts
     
     score_dict = dict() #memoisation for scores
-    result = list() #output of function
-
+    mapping_dict = {sense: [] for sense in range(1, len(H)+1)} #output of the function: maps each sense number to a list of context ids
+    
+    #if no sense is found for a target word, we should assume that there is only one sense
+    if len(H) == 0:
+        
+        return {1: [i for i in range(1, len(C)+1)]} #assign every context to sense 1 (clusters begin at 1)
+    
     for c in C:
         
         idx = C.index(c) + 1 #index based on position in list
+    
+        doc = nlp(c) #parsed context
+        texts = [tok.text for tok in doc] #tokens
         
-        #if no sense is found for a target word, we should assume that there only is one sense
-        if len(H) == 0: 
-            
-            result.append((1, idx, 0))
+        scores = np.zeros(len(H)) #initialise with zeros for every sense
         
-        else:
-            
-            doc = nlp(c) #parsed context
-            texts = [tok.text for tok in doc] #tokens
+        for text in texts:
             
-            scores = np.zeros(len(H)) #initialise with zeros for every sense
-            
-            for text in texts:
+            if text in T.nodes: #if word wasn't filtered out
                 
-                if text in T.nodes: #if word wasn't filtered out
-                    
-                    new_scores = list() #scores to be added to total scores
+                new_scores = list() #scores to be added to total scores
+                
+                for h in H: #for each hub
                     
-                    for h in H: #for each hub
-                        
-                        if (text, h) in score_dict: #memoisation
-                            
-                            new_scores.append(score_dict[(text,h)])
+                    if (text, h) in score_dict: #memoisation
                         
-                        else:
-                            
-                            new_score = score(T, text, h)
-                            new_scores.append(new_score)
-                            score_dict[(text,h)] = new_score #memoisation
+                        new_scores.append(score_dict[(text,h)])
+                    
+                    else:
                         
-                    scores = scores + np.array(new_scores)
-                
-                else:
-                
-                    pass
+                        new_score = score(T, text, h)
+                        new_scores.append(new_score)
+                        score_dict[(text,h)] = new_score #memoisation
+                    
+                scores = scores + np.array(new_scores)
             
-            #if the disambiguator could not detect a sense, it should return a singleton, ie. nothing
-            if np.max(scores) == 0:
+            else:
             
                 pass
+        
+        #if the disambiguator could not detect a sense, it should return a singleton, i.e. nothing
+        if np.max(scores) == 0:
             
-            else:
-                
-                #applies sense with the highest score to context
-                max_score = np.max(scores)
-                argmax_score = np.argmax(scores)
+            pass
+            
+        else:
                 
-                #clusters begin at 1
-                result.append((argmax_score + 1, idx))
+            #applies sense with the highest score to context
+            max_score = np.max(scores)
+            argmax_score = np.argmax(scores)
+            
+            #clusters begin at 1
+            mapping_dict[argmax_score + 1].append(idx)
 
-    return result
+    return mapping_dict
 
 
 # our main function, here the main stepps for word sense induction are called
@@ -326,16 +328,16 @@ def WSI(topic_id, topic_name, results):
     f.write('subTopicID\tresultID\n')
     
     #counts occurences of single words, as well as cooccurrences, saves it in dictionary
-    print('[a]', 'Counting nodes and edges.', old_target)
+    print('[a]', 'Counting nodes and edges.\t('+old_target+')')
     node_freq, edge_freq = frequencies(corpus_path, target)
     out_buffer += '[A] Nodes: {}\tEdges: {}\n'.format(str(len(node_freq)), str(len(edge_freq)))
     
     #builds graph from these dictionaries, also applies multiple filters
-    print('[a]', 'Building graph.', old_target)
+    print('[a]', 'Building graph.\t('+old_target+')')
     G = build_graph(node_freq, edge_freq)
     
     #finds root hubs (senses) within the graph + more filters for these
-    print('[a]', 'Collecting root hubs.', old_target)
+    print('[a]', 'Collecting root hubs.\t('+old_target+')')
     H = root_hubs(G, edge_freq)
     out_buffer += '[A] Root hubs:\n'
     
@@ -344,26 +346,29 @@ def WSI(topic_id, topic_name, results):
     for h in H:
         
         mfn = sorted(G.adj[h], key=lambda x: edge_freq[h,x] if h < x else edge_freq[x, h], reverse=True)[:6]
-        out_buffer += (' {}. {}: {}\n'.format(i, h, mfn))
+        out_buffer += (' {}. {}: {}\n'.format(i, h, ', '.join(mfn)))
         i += 1
     
     #performs minimum_spanning_tree algorithm on graph
-    print('[a]', 'Building minimum spanning tree.', old_target)
+    print('[a]', 'Building minimum spanning tree.\t('+old_target+')')
     T = components(G, H, target)
 
     #matches senses to clusters
-    print('[a]', 'Disambiguating results.', old_target)
+    print('[a]', 'Disambiguating results.\t('+old_target+')')
     D = disambiguate(T, H, results[topic_id], target)
-    out_buffer += ('[A] Mapping: '+ str(D) + '\n')
+    
+    out_buffer += ('[A] Mapping: \n')
+    for cluster, result_ids in D.items():
+        out_buffer += (' {}. : {}\n'.format(cluster, ', '.join([str(r) for r in result_ids])))
     
     #prints buffer
-    print('[a]', 'Writing to file.', old_target)
+    print('[a]', 'Writing to file.\t('+old_target+')')
     print(out_buffer)
     
     #writes clustering to file
-    for d in D:
-        
-        f.write(topic_id+'.'+str(d[0])+'\t'+topic_id+'.'+str(d[1])+'\n')
+    for cluster, result_ids in D.items():
+        for result in result_ids:
+            f.write(topic_id+'.'+str(cluster)+'\t'+topic_id+'.'+str(result)+'\n')
         
     f.close()
     
@@ -394,13 +399,15 @@ if __name__ == '__main__':
     
     # topics.txt is a list of target words
     topics = dict()
+    processed_topics = [f.replace('.absinth', '') for f in os.listdir(config.output)] #topics that already have an output file and can be skipped
     
     with open(data_path+'topics.txt', 'r') as topics_file:
         
         for line in topics_file.readlines()[1:]:
             
             l = line.split('\t')
-            topics[l[0]] = l[1]
+            if l[1].strip() not in processed_topics:
+                topics[l[0]] = l[1]
     
     # multiprocessing
     with Pool(4) as pool:
-- 
GitLab