From 23854a3ef1a4f045172f7867418a8906e66fc150 Mon Sep 17 00:00:00 2001
From: Victor Zimmermann <zimmermann@cl.uni-heidelberg.de>
Date: Thu, 8 Mar 2018 18:06:18 +0100
Subject: [PATCH] Restructure disambiguate(), but not yet happy with results.
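
Factor the shortest-path scoring out of disambiguate() into a score()
helper, memoise scores per (token, hub) pair, and score each context's
tokens against the hubs directly instead of attaching a per-node sense
vector to the MST. Contexts with no matching sense stay singletons.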

---
 code/absinth_nx.py | 116 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 72 insertions(+), 44 deletions(-)

diff --git a/code/absinth_nx.py b/code/absinth_nx.py
index ebb30c9..02fd7ae 100644
--- a/code/absinth_nx.py
+++ b/code/absinth_nx.py
@@ -10,7 +10,7 @@ import numpy as np # for calculations
 nlp = spacy.load('en') # standard english nlp
 
 
-def frequencies(corpus_path, target, stop_words=[], allowed_tags=['NN','NNS','JJ','JJS','JJR','NNP','VBZ','VBG'], min_context_size = 4, max_nodes=100000, max_edges=10000000):
+def frequencies(corpus_path, target, stop_words=[], allowed_tags=['NN','NNS','JJ','JJS','JJR','NNP','VBZ','VBG'], min_context_size = 2, max_nodes=100000, max_edges=10000000):
     
     node_freq = dict()
     edge_freq = dict()
@@ -19,9 +19,9 @@ def frequencies(corpus_path, target, stop_words=[], allowed_tags=['NN','NNS','JJ
     s_target = target.replace('_', ' ') #target word with spaces
     
     i = 0
-    for f in files[:]:
+    for f in files:
         
-        if i % int(len(files[:])/23) == 0:
+        if i % max(1, len(files)//23) == 0: #guard against ZeroDivisionError for fewer than 23 files
             file_ratio = i/len(files[:])
             max_node_ratio = len(node_freq)/max_nodes
             max_edge_ratio = len(edge_freq)/max_edges
@@ -166,54 +166,85 @@ def components(graph, hubs, target):
     return T
 
 
-#Uses MST to disambiguate context, should ideally write to evaluator format
+def score(graph, from_node, to_node):
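+    #similarity as inverse total path weight; 0 if the nodes are disconnected
+    #e.g. if the cheapest path a-b-c has weights 0.4 and 0.6,
+    #score(G, 'a', 'c') == 1/(1+1.0) == 0.5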
+    
+    if nx.has_path(graph, from_node, to_node):
+                
+        path = nx.shortest_path(graph, from_node, to_node, 'weight')
+        total_weight = 0
+    
+        for i in range(1, len(path)):
+            sub_from, sub_to = path[i-1], path[i]
+            total_weight += graph[sub_from][sub_to]['weight']
+    
+        return 1/(1+total_weight)
+        
+    else:
+        
+        return 0
+
+
 def disambiguate(mst, hubs, contexts):
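+    #assigns each context to the hub (sense) with the highest summed
+    #token-to-hub score on the MST; returns (sense_index, context_index)
+    #pairs, with contexts counted from 1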
     
     T = mst
     H = hubs
     C = [c.lower().strip() for c in contexts]
-    backup_cluster = len(H)
-    result = []
     
-    for v in list(T.nodes):
+    score_dict = dict()
+    result = list()
+
+    for idx, c in enumerate(C, 1): #C.index(c) would misnumber duplicate contexts
         
-        scores = []
         
-        for h in H:
+        #if no hubs (senses) were found for the target word, assume a single sense
+        if len(H) == 0:
             
-            if nx.has_path(T,v,h):
-                
-                path = nx.shortest_path(T,v,h,'weight')
-                total_weight = 0
+            result.append((0, idx))
+        
+        else:
             
-                for i in range(1, len(path)):
-                    
-                    total_weight += T[path[i-1]][path[i]]['weight']
+            doc = nlp(c)
+            texts = [tok.text for tok in doc]
+            
+            scores = np.zeros(len(H)) #initialise with zeros for every sense
             
-                scores.append(1/(1+total_weight))
+            for text in texts:
                 
+                if text in T.nodes:
+                    
+                    new_scores = list()
+                    
+                    for h in H:
+                        if (text, h) in score_dict:
+                            new_scores.append(score_dict[(text,h)])
+                        else:
+                            new_score = score(T, text, h)
+                            new_scores.append(new_score)
+                            score_dict[(text,h)] = new_score #cache the scalar, not the list
+                        
+                    scores = np.add(scores, new_scores)
+                
+                #tokens absent from the graph contribute nothing
+            
+            #if no sense could be detected, leave the context as a singleton, i.e. append nothing
+            if np.max(scores) == 0:
+            
+                pass
+            
             else:
                 
-                scores.append(0)
-        
-        T.nodes[v]['s'] = np.array([s if s == max(scores) else 0 for s in scores])
-    
-    for c in C:
-        
-        toks = [t.text for t in nlp(c)]
-        vector = np.sum([T.nodes[t]['s'] if t in T.nodes else np.zeros(len(H)) for t in toks], axis=0)
-        
-        idx = C.index(c) + 1
-        
-        if len(vector) == 0: #if no senses are found -> all in one
-            result.append((0, idx))
-        elif max(vector) == 0: #if no sense matches -> singletons
-            pass
-        else: 
-            result.append((np.argmax(vector), idx))
-        
-    return result
+                result.append((np.argmax(scores), idx))
 
+    return result
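+
+#in __main__, each returned (sense, context) pair is written as
+#'<key>.<sense>\t<key>.<context>' to the topic's .absinth result file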
 
 def backup(contexts):
     
@@ -225,10 +250,10 @@ if __name__ == '__main__':
     
     data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE/'
     #corpus_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/test'
-    corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI/'
+    corpus_path = '/proj/absinth/wikipedia_reduced/'
     results_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/clustering/'
     
-    stop = set(stopwords.words('english') + ['utc', 'new', 'other', 'talk', 'wikipedia', 'article', 'topic', 'page', 'editors', 'encyclopedia', 'free'])
+    stop = set(stopwords.words('english') + ['utc', "'s", 'new', 'other', 'talk', 'wikipedia', 'article', 'topic', 'page', 'editors', 'encyclopedia', 'free'])
     
     results = dict()
     
@@ -251,7 +276,7 @@ if __name__ == '__main__':
         
         already_processed = [f.replace('.absinth', '') for f in os.listdir(results_path)]
         
-        for line in topics_file.readlines()[1:]:
+        for line in topics_file.readlines()[1:5]: #skip the header row; only the first four topics for now
             
             l = line.split('\t')
             if l[1] not in already_processed:
@@ -279,9 +304,9 @@ if __name__ == '__main__':
         H = root_hubs(G, edge_freq)
         for h in H:
             mfn = sorted(G.adj[h], key=lambda key: edge_freq[h,key] if h < key else edge_freq[key, h], reverse=True)[:6]
-            print(' {}: {}\n'.format(h, mfn))
+            print(' {}: {}'.format(h, mfn))
         
-        print('[A] Building Minimum Spanning Tree.\n')
+        print('\n[A] Building Minimum Spanning Tree.\n')
         T = components(G, H, target)
     
         print('[A] Disambiguating Results...')
@@ -294,6 +319,3 @@ if __name__ == '__main__':
             f.write(key+'.'+str(d[0])+'\t'+key+'.'+str(d[1])+'\n')
             
         f.close()
-    
-    #nx.draw(T, with_labels=True)
-    #plt.show()
-- 
GitLab