Commit 23854a3e authored by Victor Zimmermann

Restructure disambiguate(), but not yet happy with results.

parent 5273a7b0
@@ -10,7 +10,7 @@ import numpy as np # for calculations
 
 nlp = spacy.load('en') # standard english nlp
 
-def frequencies(corpus_path, target, stop_words=[], allowed_tags=['NN','NNS','JJ','JJS','JJR','NNP','VBZ','VBG'], min_context_size = 4, max_nodes=100000, max_edges=10000000):
+def frequencies(corpus_path, target, stop_words=[], allowed_tags=['NN','NNS','JJ','JJS','JJR','NNP','VBZ','VBG'], min_context_size = 2, max_nodes=100000, max_edges=10000000):
 
     node_freq = dict()
     edge_freq = dict()
@@ -19,9 +19,9 @@ def frequencies(corpus_path, target, stop_words=[], allowed_tags=['NN','NNS','JJ
     s_target = target.replace('_', ' ') #target word with spaces
 
     i = 0
-    for f in files[:]:
+    for f in files:
 
-        if i % int(len(files[:])/23) == 0:
+        if i % int(len(files)/23) == 0:
             file_ratio = i/len(files[:])
             max_node_ratio = len(node_freq)/max_nodes
             max_edge_ratio = len(edge_freq)/max_edges
@@ -166,54 +166,79 @@ def components(graph, hubs, target):
 
     return T
 
 #Uses MST to disambiguate context, should ideally write to evaluator format
+def score(graph, from_node, to_node):
+
+    if nx.has_path(graph, from_node, to_node):
+
+        path = nx.shortest_path(graph, from_node, to_node, 'weight')
+        total_weight = 0
+
+        for i in range(1, len(path)):
+            sub_from, sub_to = path[i-1], path[i]
+            total_weight += graph[sub_from][sub_to]['weight']
+
+        return 1/(1+total_weight)
+
+    else:
+        return 0
+
 def disambiguate(mst, hubs, contexts):
 
     T = mst
     H = hubs
     C = [c.lower().strip() for c in contexts]
 
-    backup_cluster = len(H)
-    result = []
-
-    for v in list(T.nodes):
-
-        scores = []
-
-        for h in H:
-
-            if nx.has_path(T,v,h):
-
-                path = nx.shortest_path(T,v,h,'weight')
-                total_weight = 0
-
-                for i in range(1, len(path)):
-                    total_weight += T[path[i-1]][path[i]]['weight']
-
-                scores.append(1/(1+total_weight))
-
-            else:
-                scores.append(0)
-
-        T.nodes[v]['s'] = np.array([s if s == max(scores) else 0 for s in scores])
-
-    for c in C:
-
-        toks = [t.text for t in nlp(c)]
-        vector = np.sum([T.nodes[t]['s'] if t in T.nodes else np.zeros(len(H)) for t in toks], axis=0)
-        idx = C.index(c) + 1
-
-        if len(vector) == 0: #if no senses are found -> all in one
-            result.append((0, idx))
-        elif max(vector) == 0: #if no sense matches -> singletons
-            pass
-        else:
-            result.append((np.argmax(vector), idx))
-
-    return result
+    score_dict = dict()
+    result = list()
+
+    for c in C:
+
+        idx = C.index(c) + 1
+
+        #if no sense is found for a target word, we should assume that there only is one sense
+        if len(H) == 0:
+            result.append((0, idx))
+
+        else:
+
+            doc = nlp(c)
+            texts = [tok.text for tok in doc]
+
+            scores = np.zeros(len(H)) #initialise with zeros for every sense
+
+            for text in texts:
+
+                if text in T.nodes:
+
+                    new_scores = list()
+
+                    for h in H:
+
+                        if (text, h) in score_dict:
+                            new_scores.append(score_dict[(text,h)])
+                        else:
+                            new_score = score(T, text, h)
+                            new_scores.append(new_score)
+                            score_dict[(text,h)] = new_score
+
+                    scores = np.add(scores, new_scores)
+
+                else:
+                    pass
+
+            #if the disambiguator could not detect a sense, it should return a singleton, ie. nothing
+            if np.max(scores) == 0:
+                pass
+            else:
+                result.append((np.argmax(scores), idx))
+
+    return result
 
 def backup(contexts):
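For reference, a minimal standalone sketch (not part of the commit) of how the new path-based score behaves: it returns 1/(1 + total weight of the lightest path) between a token and a hub, and 0 when no path exists. The graph, node names, and edge weights below are invented purely for illustration.

import networkx as nx

# standalone copy of the new helper, for illustration only
def score(graph, from_node, to_node):
    if nx.has_path(graph, from_node, to_node):
        path = nx.shortest_path(graph, from_node, to_node, 'weight')
        total_weight = sum(graph[path[i-1]][path[i]]['weight']
                           for i in range(1, len(path)))
        return 1/(1+total_weight)
    else:
        return 0

# toy co-occurrence graph with made-up nodes and weights
G = nx.Graph()
G.add_edge('bank', 'river', weight=0.2)
G.add_edge('bank', 'money', weight=0.7)
G.add_edge('river', 'water', weight=0.1)
G.add_edge('loan', 'credit', weight=0.4)   # separate component

print(score(G, 'water', 'bank'))   # 1/(1+0.3) ~ 0.77, via 'river'
print(score(G, 'money', 'water'))  # 1/(1+1.0) = 0.5
print(score(G, 'money', 'loan'))   # no path between components -> 0

In the restructured disambiguate(), these per-token scores are summed over all tokens of a context, the context is assigned to the hub with the highest total, and (token, hub) pairs are cached in score_dict so repeated tokens are not re-scored.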
@@ -225,10 +250,10 @@ if __name__ == '__main__':
 
     data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE/'
     #corpus_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/test'
-    corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI/'
+    corpus_path = '/proj/absinth/wikipedia_reduced/'
     results_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/clustering/'
 
-    stop = set(stopwords.words('english') + ['utc', 'new', 'other', 'talk', 'wikipedia', 'article', 'topic', 'page', 'editors', 'encyclopedia', 'free'])
+    stop = set(stopwords.words('english') + ['utc', "'s", 'new', 'other', 'talk', 'wikipedia', 'article', 'topic', 'page', 'editors', 'encyclopedia', 'free'])
 
     results = dict()
@@ -251,7 +276,7 @@ if __name__ == '__main__':
 
     already_processed = [f.replace('.absinth', '') for f in os.listdir(results_path)]
 
-    for line in topics_file.readlines()[1:]:
+    for line in topics_file.readlines()[1:5]:
 
         l = line.split('\t')
         if l[1] not in already_processed:
@@ -279,9 +304,9 @@ if __name__ == '__main__':
 
             H = root_hubs(G, edge_freq)
 
             for h in H:
                 mfn = sorted(G.adj[h], key=lambda key: edge_freq[h,key] if h < key else edge_freq[key, h], reverse=True)[:6]
-                print(' {}: {}\n'.format(h, mfn))
+                print(' {}: {}'.format(h, mfn))
 
-            print('[A] Building Minimum Spanning Tree.\n')
+            print('\n[A] Building Minimum Spanning Tree.\n')
             T = components(G, H, target)
 
             print('[A] Disambiguating Results...')
@@ -294,6 +319,3 @@ if __name__ == '__main__':
 
             f.write(key+'.'+str(d[0])+'\t'+key+'.'+str(d[1])+'\n')
         f.close()
-
-    #nx.draw(T, with_labels=True)
-    #plt.show()