Skip to content
Snippets Groups Projects
Commit 201549dd authored by Victor Zimmermann's avatar Victor Zimmermann
Browse files

Added backup dummy (and singleton clustering if no sense matches or no sense...

Added a backup dummy (for the time being, falls back to singleton clustering if no sense matches or no sense was found).
parent c24c8230
No related branches found
No related tags found
No related merge requests found
......@@ -8,6 +8,7 @@ from copy import deepcopy
import numpy as np # for calculations
nlp = spacy.load('en') # standard english nlp
def frequencies(corpus_path, target, stop_words=['utc', 'new', 'other'], allowed_tags=['NN','NNS','JJ','JJS','JJR','NNP'], min_context_size = 4, max_nodes=10000, max_edges=1000000):
node_freq = dict()
......@@ -142,6 +143,7 @@ def root_hubs(graph, edge_freq, min_neighbors=4, theshold=0.8):
return H
#Components algorithm from Véronis (2004), converts graph for target into a MST
def components(graph, hubs, target):
......@@ -167,7 +169,8 @@ def disambiguate(mst, hubs, contexts):
T = mst
H = hubs
cluster = []
backup_cluster = len(H)
result = []
for v in list(T.nodes):
......@@ -196,24 +199,36 @@ def disambiguate(mst, hubs, contexts):
idx = contexts.index(c) + 1
try:
cluster.append((np.argmax(vector), idx))
except:
cluster.append((len(H), idx))
return cluster
if max(vector) == 0:
result.append((backup_cluster, idx))
backup_cluster += 1
else:
try:
cluster = np.argmax(vector)
result.append((cluster, idx))
except:
result.append((backup_cluster, idx))
backup_cluster += 1
return result
def backup(contexts):
    """Placeholder for a fallback clustering strategy.

    Intended to assign contexts to clusters when the main
    disambiguation pipeline produces no usable senses; not yet
    implemented, so it currently returns None.
    """
    return None
if __name__ == '__main__':
data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE'
data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE/'
#corpus_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/test'
corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI'
corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI/'
results_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/results/'
results = dict()
with open(data_path+'/results.txt', 'r') as results_file:
with open(data_path+'results.txt', 'r') as results_file:
for line in results_file.readlines()[1:]:
......@@ -228,19 +243,22 @@ if __name__ == '__main__':
topics = dict()
with open(data_path+'/topics.txt', 'r') as topics_file:
with open(data_path+'topics.txt', 'r') as topics_file:
already_processed = [f.replace('.absinth', '') for f in os.listdir(results_path)]
for line in topics_file.readlines()[1:]:
l = line.split('\t')
topics[l[0]] = l[1]
if l[1] not in already_processed:
topics[l[0]] = l[1]
for key, value in topics.items():
target = value.strip()
print("[A] Processing '"+target+"'.\n")
f = open('/home/students/zimmermann/Courses/ws17/fsem/absinth/results/'+target+'.absinth', 'w')
f = open(results_path+target+'.absinth', 'w')
f.write('subTopicID\tresultID\n')
print('[A] Counting Tokens...')
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment