From 201549dd7dde89f64ab5db6c10aba83d60302b81 Mon Sep 17 00:00:00 2001
From: Victor Zimmermann <zimmermann@cl.uni-heidelberg.de>
Date: Tue, 6 Mar 2018 16:23:09 +0100
Subject: [PATCH] Added backup dummy and, for the time being, singleton
 clustering for contexts where no sense matches or none was found.

---
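Note on the disambiguate() change below: a context whose hub-score vector is
all zeros used to share the single dummy cluster len(H); it now opens its own
singleton cluster. A minimal sketch of the new behaviour, using hypothetical
toy hubs and score vectors that are not part of this patch:

    import numpy as np

    hubs = ['hub_a', 'hub_b']                    # sense clusters 0 and 1
    vectors = [[0.0, 0.7], [0.0, 0.0], [0.0, 0.0]]
    backup_cluster = len(hubs)                   # first free singleton id

    result = []
    for idx, vector in enumerate(vectors, start=1):
        if max(vector) == 0:                     # no hub matched this context
            result.append((backup_cluster, idx))
            backup_cluster += 1                  # next unmatched context gets a fresh id
        else:
            result.append((int(np.argmax(vector)), idx))

    print(result)                                # [(1, 1), (2, 2), (3, 3)]
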
 code/absinth_nx.py | 44 +++++++++++++++++++++++++++++++-------------
 1 file changed, 31 insertions(+), 13 deletions(-)

diff --git a/code/absinth_nx.py b/code/absinth_nx.py
index 54667b4..57cae2f 100644
--- a/code/absinth_nx.py
+++ b/code/absinth_nx.py
@@ -8,6 +8,7 @@ from copy import deepcopy
 import numpy as np # for calculations
 nlp = spacy.load('en') # standard english nlp
 
+
 def frequencies(corpus_path, target, stop_words=['utc', 'new', 'other'], allowed_tags=['NN','NNS','JJ','JJS','JJR','NNP'], min_context_size = 4, max_nodes=10000, max_edges=1000000):
     
     node_freq = dict()
@@ -142,6 +143,7 @@ def root_hubs(graph, edge_freq, min_neighbors=4, theshold=0.8):
     
     return H
 
+
 #Components algorithm from Véronis (2004), converts graph for target into a MST
 def components(graph, hubs, target):
     
@@ -167,7 +169,8 @@ def disambiguate(mst, hubs, contexts):
     
     T = mst
     H = hubs
-    cluster = []
+    backup_cluster = len(H) # next free cluster id after the hub clusters 0..len(H)-1
+    result = []
     
     for v in list(T.nodes):
         
@@ -196,24 +199,36 @@ def disambiguate(mst, hubs, contexts):
         
         idx = contexts.index(c) + 1
         
-        try:
-            cluster.append((np.argmax(vector), idx))
-        except:
-            cluster.append((len(H), idx))
-    
-    return cluster
+        if max(vector) == 0: # context matches no hub: open a new singleton cluster
+            result.append((backup_cluster, idx))
+            backup_cluster += 1
+        else:
+            try:
+                cluster = np.argmax(vector) # index of the best-scoring hub
+                result.append((cluster, idx))
+            except ValueError: # defensive fallback if argmax fails
+                result.append((backup_cluster, idx))
+                backup_cluster += 1
+
+    return result
 
 
+def backup(contexts):
+
+    # Dummy backup strategy (see commit message); not yet implemented.
+    pass
+
 
 if __name__ == '__main__':
     
-    data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE'
+    data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE/'
     #corpus_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/test'
-    corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI'
+    corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI/'
+    results_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/results/'
     
     results = dict()
     
-    with open(data_path+'/results.txt', 'r') as results_file:
+    with open(data_path+'results.txt', 'r') as results_file:
         
         for line in results_file.readlines()[1:]:
             
@@ -228,19 +243,22 @@ if __name__ == '__main__':
             
     topics = dict()
     
-    with open(data_path+'/topics.txt', 'r') as topics_file:
+    with open(data_path+'topics.txt', 'r') as topics_file:
+        
+        already_processed = [f.replace('.absinth', '') for f in os.listdir(results_path)] # targets that already have a result file
         
         for line in topics_file.readlines()[1:]:
             
             l = line.split('\t')
-            topics[l[0]] = l[1]
+            if l[1].strip() not in already_processed: # strip the trailing newline so the comparison can match
+                topics[l[0]] = l[1]
         
     for key, value in topics.items():
             
         target = value.strip()
         print("[A] Processing '"+target+"'.\n")
         
-        f = open('/home/students/zimmermann/Courses/ws17/fsem/absinth/results/'+target+'.absinth', 'w')
+        f = open(results_path+target+'.absinth', 'w')
         f.write('subTopicID\tresultID\n')
         
         print('[A] Counting Tokens...')
-- 
GitLab
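
Note: backup(contexts) is left as a stub in this patch. Going by the commit
message, a plausible interim implementation, an assumption rather than the
author's code, would mirror the singleton fallback in disambiguate() and give
every context its own cluster:

    def backup(contexts):
        # Assumed behaviour: one singleton cluster per context, with 1-based
        # context ids as in disambiguate(); replace once the real strategy lands.
        return [(i, i + 1) for i in range(len(contexts))]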