From 306e1df93ecb317221d38b193eed6e54d2b6d5b3 Mon Sep 17 00:00:00 2001
From: Victor Zimmermann <zimmermann@cl.uni-heidelberg.de>
Date: Wed, 7 Mar 2018 15:44:01 +0100
Subject: [PATCH] Add minor fixes.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
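
Lower the default min_neighbors in root_hubs from 5 to 4, normalize the
contexts (lowercase, strip) once before disambiguation and index into the
normalized list, write results to the clustering/ directory instead of
results/, and strip a leading 'the_' from multi-word targets.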

---
 code/absinth_nx.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/code/absinth_nx.py b/code/absinth_nx.py
index 3b97902..0215edb 100644
--- a/code/absinth_nx.py
+++ b/code/absinth_nx.py
@@ -112,7 +112,7 @@ def build_graph(node_freq, edge_freq, min_node_freq=10, min_edge_freq=5, max_wei
     return G
 
 
-def root_hubs(graph, edge_freq, min_neighbors=5, theshold=0.8):
+def root_hubs(graph, edge_freq, min_neighbors=4, theshold=0.8):
     
     G = deepcopy(graph)
     V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True) # reverse=True sorts by degree, descending (...3 -> 2 -> 1...)
@@ -170,6 +170,7 @@ def disambiguate(mst, hubs, contexts):
     
     T = mst
     H = hubs
+    C = [c.lower().strip() for c in contexts] # normalized contexts; indexed below to derive result IDs
     backup_cluster = len(H)
     result = []
     
@@ -196,13 +197,12 @@ def disambiguate(mst, hubs, contexts):
         
         T.nodes[v]['s'] = np.array([s if s == max(scores) else 0 for s in scores])
     
-    for c in contexts:
+    for c in C:
         
-        c = c.lower()
         toks = [t.text for t in nlp(c)]
         vector = np.sum([T.nodes[t]['s'] if t in T.nodes else np.zeros(len(H)) for t in toks], axis=0)
         
-        idx = contexts.index(c) + 1
+        idx = C.index(c) + 1
         
         if len(vector) == 0: #if no senses are found -> all in one
             result.append((0, idx))
@@ -225,7 +225,7 @@ if __name__ == '__main__':
     data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE/'
     #corpus_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/test'
     corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI/'
-    results_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/results/'
+    results_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/clustering/'
     
     stop = set(stopwords.words('english') + ['utc', 'new', 'other'])
     
@@ -260,7 +260,8 @@ if __name__ == '__main__':
             
         target = value.strip()
         print("[A] Processing '"+target+"'.\n")
-        
+        if target[:4] == 'the_' and target.count('_') >= 2: # hard-coded guard: strip a leading 'the_' from multi-word targets
+            target = target[4:]
         
         f = open(results_path+target+'.absinth', 'w')
         f.write('subTopicID\tresultID\n')
-- 
GitLab