From 5273a7b05632253a43d4c81d888ad8b3ca0be3cf Mon Sep 17 00:00:00 2001
From: Victor Zimmermann <zimmermann@cl.uni-heidelberg.de>
Date: Wed, 7 Mar 2018 18:14:32 +0100
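Subject: [PATCH] More minor edits.

- frequencies(): print a final ' 100%' progress line with the number of
  nodes and edges before returning.
- Extend the stop word set with 'talk', 'wikipedia', 'article', 'topic',
  'page', 'editors', 'encyclopedia' and 'free'.
- Main loop: keep the unmodified topic name in o_target and use it in the
  '[A] Processing' and '[A] Writing to file' messages; the name with a
  leading 'the_' stripped (target) is still used for the output file name.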
---
 code/absinth_nx.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/code/absinth_nx.py b/code/absinth_nx.py
index 0215edb..ebb30c9 100644
--- a/code/absinth_nx.py
+++ b/code/absinth_nx.py
@@ -84,7 +84,8 @@ def frequencies(corpus_path, target, stop_words=[], allowed_tags=['NN','NNS','JJ
                 #print('Failed to decode:', f)              
         
         i += 1
-        
+    
+    print(' 100%\tNodes: {}\tEdges: {}.'.format(len(node_freq), len(edge_freq)))
     return node_freq, edge_freq
 
 
@@ -227,7 +228,7 @@ if __name__ == '__main__':
     corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI/'
     results_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/clustering/'
     
-    stop = set(stopwords.words('english') + ['utc', 'new', 'other'])
+    stop = set(stopwords.words('english') + ['utc', 'new', 'other', 'talk', 'wikipedia', 'article', 'topic', 'page', 'editors', 'encyclopedia', 'free'])
     
     results = dict()
     
@@ -258,10 +259,12 @@ if __name__ == '__main__':
         
     for key, value in topics.items():
             
-        target = value.strip()
-        print("[A] Processing '"+target+"'.\n")
-        if target[:4] == 'the_' and target.count('_') >= 2: #hard coded 'the'-protection
-            target = target[4:]
+        o_target = value.strip() #original target
+        print("[A] Processing '"+o_target+"'.\n")
+        if o_target[:4] == 'the_' and o_target.count('_') >= 2: #hard coded 'the'-protection
+            target = o_target[4:]
+        else:
+            target = o_target
         
         f = open(results_path+target+'.absinth', 'w')
         f.write('subTopicID\tresultID\n')
@@ -285,7 +288,7 @@ if __name__ == '__main__':
         D = disambiguate(T, H, results[key])
         print(' Mapping:', D, '\n')
         
-        print('[A] Writing to file '+target+'.absinth.\n\n')
+        print('[A] Writing to file '+o_target+'.absinth.\n\n')
         for d in D:
             
             f.write(key+'.'+str(d[0])+'\t'+key+'.'+str(d[1])+'\n')
-- 
GitLab