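More minor edits: print the final node and edge counts once frequency counting finishes, add Wikipedia boilerplate terms ('talk', 'wikipedia', 'article', 'topic', 'page', 'editors', 'encyclopedia', 'free') to the stop-word list, and keep the original target name (o_target) for log messages while the 'the_'-stripped name is still used for the output file.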

5273a7b0 · Victor Zimmermann · 32cac421 · 5273a7b0
Commit 5273a7b0 authored 7 years ago by Victor Zimmermann
--- a/code/absinth_nx.py
+++ b/code/absinth_nx.py
@@ -84,7 +84,8 @@ def frequencies(corpus_path, target, stop_words=[], allowed_tags=['NN','NNS','JJ
                #print('Failed to decode:', f)              
        
        i += 1
-        
+    
+    print(' 100%\tNodes: {}\tEdges: {}.'.format(len(node_freq), len(edge_freq)))
    return node_freq, edge_freq


@@ -227,7 +228,7 @@ if __name__ == '__main__':
    corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI/'
    results_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/clustering/'
    
-    stop = set(stopwords.words('english') + ['utc', 'new', 'other'])
+    stop = set(stopwords.words('english') + ['utc', 'new', 'other', 'talk', 'wikipedia', 'article', 'topic', 'page', 'editors', 'encyclopedia', 'free'])
    
    results = dict()
    
@@ -258,10 +259,12 @@ if __name__ == '__main__':
        
    for key, value in topics.items():
            
-        target = value.strip()
-        print("[A] Processing '"+target+"'.\n")
-        if target[:4] == 'the_' and target.count('_') >= 2: #hard coded 'the'-protection
-            target = target[4:]
+        o_target = value.strip() #original target
+        print("[A] Processing '"+o_target+"'.\n")
+        if o_target[:4] == 'the_' and o_target.count('_') >= 2: #hard coded 'the'-protection
+            target = o_target[4:]
+        else:
+            target = o_target
        
        f = open(results_path+target+'.absinth', 'w')
        f.write('subTopicID\tresultID\n')
@@ -285,7 +288,7 @@ if __name__ == '__main__':
        D = disambiguate(T, H, results[key])
        print(' Mapping:', D, '\n')
        
-        print('[A] Writing to file '+target+'.absinth.\n\n')
+        print('[A] Writing to file '+o_target+'.absinth.\n\n')
        for d in D:
            
            f.write(key+'.'+str(d[0])+'\t'+key+'.'+str(d[1])+'\n')