diff --git a/code/absinth_nx.py b/code/absinth_nx.py index 0215edbd10daae94ffebcf786328eae9b400721a..ebb30c95e6fe72b2bd8dc443092a7a896f0c3d02 100644 --- a/code/absinth_nx.py +++ b/code/absinth_nx.py @@ -84,7 +84,8 @@ def frequencies(corpus_path, target, stop_words=[], allowed_tags=['NN','NNS','JJ #print('Failed to decode:', f) i += 1 - + + print(' 100%\tNodes: {}\tEdges: {}.'.format(len(node_freq), len(edge_freq))) return node_freq, edge_freq @@ -227,7 +228,7 @@ if __name__ == '__main__': corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI/' results_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/clustering/' - stop = set(stopwords.words('english') + ['utc', 'new', 'other']) + stop = set(stopwords.words('english') + ['utc', 'new', 'other', 'talk', 'wikipedia', 'article', 'topic', 'page', 'editors', 'encyclopedia', 'free']) results = dict() @@ -258,10 +259,12 @@ if __name__ == '__main__': for key, value in topics.items(): - target = value.strip() - print("[A] Processing '"+target+"'.\n") - if target[:4] == 'the_' and target.count('_') >= 2: #hard coded 'the'-protection - target = target[4:] + o_target = value.strip() #original target + print("[A] Processing '"+o_target+"'.\n") + if o_target[:4] == 'the_' and o_target.count('_') >= 2: #hard coded 'the'-protection + target = o_target[4:] + else: + target = o_target f = open(results_path+target+'.absinth', 'w') f.write('subTopicID\tresultID\n') @@ -285,7 +288,7 @@ if __name__ == '__main__': D = disambiguate(T, H, results[key]) print(' Mapping:', D, '\n') - print('[A] Writing to file '+target+'.absinth.\n\n') + print('[A] Writing to file '+o_target+'.absinth.\n\n') for d in D: f.write(key+'.'+str(d[0])+'\t'+key+'.'+str(d[1])+'\n')