Skip to content
Snippets Groups Projects
Commit 5273a7b0 authored by Victor Zimmermann
Browse files

More minor edits.

parent 32cac421
No related branches found
No related tags found
No related merge requests found
......@@ -84,7 +84,8 @@ def frequencies(corpus_path, target, stop_words=[], allowed_tags=['NN','NNS','JJ
#print('Failed to decode:', f)
i += 1
print(' 100%\tNodes: {}\tEdges: {}.'.format(len(node_freq), len(edge_freq)))
return node_freq, edge_freq
......@@ -227,7 +228,7 @@ if __name__ == '__main__':
corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI/'
results_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/clustering/'
stop = set(stopwords.words('english') + ['utc', 'new', 'other'])
stop = set(stopwords.words('english') + ['utc', 'new', 'other', 'talk', 'wikipedia', 'article', 'topic', 'page', 'editors', 'encyclopedia', 'free'])
results = dict()
......@@ -258,10 +259,12 @@ if __name__ == '__main__':
for key, value in topics.items():
target = value.strip()
print("[A] Processing '"+target+"'.\n")
if target[:4] == 'the_' and target.count('_') >= 2: #hard coded 'the'-protection
target = target[4:]
o_target = value.strip() #original target
print("[A] Processing '"+o_target+"'.\n")
if o_target[:4] == 'the_' and o_target.count('_') >= 2: #hard coded 'the'-protection
target = o_target[4:]
else:
target = o_target
f = open(results_path+target+'.absinth', 'w')
f.write('subTopicID\tresultID\n')
......@@ -285,7 +288,7 @@ if __name__ == '__main__':
D = disambiguate(T, H, results[key])
print(' Mapping:', D, '\n')
print('[A] Writing to file '+target+'.absinth.\n\n')
print('[A] Writing to file '+o_target+'.absinth.\n\n')
for d in D:
f.write(key+'.'+str(d[0])+'\t'+key+'.'+str(d[1])+'\n')
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment