Skip to content
Snippets Groups Projects
Commit 5273a7b0 authored by Victor Zimmermann
Browse files

More minor edits.

parent 32cac421
No related branches found
No related tags found
No related merge requests found
...@@ -84,7 +84,8 @@ def frequencies(corpus_path, target, stop_words=[], allowed_tags=['NN','NNS','JJ ...@@ -84,7 +84,8 @@ def frequencies(corpus_path, target, stop_words=[], allowed_tags=['NN','NNS','JJ
#print('Failed to decode:', f) #print('Failed to decode:', f)
i += 1 i += 1
print(' 100%\tNodes: {}\tEdges: {}.'.format(len(node_freq), len(edge_freq)))
return node_freq, edge_freq return node_freq, edge_freq
...@@ -227,7 +228,7 @@ if __name__ == '__main__': ...@@ -227,7 +228,7 @@ if __name__ == '__main__':
corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI/' corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI/'
results_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/clustering/' results_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/clustering/'
stop = set(stopwords.words('english') + ['utc', 'new', 'other']) stop = set(stopwords.words('english') + ['utc', 'new', 'other', 'talk', 'wikipedia', 'article', 'topic', 'page', 'editors', 'encyclopedia', 'free'])
results = dict() results = dict()
...@@ -258,10 +259,12 @@ if __name__ == '__main__': ...@@ -258,10 +259,12 @@ if __name__ == '__main__':
for key, value in topics.items(): for key, value in topics.items():
target = value.strip() o_target = value.strip() #original target
print("[A] Processing '"+target+"'.\n") print("[A] Processing '"+o_target+"'.\n")
if target[:4] == 'the_' and target.count('_') >= 2: #hard coded 'the'-protection if o_target[:4] == 'the_' and o_target.count('_') >= 2: #hard coded 'the'-protection
target = target[4:] target = o_target[4:]
else:
target = o_target
f = open(results_path+target+'.absinth', 'w') f = open(results_path+target+'.absinth', 'w')
f.write('subTopicID\tresultID\n') f.write('subTopicID\tresultID\n')
...@@ -285,7 +288,7 @@ if __name__ == '__main__': ...@@ -285,7 +288,7 @@ if __name__ == '__main__':
D = disambiguate(T, H, results[key]) D = disambiguate(T, H, results[key])
print(' Mapping:', D, '\n') print(' Mapping:', D, '\n')
print('[A] Writing to file '+target+'.absinth.\n\n') print('[A] Writing to file '+o_target+'.absinth.\n\n')
for d in D: for d in D:
f.write(key+'.'+str(d[0])+'\t'+key+'.'+str(d[1])+'\n') f.write(key+'.'+str(d[0])+'\t'+key+'.'+str(d[1])+'\n')
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment