Skip to content
Snippets Groups Projects
Commit 201549dd authored by Victor Zimmermann's avatar Victor Zimmermann
Browse files

Added backup dummy (and singleton clustering if no sense matches or no sense...

Added a backup dummy (for the time being, falls back to singleton clustering if no sense matches or no sense was found).
parent c24c8230
No related branches found
No related tags found
No related merge requests found
......@@ -8,6 +8,7 @@ from copy import deepcopy
import numpy as np # for calculations
nlp = spacy.load('en') # standard english nlp
def frequencies(corpus_path, target, stop_words=['utc', 'new', 'other'], allowed_tags=['NN','NNS','JJ','JJS','JJR','NNP'], min_context_size = 4, max_nodes=10000, max_edges=1000000):
node_freq = dict()
......@@ -142,6 +143,7 @@ def root_hubs(graph, edge_freq, min_neighbors=4, theshold=0.8):
return H
#Components algorithm from Véronis (2004), converts graph for target into a MST
def components(graph, hubs, target):
......@@ -167,7 +169,8 @@ def disambiguate(mst, hubs, contexts):
T = mst
H = hubs
cluster = []
backup_cluster = len(H)
result = []
for v in list(T.nodes):
......@@ -196,24 +199,36 @@ def disambiguate(mst, hubs, contexts):
idx = contexts.index(c) + 1
try:
cluster.append((np.argmax(vector), idx))
except:
cluster.append((len(H), idx))
return cluster
if max(vector) == 0:
result.append((backup_cluster, idx))
backup_cluster += 1
else:
try:
cluster = np.argmax(vector)
result.append((cluster, idx))
except:
result.append((backup_cluster, idx))
backup_cluster += 1
return result
def backup(contexts):
    """Placeholder for a fallback clustering strategy.

    Intended to assign contexts to clusters when the main
    disambiguation pipeline produces no usable senses; not yet
    implemented, so it currently returns None.
    """
    return None
if __name__ == '__main__':
data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE'
data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE/'
#corpus_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/test'
corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI'
corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI/'
results_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/results/'
results = dict()
with open(data_path+'/results.txt', 'r') as results_file:
with open(data_path+'results.txt', 'r') as results_file:
for line in results_file.readlines()[1:]:
......@@ -228,19 +243,22 @@ if __name__ == '__main__':
topics = dict()
with open(data_path+'/topics.txt', 'r') as topics_file:
with open(data_path+'topics.txt', 'r') as topics_file:
already_processed = [f.replace('.absinth', '') for f in os.listdir(results_path)]
for line in topics_file.readlines()[1:]:
l = line.split('\t')
topics[l[0]] = l[1]
if l[1] not in already_processed:
topics[l[0]] = l[1]
for key, value in topics.items():
target = value.strip()
print("[A] Processing '"+target+"'.\n")
f = open('/home/students/zimmermann/Courses/ws17/fsem/absinth/results/'+target+'.absinth', 'w')
f = open(results_path+target+'.absinth', 'w')
f.write('subTopicID\tresultID\n')
print('[A] Counting Tokens...')
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment