From 201549dd7dde89f64ab5db6c10aba83d60302b81 Mon Sep 17 00:00:00 2001
From: Victor Zimmermann <zimmermann@cl.uni-heidelberg.de>
Date: Tue, 6 Mar 2018 16:23:09 +0100
Subject: [PATCH] Added backup dummy (and singleton clustering if no sense
 matches or no sense was found for the time being).

---
 code/absinth_nx.py | 44 +++++++++++++++++++++++++++++++-------------
 1 file changed, 31 insertions(+), 13 deletions(-)

diff --git a/code/absinth_nx.py b/code/absinth_nx.py
index 54667b4..57cae2f 100644
--- a/code/absinth_nx.py
+++ b/code/absinth_nx.py
@@ -8,6 +8,7 @@ from copy import deepcopy
 import numpy as np # for calculations
 
 nlp = spacy.load('en') # standard english nlp
+
 def frequencies(corpus_path, target, stop_words=['utc', 'new', 'other'], allowed_tags=['NN','NNS','JJ','JJS','JJR','NNP'], min_context_size = 4, max_nodes=10000, max_edges=1000000):
     
     node_freq = dict()
@@ -142,6 +143,7 @@ def root_hubs(graph, edge_freq, min_neighbors=4, theshold=0.8):
     
     return H
 
+
 #Components algorithm from Véronis (2004), converts graph for target into a MST
 def components(graph, hubs, target):
     
@@ -167,7 +169,8 @@ def disambiguate(mst, hubs, contexts):
     
     T = mst
     H = hubs
-    cluster = []
+    backup_cluster = len(H)
+    result = []
     
     for v in list(T.nodes):
     
@@ -196,24 +199,36 @@ def disambiguate(mst, hubs, contexts):
         
         idx = contexts.index(c) + 1
         
-        try:
-            cluster.append((np.argmax(vector), idx))
-        except:
-            cluster.append((len(H), idx))
-    
-    return cluster
+        if max(vector) == 0:
+            result.append((backup_cluster, idx))
+            backup_cluster += 1
+        else:
+            try:
+                cluster = np.argmax(vector)
+                result.append((cluster, idx))
+            except:
+                result.append((backup_cluster, idx))
+                backup_cluster += 1
+    
+    return result
 
 
+def backup(contexts):
+    
+    pass
+
+
 if __name__ == '__main__':
     
-    data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE'
+    data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE/'
     #corpus_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/test'
-    corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI'
+    corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI/'
+    results_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/results/'
     
     results = dict()
     
-    with open(data_path+'/results.txt', 'r') as results_file:
+    with open(data_path+'results.txt', 'r') as results_file:
         
         for line in results_file.readlines()[1:]:
         
 
@@ -228,19 +243,22 @@ if __name__ == '__main__':
     
     topics = dict()
     
-    with open(data_path+'/topics.txt', 'r') as topics_file:
+    with open(data_path+'topics.txt', 'r') as topics_file:
+        
+        already_processed = [f.replace('.absinth', '') for f in os.listdir(results_path)]
         
         for line in topics_file.readlines()[1:]:
             
             l = line.split('\t')
-            topics[l[0]] = l[1]
+            if l[1] not in already_processed:
+                topics[l[0]] = l[1]
     
     for key, value in topics.items():
         
         target = value.strip()
         print("[A] Processing '"+target+"'.\n")
         
-        f = open('/home/students/zimmermann/Courses/ws17/fsem/absinth/results/'+target+'.absinth', 'w')
+        f = open(results_path+target+'.absinth', 'w')
         f.write('subTopicID\tresultID\n')
         
         print('[A] Counting Tokens...')
-- 
GitLab
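
The core change in disambiguate() replaces the old single catch-all cluster len(H) with fresh singleton clusters: every context whose hub-similarity vector is all zeros (or where np.argmax fails) now gets its own cluster id, counted upwards from the number of root hubs. Below is a minimal, self-contained Python sketch of that fallback logic; assign_clusters, similarity_vectors and num_hubs are illustrative names, not identifiers from the repository, and the toy vectors exist only to show the behaviour.

import numpy as np

def assign_clusters(similarity_vectors, num_hubs):
    # Sketch of the singleton fallback introduced by this patch: contexts
    # that match no hub each open a new cluster, numbered from num_hubs.
    backup_cluster = num_hubs
    result = []
    for idx, vector in enumerate(similarity_vectors, start=1):
        if max(vector) == 0:
            result.append((backup_cluster, idx))  # no hub matched: singleton cluster
            backup_cluster += 1
        else:
            result.append((int(np.argmax(vector)), idx))  # best-matching hub
    return result

# toy data: two hubs, three contexts; the last context matches no hub
print(assign_clusters([[0.4, 0.1], [0.0, 0.7], [0.0, 0.0]], num_hubs=2))
# -> [(0, 1), (1, 2), (2, 3)]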