diff --git a/code/absinth_nx.py b/code/absinth_nx.py
index ebb30c95e6fe72b2bd8dc443092a7a896f0c3d02..02fd7ae2a74b79983b76fedb24c2e70cdc22abd5 100644
--- a/code/absinth_nx.py
+++ b/code/absinth_nx.py
@@ -10,7 +10,7 @@ import numpy as np # for calculations
 
 nlp = spacy.load('en') # standard english nlp
 
-def frequencies(corpus_path, target, stop_words=[], allowed_tags=['NN','NNS','JJ','JJS','JJR','NNP','VBZ','VBG'], min_context_size = 4, max_nodes=100000, max_edges=10000000):
+def frequencies(corpus_path, target, stop_words=[], allowed_tags=['NN','NNS','JJ','JJS','JJR','NNP','VBZ','VBG'], min_context_size = 2, max_nodes=100000, max_edges=10000000):
 
     node_freq = dict()
     edge_freq = dict()
@@ -19,9 +19,9 @@ def frequencies(corpus_path, target, stop_words=[], allowed_tags=['NN','NNS','JJ
     s_target = target.replace('_', ' ') #target word with spaces
 
     i = 0
-    for f in files[:]:
+    for f in files:
 
-        if i % int(len(files[:])/23) == 0:
+        if i % int(len(files)/23) == 0:
             file_ratio = i/len(files[:])
             max_node_ratio = len(node_freq)/max_nodes
             max_edge_ratio = len(edge_freq)/max_edges
@@ -166,54 +166,79 @@ def components(graph, hubs, target):
 
     return T
 
-#Uses MST to disambiguate context, should ideally write to evaluator format
+def score(graph, from_node, to_node):
+
+    if nx.has_path(graph, from_node, to_node):
+
+        path = nx.shortest_path(graph, from_node, to_node, 'weight')
+        total_weight = 0
+
+        for i in range(1, len(path)):
+            sub_from, sub_to = path[i-1], path[i]
+            total_weight += graph[sub_from][sub_to]['weight']
+
+        return 1/(1+total_weight)
+
+    else:
+
+        return 0
+
+
 def disambiguate(mst, hubs, contexts):
 
     T = mst
     H = hubs
     C = [c.lower().strip() for c in contexts]
 
-    backup_cluster = len(H)
-    result = []
-    for v in list(T.nodes):
+    score_dict = dict()
+    result = list()
+
+    for c in C:
 
-        scores = []
+        idx = C.index(c) + 1
 
-        for h in H:
+        #if no sense is found for a target word, we should assume that there only is one sense
+        if len(H) == 0:
 
-            if nx.has_path(T,v,h):
-
-                path = nx.shortest_path(T,v,h,'weight')
-                total_weight = 0
+            result.append((0, idx))
+
+        else:
 
-                for i in range(1, len(path)):
-
-                    total_weight += T[path[i-1]][path[i]]['weight']
+            doc = nlp(c)
+            texts = [tok.text for tok in doc]
+
+            scores = np.zeros(len(H)) #initialise with zeros for every sense
 
-                scores.append(1/(1+total_weight))
+            for text in texts:
+                if text in T.nodes:
+
+                    new_scores = list()
+
+                    for h in H:
+                        if (text, h) in score_dict:
+                            new_scores.append(score_dict[(text,h)])
+                        else:
+                            new_score = score(T, text, h)
+                            new_scores.append(new_score)
+                            score_dict[(text,h)] = new_scores
+
+                    scores = np.add(scores, new_scores)
+
+                else:
+
+                    pass
+
+            #if the disambiguator could not detect a sense, it should return a singleton, ie. nothing
+            if np.max(scores) == 0:
+
+                pass
+
             else:
-                scores.append(0)
-
-        T.nodes[v]['s'] = np.array([s if s == max(scores) else 0 for s in scores])
-
-    for c in C:
-
-        toks = [t.text for t in nlp(c)]
-        vector = np.sum([T.nodes[t]['s'] if t in T.nodes else np.zeros(len(H)) for t in toks], axis=0)
-
-        idx = C.index(c) + 1
-
-        if len(vector) == 0: #if no senses are found -> all in one
-            result.append((0, idx))
-        elif max(vector) == 0: #if no sense matches -> singletons
-            pass
-        else:
-            result.append((np.argmax(vector), idx))
-
-    return result
+                result.append((np.argmax(scores), idx))
+    return result
 
 
 def backup(contexts):
@@ -225,10 +250,10 @@ if __name__ == '__main__':
 
     data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE/'
     #corpus_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/test'
-    corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI/'
+    corpus_path = '/proj/absinth/wikipedia_reduced/'
     results_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/clustering/'
 
-    stop = set(stopwords.words('english') + ['utc', 'new', 'other', 'talk', 'wikipedia', 'article', 'topic', 'page', 'editors', 'encyclopedia', 'free'])
+    stop = set(stopwords.words('english') + ['utc', "'s", 'new', 'other', 'talk', 'wikipedia', 'article', 'topic', 'page', 'editors', 'encyclopedia', 'free'])
 
     results = dict()
 
@@ -251,7 +276,7 @@ if __name__ == '__main__':
 
     already_processed = [f.replace('.absinth', '') for f in os.listdir(results_path)]
 
-    for line in topics_file.readlines()[1:]:
+    for line in topics_file.readlines()[1:5]:
 
         l = line.split('\t')
         if l[1] not in already_processed:
@@ -279,9 +304,9 @@ if __name__ == '__main__':
             H = root_hubs(G, edge_freq)
             for h in H:
                 mfn = sorted(G.adj[h], key=lambda key: edge_freq[h,key] if h < key else edge_freq[key, h], reverse=True)[:6]
-                print(' {}: {}\n'.format(h, mfn))
+                print(' {}: {}'.format(h, mfn))
 
-            print('[A] Building Minimum Spanning Tree.\n')
+            print('\n[A] Building Minimum Spanning Tree.\n')
             T = components(G, H, target)
 
             print('[A] Disambiguating Results...')
@@ -294,6 +319,3 @@ if __name__ == '__main__':
                     f.write(key+'.'+str(d[0])+'\t'+key+'.'+str(d[1])+'\n')
 
             f.close()
-
-            #nx.draw(T, with_labels=True)
-            #plt.show()