diff --git a/code/absinth_nx.py b/code/absinth_nx.py
index 14726b16367fac1c79645058c2cf3e6d2bdf0de6..ce072f4108031619b470b6a05b5479b72820b144 100644
--- a/code/absinth_nx.py
+++ b/code/absinth_nx.py
@@ -5,11 +5,12 @@ import spacy # for nlp
 import networkx as nx # for visualisation
 import matplotlib.pyplot as plt # for visualisation
 from copy import deepcopy
+from nltk.corpus import stopwords
 import numpy as np # for calculations
 
 nlp = spacy.load('en') # standard english nlp
 
-def frequencies(corpus_path, target, stop_words=['utc', 'new', 'other'], allowed_tags=['NN','NNS','JJ','JJS','JJR','NNP'], min_context_size = 4, max_nodes=100000, max_edges=10000000):
+def frequencies(corpus_path, target, stop_words=[], allowed_tags=['NN','NNS','JJ','JJS','JJR','NNP','VBZ','VBG'], min_context_size = 4, max_nodes=100000, max_edges=10000000):
 
     node_freq = dict()
     edge_freq = dict()
@@ -111,7 +112,7 @@ def build_graph(node_freq, edge_freq, min_node_freq=10, min_edge_freq=5, max_wei
     return G
 
 
-def root_hubs(graph, edge_freq, min_neighbors=4, theshold=0.8):
+def root_hubs(graph, edge_freq, min_neighbors=5, theshold=0.8):
 
     G = deepcopy(graph)
     V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True) # -1 to sort descending (...3 -> 2 -> 1...)
@@ -178,16 +179,19 @@ def disambiguate(mst, hubs, contexts):
 
         for h in H:
 
-            try:
+            if nx.has_path(T,v,h):
+
                 path = nx.shortest_path(T,v,h,'weight')
                 total_weight = 0
 
                 for i in range(1, len(path)):
+
                     total_weight += T[path[i-1]][path[i]]['weight']
 
                 scores.append(1/(1+total_weight))
 
-            except:
+            else:
+
                 scores.append(0)
 
         T.nodes[v]['s'] = np.array([s if s == max(scores) else 0 for s in scores])
@@ -199,14 +203,12 @@ def disambiguate(mst, hubs, contexts):
 
         idx = contexts.index(c) + 1
 
-        try:
-            if max(vector) == 0:
-                pass
-            else:
-                cluster = np.argmax(vector)
-                result.append((cluster, idx))
-        except:
+        if len(vector) == 0: #if no senses are found -> all in one
             result.append((0, idx))
+        elif max(vector) == 0: #if no sense matches -> singletons
+            pass
+        else:
+            result.append((np.argmax(vector), idx))
 
     return result
 
@@ -224,6 +226,8 @@ if __name__ == '__main__':
     corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI/'
     results_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/results/'
 
+    stop = set(stopwords.words('english') + ['utc', 'new', 'other'])
+
     results = dict()
 
     with open(data_path+'results.txt', 'r') as results_file:
@@ -260,14 +264,16 @@ if __name__ == '__main__':
             f.write('subTopicID\tresultID\n')
 
         print('[A] Counting Tokens...')
-        node_freq, edge_freq = frequencies(corpus_path, target)
+        node_freq, edge_freq = frequencies(corpus_path, target, stop)
 
         print('\n[A] Building Graph.\n')
        G = build_graph(node_freq, edge_freq)
 
         print('[A] Collecting Root Hubs...')
         H = root_hubs(G, edge_freq)
-        print('Root Hubs:', H, '\n')
+        for h in H:
+            mfn = sorted(G.adj[h], key=lambda key: edge_freq[h,key] if h < key else edge_freq[key, h], reverse=True)[:6]
+            print('{}: {}\n'.format(h, mfn))
 
         print('[A] Building Minimum Spanning Tree.\n')
         T = components(G, H, target)