diff --git a/code/absinth_nx.py b/code/absinth_nx.py
index 5e8f7247927c677f20f503d2dc386228a266b8f5..54667b43ac97a8180cf8d24a46498c6b19307b97 100644
--- a/code/absinth_nx.py
+++ b/code/absinth_nx.py
@@ -1,17 +1,14 @@
 import os # for reading files
 import sys
+print('[A] Loading ' + sys.argv[0] + '.\n')
 import spacy # for nlp
 import networkx as nx # for visualisation
 import matplotlib.pyplot as plt # for visualisation
 from copy import deepcopy
 import numpy as np # for calculations
 
 nlp = spacy.load('en') # standard english nlp
 
-try:
-    from tqdm import tqdm # for counting seconds
-except:
-    tqdm = lambda x: x
-def frequencies(corpus_path, target, stop_words=['utc', 'new'], allowed_tags=['NN','NNS','JJ','JJS','JJR','NNP'], min_context_size = 4):
+def frequencies(corpus_path, target, stop_words=['utc', 'new', 'other'], allowed_tags=['NN','NNS','JJ','JJS','JJR','NNP'], min_context_size = 4, max_nodes=10000, max_edges=1000000):
     
     node_freq = dict()
     edge_freq = dict()
@@ -19,7 +16,21 @@ def frequencies(corpus_path, target, stop_words=['utc', 'new'], allowed_tags=['N
     files = [corpus_path+'/'+f for f in os.listdir(corpus_path)]
     s_target = target.replace('_', ' ') #target word with spaces
     
-    for f in tqdm(files[:]):
+    i = 0
+    for f in files[:]:
+        
+        if i % int(len(files[:])/23) == 0:
+            file_ratio = i/len(files[:])
+            max_node_ratio = len(node_freq)/max_nodes
+            max_edge_ratio = len(edge_freq)/max_edges
+            ratios = [file_ratio, max_node_ratio, max_edge_ratio]
+            print('~ {}%\tNodes: {}\tEdges: {}.'.format(int((max(ratios))*100), len(node_freq), len(edge_freq)))
+        
+        if len(node_freq) > max_nodes:
+            return node_freq, edge_freq
+        
+        if len(edge_freq) > max_edges:
+            return node_freq, edge_freq
         
         with open(f, 'r') as lines:
@@ -58,7 +69,7 @@ def frequencies(corpus_path, target, stop_words=['utc', 'new'], allowed_tags=['N
                         else:
                             node_freq[token] = 1
                 
-                for edge in {(x,y) for x in tokens for y in tokens if x != y}:
+                for edge in {(x,y) for x in tokens for y in tokens if x < y}:
                     
                     if edge in edge_freq:
                         edge_freq[edge] += 1
@@ -69,7 +80,9 @@ def frequencies(corpus_path, target, stop_words=['utc', 'new'], allowed_tags=['N
                 
                 pass #print('Failed to decode:', f)
         
-    
+        
+        i += 1
+    
     return node_freq, edge_freq
@@ -77,12 +90,12 @@ def build_graph(node_freq, edge_freq, min_node_freq=10, min_edge_freq=5, max_wei
     
     G = nx.Graph()
     
-    for key, value in tqdm(node_freq.items()):
+    for key, value in node_freq.items():
         
         if value >= min_node_freq:
             G.add_node(key)
         
-    for key, value in tqdm(edge_freq.items()):
+    for key, value in edge_freq.items():
         
         if value < min_edge_freq:
             continue
@@ -97,7 +110,7 @@
     
     return G
 
 
-def root_hubs(graph, edge_freq, min_neighbors=6, theshold=0.8):
+def root_hubs(graph, edge_freq, min_neighbors=4, theshold=0.8):
     
     G = deepcopy(graph)
     V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True) # -1 to sort descending (...3 -> 2 -> 1...)
@@ -109,7 +122,7 @@ def root_hubs(graph, edge_freq, min_neighbors=6, theshold=0.8):
         
         if G.degree[v] >= min_neighbors:
         
-            mfn = sorted(G.adj[v], key=lambda key: edge_freq[v,key], reverse=True)[:min_neighbors] #mfn: most frequent neighbors
+            mfn = sorted(G.adj[v], key=lambda key: edge_freq[v,key] if v < key else edge_freq[key, v], reverse=True)[:min_neighbors] #mfn: most frequent neighbors
             
             if np.mean([G.edges[v,n]['weight'] for n in mfn]) < theshold:
@@ -131,13 +144,14 @@
 
 
 #Components algorithm from Véronis (2004), converts graph for target into a MST
 def components(graph, hubs, target):
+    G = deepcopy(graph)
     H = hubs
     t = target
     
-    G.add_node(t)
-    for h in H:
-        G.add_edge(t,h,weight=0)
+    #G.add_node(t)
+    #for h in H:
+        #G.add_edge(t,h,weight=0)
     
     T = nx.minimum_spanning_tree(G)
@@ -150,14 +164,14 @@ def components(graph, hubs, target):
 
 
 #Uses MST to disambiguate context, should ideally write to evaluator format
 def disambiguate(mst, hubs, contexts):
+    
     T = mst
     H = hubs
-    i = 1
     
     cluster = []
     
     for v in list(T.nodes):
-        weights = []
+        scores = []
         
         for h in H:
@@ -168,27 +182,29 @@
                 for i in range(1, len(path)):
                     total_weight += T[path[i-1]][path[i]]['weight']
                 
-                weights.append(1/(1+total_weight))
+                scores.append(1/(1+total_weight))
             
             except:
-                weights.append(0)
+                scores.append(0)
         
-        T.nodes[v]['s'] = np.array([w if w == max(weights) else 0 for w in weights])
+        T.nodes[v]['s'] = np.array([s if s == max(scores) else 0 for s in scores])
     
     for c in contexts:
         toks = [t.text for t in nlp(c)]
         vector = np.sum([T.nodes[t]['s'] if t in T.nodes else np.zeros(len(H)) for t in toks], axis=0)
         
+        idx = contexts.index(c) + 1
+        
         try:
-            cluster.append((np.argmax(vector), i))
+            cluster.append((np.argmax(vector), idx))
        except:
-            cluster.append((len(H), i))
-        
-        i += 1
+            cluster.append((len(H), idx))
     
     return cluster
+
+
 
 
 if __name__ == '__main__':
     data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE'
@@ -196,59 +212,60 @@ if __name__ == '__main__':
     corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI'
     
     results = dict()
+    
     with open(data_path+'/results.txt', 'r') as results_file:
+        
         for line in results_file.readlines()[1:]:
+            
             l = line.split('\t')
             id1, _ = l[0].split('.')
+            
             if id1 not in results:
                 results[id1]=list()
+                
             results[id1].append(" ".join(l[2:]))
     
     topics = dict()
+    
     with open(data_path+'/topics.txt', 'r') as topics_file:
+        
         for line in topics_file.readlines()[1:]:
+            
             l = line.split('\t')
             topics[l[0]] = l[1]
     
-    with open('/home/students/zimmermann/Courses/ws17/fsem/absinth/results/test.txt', 'w') as clusters:
+    for key, value in topics.items():
+        
+        target = value.strip()
+        print("[A] Processing '"+target+"'.\n")
         
-        clusters.write('subTopicID\tresultID\n')
+        f = open('/home/students/zimmermann/Courses/ws17/fsem/absinth/results/'+target+'.absinth', 'w')
+        f.write('subTopicID\tresultID\n')
         
-        for key, value in tqdm(topics.items()):
-            
-            target = value.strip()
-            print(target)
-            node_freq, edge_freq = frequencies(corpus_path, target)
-            G = build_graph(node_freq, edge_freq)
-            H = root_hubs(G, edge_freq)
-            T = components(G, H, target)
+        print('[A] Counting Tokens...')
+        node_freq, edge_freq = frequencies(corpus_path, target)
         
-            D = disambiguate(T, H, results[key])
-            print(D)
-            for d in D:
-                clusters.write(key+'.'+str(d[0])+'\t'+key+'.'+str(d[1])+'\n')
-    
-    #target = sys.argv[1]
-    
-    #node_freq, edge_freq = frequencies(corpus_path, target)
-    
-    #G = build_graph(node_freq, edge_freq) #initialises graph
-    
-    #H = root_hubs(G, edge_freq)
-    
-    #T = components(G, H, target)
-    
-    #print(node_freq)
+        print('\n[A] Building Graph.\n')
+        G = build_graph(node_freq, edge_freq)
+        
+        print('[A] Collecting Root Hubs...')
+        H = root_hubs(G, edge_freq)
+        print('Root Hubs:', H, '\n')
+        
+        print('[A] Building Minimum Spanning Tree.\n')
+        T = components(G, H, target)
     
-    #for node in deepcopy(T).nodes:
-        #if len(T.adj[node]) == 0:
-            #T.remove_node(node)
+        print('[A] Disambiguating Results...')
+        D = disambiguate(T, H, results[key])
+        print('Mapping:', D, '\n')
+        
+        print('[A] Writing to file '+target+'.absinth.\n\n')
+        for d in D:
+            
+            f.write(key+'.'+str(d[0])+'\t'+key+'.'+str(d[1])+'\n')
+            
+        f.close()
    
     #nx.draw(T, with_labels=True)
     #plt.show()
-    
-    
-    
-    #G.view()
-    #print(G.find_path('english', 'kennel'))
-    #G.draw() #draws graph
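
Note (not part of the patch): the switch from "x != y" to "x < y" in frequencies(), together with the reordered lookup in root_hubs(), amounts to storing each undirected co-occurrence edge once under a canonical sorted key. A minimal standalone sketch of that convention, with illustrative helper names (canonical, count_edge, get_freq) and tokens that do not come from the patch:

    # Sketch only: canonical edge keys for an undirected co-occurrence graph.
    # Each pair is stored once under its sorted key and looked up in either order.
    def canonical(u, v):
        return (u, v) if u < v else (v, u)

    edge_freq = {}

    def count_edge(u, v):
        key = canonical(u, v)
        edge_freq[key] = edge_freq.get(key, 0) + 1

    def get_freq(u, v):
        return edge_freq.get(canonical(u, v), 0)

    count_edge('english', 'kennel')
    count_edge('kennel', 'english')
    assert get_freq('english', 'kennel') == 2  # both orders hit the same counter

Storing each edge only once halves the size of edge_freq, which is why every consumer of the dictionary has to normalise the pair before indexing, as the patched root_hubs() does.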