From c91d862e8757dbf88d5bc7b2a937e2448df4bd77 Mon Sep 17 00:00:00 2001
From: Victor Zimmermann <zimmermann@cl.uni-heidelberg.de>
Date: Mon, 5 Mar 2018 21:45:28 +0100
Subject: [PATCH] Remade program with NetworkX, terrible results so far, but
 completely implemented

---
 code/absinth_nx.py | 254 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 254 insertions(+)
 create mode 100644 code/absinth_nx.py

diff --git a/code/absinth_nx.py b/code/absinth_nx.py
new file mode 100644
index 0000000..5e8f724
--- /dev/null
+++ b/code/absinth_nx.py
@@ -0,0 +1,254 @@
+import os # for reading files
+import sys
+import spacy # for nlp
+import networkx as nx # for graph representation
+import matplotlib.pyplot as plt # for visualisation
+from copy import deepcopy
+import numpy as np # for calculations
+
+nlp = spacy.load('en') # standard English model
+
+try:
+    from tqdm import tqdm # progress bars
+except ImportError:
+    tqdm = lambda x: x # fall back to a no-op if tqdm is not installed
+
+
+def frequencies(corpus_path, target, stop_words=['utc', 'new'], allowed_tags=['NN','NNS','JJ','JJS','JJR','NNP'], min_context_size=4):
+
+    node_freq = dict()
+    edge_freq = dict()
+
+    files = [corpus_path+'/'+f for f in os.listdir(corpus_path)]
+    s_target = target.replace('_', ' ') # target string with spaces instead of underscores
+
+    for f in tqdm(files[:]):
+
+        with open(f, 'r') as lines:
+
+            try:
+
+                for line in lines:
+
+                    line = line.lower()
+
+                    if s_target in line:
+
+                        tokens = set()
+                        doc = nlp(line.replace(s_target, target))
+
+                        if target in [t.text for t in doc]:
+
+                            for tok in doc:
+
+                                text = tok.text
+                                tag = tok.tag_
+
+                                if text == target:
+                                    pass
+
+                                elif text in stop_words:
+                                    pass
+
+                                elif tag in allowed_tags:
+                                    tokens.add(tok.text)
+
+                            if len(tokens) >= min_context_size:
+
+                                for token in tokens:
+
+                                    if token in node_freq:
+                                        node_freq[token] += 1
+                                    else:
+                                        node_freq[token] = 1
+
+                                for edge in {(x,y) for x in tokens for y in tokens if x != y}:
+
+                                    if edge in edge_freq:
+                                        edge_freq[edge] += 1
+                                    else:
+                                        edge_freq[edge] = 1
+
+            except UnicodeDecodeError:
+
+                pass
+                #print('Failed to decode:', f)
+
+    return node_freq, edge_freq
+
+
+def build_graph(node_freq, edge_freq, min_node_freq=10, min_edge_freq=5, max_weight=0.9):
+
+    G = nx.Graph()
+
+    for key, value in tqdm(node_freq.items()):
+
+        if value >= min_node_freq:
+            G.add_node(key)
+
+    for key, value in tqdm(edge_freq.items()):
+
+        if value < min_edge_freq:
+            continue
+
+        if key[0] not in G.nodes or key[1] not in G.nodes:
+            continue
+
+        weight = 1 - max(edge_freq[key]/node_freq[key[0]], edge_freq[key]/node_freq[key[1]])
+        if weight <= max_weight:
+            G.add_edge(*key, weight=weight)
+
+    return G
+
+
+def root_hubs(graph, edge_freq, min_neighbors=6, threshold=0.8):
+
+    G = deepcopy(graph)
+    V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True) # nodes sorted by degree, descending
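+
+    # Root hub selection in the spirit of Véronis (2004): repeatedly take the
+    # highest-degree remaining node, accept it as a hub if its most frequent
+    # neighbours are strongly associated with it (mean edge weight below the
+    # threshold), then remove the hub and its neighbourhood and re-rank the rest.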
+    H = list()
+
+    while V:
+
+        v = V[0]
+
+        if G.degree[v] >= min_neighbors:
+
+            mfn = sorted(G.adj[v], key=lambda key: edge_freq[v,key], reverse=True)[:min_neighbors] # mfn: most frequent neighbors
+
+            if np.mean([G.edges[v,n]['weight'] for n in mfn]) < threshold:
+
+                H.append(v)
+
+                for nbr in deepcopy(G).adj[v]:
+
+                    G.remove_node(nbr)
+
+            G.remove_node(v)
+
+            V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True)
+
+        else:
+
+            return H
+
+    return H
+
+
+#Components algorithm from Véronis (2004), converts graph for target into a MST
+def components(graph, hubs, target):
+
+    G = deepcopy(graph)
+    H = hubs
+    t = target
+
+    G.add_node(t)
+    for h in H:
+        G.add_edge(t,h,weight=0)
+
+    T = nx.minimum_spanning_tree(G)
+
+    # prune isolated nodes from the spanning tree
+    for node in deepcopy(T).nodes:
+        if len(T.adj[node]) == 0:
+            T.remove_node(node)
+
+    return T
+
+
+#Uses MST to disambiguate context, should ideally write to evaluator format
+def disambiguate(mst, hubs, contexts):
+
+    T = mst
+    H = hubs
+    i = 1
+    cluster = []
+
+    # score every node in the tree by its proximity to each root hub
+    for v in list(T.nodes):
+
+        weights = []
+
+        for h in H:
+
+            try:
+                path = nx.shortest_path(T,v,h,'weight')
+                total_weight = 0
+
+                for j in range(1, len(path)):
+                    total_weight += T[path[j-1]][path[j]]['weight']
+
+                weights.append(1/(1+total_weight))
+
+            except nx.NetworkXNoPath:
+                weights.append(0)
+
+        T.nodes[v]['s'] = np.array([w if w == max(weights) else 0 for w in weights])
+
+    # assign each context to the hub with the highest summed node score
+    for c in contexts:
+
+        toks = [t.text for t in nlp(c)]
+        vector = np.sum([T.nodes[t]['s'] if t in T.nodes else np.zeros(len(H)) for t in toks], axis=0)
+
+        try:
+            cluster.append((np.argmax(vector), i))
+        except ValueError:
+            cluster.append((len(H), i))
+
+        i += 1
+
+    return cluster
+
+
+if __name__ == '__main__':
+
+    data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE'
+    #corpus_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/test'
+    corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI'
+
+    results = dict()
+    with open(data_path+'/results.txt', 'r') as results_file:
+        for line in results_file.readlines()[1:]:
+            l = line.split('\t')
+            id1, _ = l[0].split('.')
+            if id1 not in results:
+                results[id1] = list()
+            results[id1].append(" ".join(l[2:]))
+
+    topics = dict()
+    with open(data_path+'/topics.txt', 'r') as topics_file:
+        for line in topics_file.readlines()[1:]:
+            l = line.split('\t')
+            topics[l[0]] = l[1]
+
+    with open('/home/students/zimmermann/Courses/ws17/fsem/absinth/results/test.txt', 'w') as clusters:
+
+        clusters.write('subTopicID\tresultID\n')
+
+        for key, value in tqdm(topics.items()):
+
+            target = value.strip()
+            print(target)
+
+            node_freq, edge_freq = frequencies(corpus_path, target)
+            G = build_graph(node_freq, edge_freq)
+            H = root_hubs(G, edge_freq)
+            T = components(G, H, target)
+
+            D = disambiguate(T, H, results[key])
+            print(D)
+
+            for d in D:
+                clusters.write(key+'.'+str(d[0])+'\t'+key+'.'+str(d[1])+'\n')
+
+    #target = sys.argv[1]
+
+    #node_freq, edge_freq = frequencies(corpus_path, target)
+
+    #G = build_graph(node_freq, edge_freq) #initialises graph
+
+    #H = root_hubs(G, edge_freq)
+
+    #T = components(G, H, target)
+
+    #print(node_freq)
+
+    #for node in deepcopy(T).nodes:
+        #if len(T.adj[node]) == 0:
+            #T.remove_node(node)
+
+    #nx.draw(T, with_labels=True)
+    #plt.show()
+
+    #G.view()
+    #print(G.find_path('english', 'kennel'))
+    #G.draw() #draws graph
--
GitLab