Commit c91d862e authored by Victor Zimmermann
Remade program with NetworkX; terrible results so far, but completely implemented

parent 6232f19b
import os  # for reading files
import sys
import spacy  # for NLP (tokenisation, POS tagging)
import networkx as nx  # for graph construction and graph algorithms
import matplotlib.pyplot as plt  # for visualisation
from copy import deepcopy
import numpy as np  # for calculations

nlp = spacy.load('en')  # standard English model
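# NB: assumes the English model is available under the shortcut 'en'
# (for spaCy 2.x: `python -m spacy download en`); adjust for other versions.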
try:
    from tqdm import tqdm  # progress bars for long loops
except ImportError:
    tqdm = lambda x: x  # fall back to a no-op wrapper
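
# Collects co-occurrence statistics for the target word: node_freq counts how
# often each context word occurs in a line containing the target, edge_freq
# counts how often two context words occur in the same such line.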
def frequencies(corpus_path, target, stop_words=['utc', 'new'],
                allowed_tags=['NN', 'NNS', 'JJ', 'JJS', 'JJR', 'NNP'],
                min_context_size=4):
    node_freq = dict()
    edge_freq = dict()
    files = [os.path.join(corpus_path, f) for f in os.listdir(corpus_path)]
    s_target = target.replace('_', ' ')  # target word with spaces

    for f in tqdm(files):
        with open(f, 'r') as lines:
            try:
                for line in lines:
                    line = line.lower()
                    if s_target in line:
                        tokens = set()
                        doc = nlp(line.replace(s_target, target))
                        if target in [t.text for t in doc]:
                            for tok in doc:
                                text = tok.text
                                tag = tok.tag_
                                if text == target:
                                    pass
                                elif text in stop_words:
                                    pass
                                elif tag in allowed_tags:
                                    tokens.add(text)
                            if len(tokens) >= min_context_size:
                                for token in tokens:
                                    if token in node_freq:
                                        node_freq[token] += 1
                                    else:
                                        node_freq[token] = 1
                                for edge in {(x, y) for x in tokens for y in tokens if x != y}:
                                    if edge in edge_freq:
                                        edge_freq[edge] += 1
                                    else:
                                        edge_freq[edge] = 1
            except UnicodeDecodeError:
                pass  # skip files that cannot be decoded

    return node_freq, edge_freq
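
# Builds the co-occurrence graph in the style of Véronis (2004): nodes are
# frequent context words, and an edge a-b is weighted 1 - max(p(a|b), p(b|a)),
# so strongly associated words are connected by low-weight edges.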
def build_graph(node_freq, edge_freq, min_node_freq=10, min_edge_freq=5, max_weight=0.9):
    G = nx.Graph()

    for key, value in tqdm(node_freq.items()):
        if value >= min_node_freq:
            G.add_node(key)

    for key, value in tqdm(edge_freq.items()):
        if value < min_edge_freq:
            continue
        if key[0] not in G.nodes or key[1] not in G.nodes:
            continue
        # w(a, b) = 1 - max(p(a|b), p(b|a))
        weight = 1 - max(value / node_freq[key[0]], value / node_freq[key[1]])
        if weight <= max_weight:
            G.add_edge(*key, weight=weight)

    return G
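
# Selects root hubs (candidate senses): high-degree nodes whose most frequent
# neighbours are strongly connected (mean edge weight below the threshold);
# each processed node and its neighbourhood are removed before the next pick.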
def root_hubs(graph, edge_freq, min_neighbors=6, threshold=0.8):
    G = deepcopy(graph)
    V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True)  # nodes by degree, descending
    H = list()

    while V:
        v = V[0]
        if G.degree[v] >= min_neighbors:
            mfn = sorted(G.adj[v], key=lambda key: edge_freq[v, key], reverse=True)[:min_neighbors]  # mfn: most frequent neighbors
            if np.mean([G.edges[v, n]['weight'] for n in mfn]) < threshold:
                H.append(v)
            for nbr in list(G.adj[v]):
                G.remove_node(nbr)
            G.remove_node(v)
            V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True)
        else:
            return H

    return H
#Components algorithm from Véronis (2004): links the target to its root hubs with zero-weight edges, then reduces the graph to a minimum spanning tree
def components(graph, hubs, target):
    G = deepcopy(graph)
    H = hubs
    t = target

    G.add_node(t)
    for h in H:
        G.add_edge(t, h, weight=0)

    T = nx.minimum_spanning_tree(G)

    # prune nodes left isolated by the spanning-tree construction
    for node in list(T.nodes):
        if len(T.adj[node]) == 0:
            T.remove_node(node)

    return T
#Uses the MST to assign each context to a root hub (sense); output should ideally be written in the evaluator format
def disambiguate(mst, hubs, contexts):
    T = mst
    H = hubs
    cluster = []

    # score vector per node: similarity to its closest hub, 0 for all others
    for v in list(T.nodes):
        weights = []
        for h in H:
            try:
                path = nx.shortest_path(T, v, h, 'weight')
                total_weight = sum(T[path[k - 1]][path[k]]['weight'] for k in range(1, len(path)))
                weights.append(1 / (1 + total_weight))
            except (nx.NetworkXNoPath, nx.NodeNotFound):
                weights.append(0)
        T.nodes[v]['s'] = np.array([w if w == max(weights) else 0 for w in weights])

    # assign each context to the hub with the highest summed score
    idx = 1  # result IDs start at 1 (a separate name so the path loop above cannot clobber it)
    for c in contexts:
        toks = [t.text for t in nlp(c)]
        vector = np.sum([T.nodes[t]['s'] if t in T.nodes else np.zeros(len(H)) for t in toks], axis=0)
        if np.max(vector) > 0:
            cluster.append((np.argmax(vector), idx))
        else:
            cluster.append((len(H), idx))  # no evidence: assign to an extra cluster
        idx += 1

    return cluster
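
# Pipeline: for every topic in the MORESQUE dataset, build the co-occurrence
# graph from the Wikipedia dump, find root hubs, compute the MST, and assign
# each search result to a hub; clusters are written in the WSI-Evaluator format.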
if __name__ == '__main__':
    data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE'
    #corpus_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/test'
    corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI'

    # results.txt: tab-separated; first column holds "<topicID>.<resultID>"
    results = dict()
    with open(data_path+'/results.txt', 'r') as results_file:
        for line in results_file.readlines()[1:]:
            l = line.split('\t')
            id1, _ = l[0].split('.')
            if id1 not in results:
                results[id1] = list()
            results[id1].append(" ".join(l[2:]))

    # topics.txt: tab-separated; maps topic IDs to target words
    topics = dict()
    with open(data_path+'/topics.txt', 'r') as topics_file:
        for line in topics_file.readlines()[1:]:
            l = line.split('\t')
            topics[l[0]] = l[1]

    with open('/home/students/zimmermann/Courses/ws17/fsem/absinth/results/test.txt', 'w') as clusters:
        clusters.write('subTopicID\tresultID\n')
        for key, value in tqdm(topics.items()):
            target = value.strip()
            print(target)
            node_freq, edge_freq = frequencies(corpus_path, target)
            G = build_graph(node_freq, edge_freq)
            H = root_hubs(G, edge_freq)
            T = components(G, H, target)
            D = disambiguate(T, H, results[key])
            print(D)
            for d in D:
                clusters.write(key+'.'+str(d[0])+'\t'+key+'.'+str(d[1])+'\n')
#target = sys.argv[1]
#node_freq, edge_freq = frequencies(corpus_path, target)
#G = build_graph(node_freq, edge_freq) #initialises graph
#H = root_hubs(G, edge_freq)
#T = components(G, H, target)
#print(node_freq)
#for node in deepcopy(T).nodes:
#    if len(T.adj[node]) == 0:
#        T.remove_node(node)
#nx.draw(T, with_labels=True)
#plt.show()
#G.view()
#print(G.find_path('english', 'kennel'))
#G.draw() #draws graph