Commit c24c8230 authored by Victor Zimmermann

Added max constraints on edges and nodes, as well as several print statements.

parent c91d862e
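The main change caps the size of the node and edge frequency dictionaries and prints progress while the corpus is read. A minimal, self-contained sketch of that pattern (hypothetical count_with_caps helper, not the committed code; the commit divides the file list into roughly 23 reporting intervals):

    from itertools import combinations

    def count_with_caps(token_lists, max_nodes=10000, max_edges=1000000):
        """Count node and edge frequencies, stopping early once a cap is hit."""
        node_freq, edge_freq = {}, {}
        report_every = max(1, len(token_lists) // 23)  # ~23 progress lines, as in the commit
        for i, tokens in enumerate(token_lists):
            if i % report_every == 0:
                done = max(i / max(1, len(token_lists)),
                           len(node_freq) / max_nodes,
                           len(edge_freq) / max_edges)
                print('~ {}%\tNodes: {}\tEdges: {}.'.format(
                    int(done * 100), len(node_freq), len(edge_freq)))
            if len(node_freq) > max_nodes or len(edge_freq) > max_edges:
                return node_freq, edge_freq        # caps reached: stop reading
            for tok in set(tokens):
                node_freq[tok] = node_freq.get(tok, 0) + 1
            for edge in combinations(sorted(set(tokens)), 2):
                edge_freq[edge] = edge_freq.get(edge, 0) + 1
        return node_freq, edge_freq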
import os                        # for reading files
import sys
+print('[A] Loading ' + sys.argv[0] + '.\n')
import spacy                     # for nlp
import networkx as nx            # for visualisation
import matplotlib.pyplot as plt  # for visualisation
from copy import deepcopy
import numpy as np               # for calculations

nlp = spacy.load('en')           # standard english nlp

try:
    from tqdm import tqdm        # for counting seconds
except:
    tqdm = lambda x: x


-def frequencies(corpus_path, target, stop_words=['utc', 'new'], allowed_tags=['NN','NNS','JJ','JJS','JJR','NNP'], min_context_size = 4):
+def frequencies(corpus_path, target, stop_words=['utc', 'new', 'other'], allowed_tags=['NN','NNS','JJ','JJS','JJR','NNP'], min_context_size = 4, max_nodes=10000, max_edges=1000000):

    node_freq = dict()
    edge_freq = dict()

@@ -19,7 +16,21 @@ def frequencies(corpus_path, target, stop_words=['utc', 'new'], allowed_tags=['N
    files = [corpus_path+'/'+f for f in os.listdir(corpus_path)]
    s_target = target.replace('_', ' ')  # target word with spaces

-    for f in tqdm(files[:]):
+    i = 0
+    for f in files[:]:
+
+        if i % int(len(files[:])/23) == 0:
+            file_ratio = i/len(files[:])
+            max_node_ratio = len(node_freq)/max_nodes
+            max_edge_ratio = len(edge_freq)/max_edges
+            ratios = [file_ratio, max_node_ratio, max_edge_ratio]
+            print('~ {}%\tNodes: {}\tEdges: {}.'.format(int((max(ratios))*100), len(node_freq), len(edge_freq)))
+
+        if len(node_freq) > max_nodes:
+            return node_freq, edge_freq
+
+        if len(edge_freq) > max_edges:
+            return node_freq, edge_freq

        with open(f, 'r') as lines:

@@ -58,7 +69,7 @@ def frequencies(corpus_path, target, stop_words=['utc', 'new'], allowed_tags=['N
                else:
                    node_freq[token] = 1

-            for edge in {(x,y) for x in tokens for y in tokens if x != y}:
+            for edge in {(x,y) for x in tokens for y in tokens if x < y}:

                if edge in edge_freq:
                    edge_freq[edge] += 1

@@ -69,7 +80,9 @@ def frequencies(corpus_path, target, stop_words=['utc', 'new'], allowed_tags=['N
            pass
            #print('Failed to decode:', f)

+        i += 1

    return node_freq, edge_freq
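Note the switch from `x != y` to `x < y` in the edge set: each undirected co-occurrence edge is now stored once, under the lexicographically ordered pair, instead of twice. Every later lookup must therefore normalise the key order, which is exactly what the changed mfn line in root_hubs below does. A tiny illustration (hypothetical helper, not part of the commit):

    def lookup_edge(edge_freq, a, b):
        # edges are keyed by the sorted pair, so order the arguments first
        return edge_freq[(a, b)] if a < b else edge_freq[(b, a)]

    edge_freq = {('cat', 'dog'): 3}               # stored once, under the sorted key
    assert lookup_edge(edge_freq, 'dog', 'cat') == 3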
@@ -77,12 +90,12 @@ def build_graph(node_freq, edge_freq, min_node_freq=10, min_edge_freq=5, max_wei
    G = nx.Graph()

-    for key, value in tqdm(node_freq.items()):
+    for key, value in node_freq.items():
        if value >= min_node_freq:
            G.add_node(key)

-    for key, value in tqdm(edge_freq.items()):
+    for key, value in edge_freq.items():
        if value < min_edge_freq:
            continue

@@ -97,7 +110,7 @@ def build_graph(node_freq, edge_freq, min_node_freq=10, min_edge_freq=5, max_wei
    return G


-def root_hubs(graph, edge_freq, min_neighbors=6, theshold=0.8):
+def root_hubs(graph, edge_freq, min_neighbors=4, theshold=0.8):

    G = deepcopy(graph)
    V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True)  # sort nodes by degree, descending

@@ -109,7 +122,7 @@ def root_hubs(graph, edge_freq, min_neighbors=6, theshold=0.8):
        if G.degree[v] >= min_neighbors:

-            mfn = sorted(G.adj[v], key=lambda key: edge_freq[v,key], reverse=True)[:min_neighbors]  # mfn: most frequent neighbors
+            mfn = sorted(G.adj[v], key=lambda key: edge_freq[v,key] if v < key else edge_freq[key, v], reverse=True)[:min_neighbors]  # mfn: most frequent neighbors

            if np.mean([G.edges[v,n]['weight'] for n in mfn]) < theshold:

@@ -131,13 +144,14 @@ def root_hubs(graph, edge_freq, min_neighbors=6, theshold=0.8):
# Components algorithm from Véronis (2004); converts the graph for the target into an MST
def components(graph, hubs, target):

    G = deepcopy(graph)
    H = hubs
    t = target

-    G.add_node(t)
-    for h in H:
-        G.add_edge(t,h,weight=0)
+    #G.add_node(t)
+    #for h in H:
+        #G.add_edge(t,h,weight=0)

    T = nx.minimum_spanning_tree(G)

@@ -150,14 +164,14 @@ def components(graph, hubs, target):
# Uses the MST to disambiguate contexts; should ideally write to the evaluator format
def disambiguate(mst, hubs, contexts):

    T = mst
    H = hubs
-    i = 1
    cluster = []

    for v in list(T.nodes):

-        weights = []
+        scores = []

        for h in H:

@@ -168,27 +182,29 @@ def disambiguate(mst, hubs, contexts):
                for i in range(1, len(path)):
                    total_weight += T[path[i-1]][path[i]]['weight']

-                weights.append(1/(1+total_weight))
+                scores.append(1/(1+total_weight))

            except:
-                weights.append(0)
+                scores.append(0)

-        T.nodes[v]['s'] = np.array([w if w == max(weights) else 0 for w in weights])
+        T.nodes[v]['s'] = np.array([s if s == max(scores) else 0 for s in scores])

    for c in contexts:

        toks = [t.text for t in nlp(c)]
        vector = np.sum([T.nodes[t]['s'] if t in T.nodes else np.zeros(len(H)) for t in toks], axis=0)
+        idx = contexts.index(c) + 1

        try:
-            cluster.append((np.argmax(vector), i))
+            cluster.append((np.argmax(vector), idx))
        except:
-            cluster.append((len(H), i))
-        i += 1
+            cluster.append((len(H), idx))

    return cluster
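The rename from weights to scores matches what the list holds: one closeness score per hub, 1/(1 + total_weight) of the MST path, zeroed for all but the best hub. The second change replaces the hand-kept counter i, which the inner `for i in range(1, len(path))` loop silently clobbered, with an index derived from the context list. A toy check of the scoring rule (made-up path weights):

    import numpy as np

    # A token two cheap edges from hub 0 and one expensive edge from hub 1:
    path_weights = [0.2 + 0.3, 4.0]               # summed MST edge weights to each hub
    scores = [1 / (1 + w) for w in path_weights]  # [0.667, 0.2]: nearer hub scores higher
    print(np.argmax(scores))                      # -> 0, so the token votes for hub 0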

if __name__ == '__main__':

    data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE'

@@ -196,59 +212,60 @@ if __name__ == '__main__':
    corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI'

    results = dict()

    with open(data_path+'/results.txt', 'r') as results_file:
        for line in results_file.readlines()[1:]:
            l = line.split('\t')
            id1, _ = l[0].split('.')
            if id1 not in results:
                results[id1] = list()
            results[id1].append(" ".join(l[2:]))

    topics = dict()

    with open(data_path+'/topics.txt', 'r') as topics_file:
        for line in topics_file.readlines()[1:]:
            l = line.split('\t')
            topics[l[0]] = l[1]
-    with open('/home/students/zimmermann/Courses/ws17/fsem/absinth/results/test.txt', 'w') as clusters:
-        clusters.write('subTopicID\tresultID\n')
-        for key, value in tqdm(topics.items()):
-            target = value.strip()
-            print(target)
-            node_freq, edge_freq = frequencies(corpus_path, target)
-            G = build_graph(node_freq, edge_freq)
-            H = root_hubs(G, edge_freq)
-            T = components(G, H, target)
-            D = disambiguate(T, H, results[key])
-            print(D)
-            for d in D:
-                clusters.write(key+'.'+str(d[0])+'\t'+key+'.'+str(d[1])+'\n')
-    #target = sys.argv[1]
-    #node_freq, edge_freq = frequencies(corpus_path, target)
-    #G = build_graph(node_freq, edge_freq) #initialises graph
-    #H = root_hubs(G, edge_freq)
-    #T = components(G, H, target)
-    #print(node_freq)
+    for key, value in topics.items():
+
+        target = value.strip()
+        print("[A] Processing '"+target+"'.\n")
+
+        f = open('/home/students/zimmermann/Courses/ws17/fsem/absinth/results/'+target+'.absinth', 'w')
+        f.write('subTopicID\tresultID\n')
+
+        print('[A] Counting Tokens...')
+        node_freq, edge_freq = frequencies(corpus_path, target)
+
+        print('\n[A] Building Graph.\n')
+        G = build_graph(node_freq, edge_freq)
+
+        print('[A] Collecting Root Hubs...')
+        H = root_hubs(G, edge_freq)
+        print('Root Hubs:', H, '\n')
+
+        print('[A] Building Minimum Spanning Tree.\n')
+        T = components(G, H, target)
+
+        #for node in deepcopy(T).nodes:
+            #if len(T.adj[node]) == 0:
+                #T.remove_node(node)
+
+        print('[A] Disambiguating Results...')
+        D = disambiguate(T, H, results[key])
+        print('Mapping:', D, '\n')
+
+        print('[A] Writing to file '+target+'.absinth.\n\n')
+        for d in D:
+            f.write(key+'.'+str(d[0])+'\t'+key+'.'+str(d[1])+'\n')
+        f.close()

    #nx.draw(T, with_labels=True)
    #plt.show()
    #G.view()
    #print(G.find_path('english', 'kennel'))
    #G.draw()  # draws graph
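Taken together, the rewritten main block runs the whole pipeline once per topic and writes one <target>.absinth file per query instead of a single test.txt. A condensed sketch of that per-topic flow (assumes corpus_path, target, key and results from the surrounding __main__ block):

    node_freq, edge_freq = frequencies(corpus_path, target)  # capped co-occurrence counts
    G = build_graph(node_freq, edge_freq)                     # filtered co-occurrence graph
    H = root_hubs(G, edge_freq)                               # candidate sense hubs
    T = components(G, H, target)                              # minimum spanning tree
    D = disambiguate(T, H, results[key])                      # (sense, result-index) pairs

    with open(target + '.absinth', 'w') as out:
        out.write('subTopicID\tresultID\n')
        for sense, idx in D:
            out.write('{}.{}\t{}.{}\n'.format(key, sense, key, idx))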