Commit 23854a3e authored by Victor Zimmermann

Restructure disambiguate(), but not yet happy with results.

parent 5273a7b0
@@ -10,7 +10,7 @@ import numpy as np # for calculations
 
 nlp = spacy.load('en') # standard english nlp
 
-def frequencies(corpus_path, target, stop_words=[], allowed_tags=['NN','NNS','JJ','JJS','JJR','NNP','VBZ','VBG'], min_context_size = 4, max_nodes=100000, max_edges=10000000):
+def frequencies(corpus_path, target, stop_words=[], allowed_tags=['NN','NNS','JJ','JJS','JJR','NNP','VBZ','VBG'], min_context_size = 2, max_nodes=100000, max_edges=10000000):
 
     node_freq = dict()
     edge_freq = dict()
@@ -19,9 +19,9 @@ def frequencies(corpus_path, target, stop_words=[], allowed_tags=['NN','NNS','JJ
     s_target = target.replace('_', ' ') #target word with spaces
 
     i = 0
-    for f in files[:]:
+    for f in files:
 
-        if i % int(len(files[:])/23) == 0:
+        if i % int(len(files)/23) == 0:
             file_ratio = i/len(files[:])
             max_node_ratio = len(node_freq)/max_nodes
             max_edge_ratio = len(edge_freq)/max_edges
@@ -166,54 +166,79 @@ def components(graph, hubs, target):
 
     return T
 
 #Uses MST to disambiguate context, should ideally write to evaluator format
+def score(graph, from_node, to_node):
+
+    if nx.has_path(graph, from_node, to_node):
+
+        path = nx.shortest_path(graph, from_node, to_node, 'weight')
+        total_weight = 0
+
+        for i in range(1, len(path)):
+            sub_from, sub_to = path[i-1], path[i]
+            total_weight += graph[sub_from][sub_to]['weight']
+
+        return 1/(1+total_weight)
+
+    else:
+        return 0
+
 def disambiguate(mst, hubs, contexts):
 
     T = mst
     H = hubs
     C = [c.lower().strip() for c in contexts]
 
-    backup_cluster = len(H)
-    result = []
-
-    for v in list(T.nodes):
-
-        scores = []
-
-        for h in H:
-
-            if nx.has_path(T,v,h):
-
-                path = nx.shortest_path(T,v,h,'weight')
-                total_weight = 0
-
-                for i in range(1, len(path)):
-                    total_weight += T[path[i-1]][path[i]]['weight']
-
-                scores.append(1/(1+total_weight))
-
-            else:
-                scores.append(0)
-
-        T.nodes[v]['s'] = np.array([s if s == max(scores) else 0 for s in scores])
-
-    for c in C:
-
-        toks = [t.text for t in nlp(c)]
-        vector = np.sum([T.nodes[t]['s'] if t in T.nodes else np.zeros(len(H)) for t in toks], axis=0)
-        idx = C.index(c) + 1
-
-        if len(vector) == 0: #if no senses are found -> all in one
-            result.append((0, idx))
-        elif max(vector) == 0: #if no sense matches -> singletons
-            pass
-        else:
-            result.append((np.argmax(vector), idx))
-
-    return result
+    score_dict = dict()
+    result = list()
+
+    for c in C:
+
+        idx = C.index(c) + 1
+
+        #if no sense is found for a target word, we should assume that there only is one sense
+        if len(H) == 0:
+            result.append((0, idx))
+
+        else:
+
+            doc = nlp(c)
+            texts = [tok.text for tok in doc]
+
+            scores = np.zeros(len(H)) #initialise with zeros for every sense
+
+            for text in texts:
+
+                if text in T.nodes:
+
+                    new_scores = list()
+
+                    for h in H:
+
+                        if (text, h) in score_dict:
+                            new_scores.append(score_dict[(text,h)])
+                        else:
+                            new_score = score(T, text, h)
+                            new_scores.append(new_score)
+                            score_dict[(text,h)] = new_score
+
+                    scores = np.add(scores, new_scores)
+
+                else:
+                    pass
+
+            #if the disambiguator could not detect a sense, it should return a singleton, ie. nothing
+            if np.max(scores) == 0:
+                pass
+            else:
+                result.append((np.argmax(scores), idx))
+
+    return result
 
 def backup(contexts):
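For reference, a minimal standalone sketch (not part of the commit) of how the new path-based score behaves: it returns 1/(1 + total weight of the lightest path) between a token and a hub, and 0 when no path exists. The graph, node names, and edge weights below are invented purely for illustration.

import networkx as nx

# standalone copy of the new helper, for illustration only
def score(graph, from_node, to_node):
    if nx.has_path(graph, from_node, to_node):
        path = nx.shortest_path(graph, from_node, to_node, 'weight')
        total_weight = sum(graph[path[i-1]][path[i]]['weight']
                           for i in range(1, len(path)))
        return 1/(1+total_weight)
    else:
        return 0

# toy co-occurrence graph with made-up nodes and weights
G = nx.Graph()
G.add_edge('bank', 'river', weight=0.2)
G.add_edge('bank', 'money', weight=0.7)
G.add_edge('river', 'water', weight=0.1)
G.add_edge('loan', 'credit', weight=0.4)   # separate component

print(score(G, 'water', 'bank'))   # 1/(1+0.3) ~ 0.77, via 'river'
print(score(G, 'money', 'water'))  # 1/(1+1.0) = 0.5
print(score(G, 'money', 'loan'))   # no path between components -> 0

In the restructured disambiguate(), these per-token scores are summed over all tokens of a context, the context is assigned to the hub with the highest total, and (token, hub) pairs are cached in score_dict so repeated tokens are not re-scored.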
@@ -225,10 +250,10 @@ if __name__ == '__main__':
 
     data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE/'
     #corpus_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/test'
-    corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI/'
+    corpus_path = '/proj/absinth/wikipedia_reduced/'
     results_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/clustering/'
 
-    stop = set(stopwords.words('english') + ['utc', 'new', 'other', 'talk', 'wikipedia', 'article', 'topic', 'page', 'editors', 'encyclopedia', 'free'])
+    stop = set(stopwords.words('english') + ['utc', "'s", 'new', 'other', 'talk', 'wikipedia', 'article', 'topic', 'page', 'editors', 'encyclopedia', 'free'])
 
     results = dict()
@@ -251,7 +276,7 @@ if __name__ == '__main__':
 
     already_processed = [f.replace('.absinth', '') for f in os.listdir(results_path)]
 
-    for line in topics_file.readlines()[1:]:
+    for line in topics_file.readlines()[1:5]:
 
         l = line.split('\t')
         if l[1] not in already_processed:
@@ -279,9 +304,9 @@ if __name__ == '__main__':
 
             H = root_hubs(G, edge_freq)
 
             for h in H:
                 mfn = sorted(G.adj[h], key=lambda key: edge_freq[h,key] if h < key else edge_freq[key, h], reverse=True)[:6]
-                print(' {}: {}\n'.format(h, mfn))
+                print(' {}: {}'.format(h, mfn))
 
-            print('[A] Building Minimum Spanning Tree.\n')
+            print('\n[A] Building Minimum Spanning Tree.\n')
             T = components(G, H, target)
 
             print('[A] Disambiguating Results...')
@@ -294,6 +319,3 @@ if __name__ == '__main__':
 
             f.write(key+'.'+str(d[0])+'\t'+key+'.'+str(d[1])+'\n')
         f.close()
-
-    #nx.draw(T, with_labels=True)
-    #plt.show()