Commit b5581eb6 authored by Victor Zimmermann

Redone output, shuffle corpus, disambiguation output is now a dict

parent 642789bc
@@ -8,13 +8,15 @@ import numpy as np # for calculations
 import config
 import spacy # for nlp
 from multiprocessing import Pool
+import random
 
 nlp = spacy.load('en') # standard english nlp
 
 #counts occurences of nodes and cooccurrences
 def frequencies(corpus_path, target):
     
+    random.seed(1)
     
     stop_words = set(stopwords.words('english') + config.stop_words)
     allowed_tags = config.allowed_tags
     min_context_size = config.min_context_size
@@ -24,13 +26,15 @@ def frequencies(corpus_path, target):
     node_freq = dict() #counts (potential) nodes
     edge_freq = dict() #counts (potential) edges
     
-    files = [corpus_path + f for f in os.listdir(corpus_path)] #file names of corpus files
     s_target = target.replace('_', ' ') #target word with spaces
+    files = [corpus_path + f for f in os.listdir(corpus_path)] #file names of corpus files
+    random.shuffle(files)
     
     i = 0 #for update print statements
     for f in files:
         
-        if i % int(len(files)/10) == 0: #prints update after every 10th of the corpus is parsed
+        if i % int(len(files)/11) == 0: #prints update after every 10th of the corpus is parsed
             
             file_ratio = i/len(files[:])
             max_node_ratio = len(node_freq)/max_nodes
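The added random.seed(1) / random.shuffle(files) pair randomises the order in which corpus files are read while keeping that order reproducible, since the shuffle is seeded with a constant. A minimal sketch of that behaviour, with placeholder file names (presumably the point is that the subset of files seen before the max_nodes/max_edges cut-off is a random sample rather than whatever os.listdir happens to return):

import random

files = ['corpus/a.txt', 'corpus/b.txt', 'corpus/c.txt']  # placeholder names

random.seed(1)           # constant seed -> the same pseudo-random order on every run
random.shuffle(files)    # shuffles the list in place
print(files)             # identical output on repeated runs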
@@ -41,7 +45,7 @@ def frequencies(corpus_path, target):
             #uses the ratio closest to 100%.
             percentage = int((max(ratios))*100)
             
-            print('[a] ~{:02d}%\tNodes: {}\tEdges: {}.'.format(percentage, len(node_freq), len(edge_freq)), target)
+            print('[a] ~{:02d}%\tNodes: {}\tEdges: {}.'.format(percentage, len(node_freq), len(edge_freq))+'\t('+target+')')
         
         #checks maximum node values
         if len(node_freq) > max_nodes:
@@ -239,63 +243,61 @@ def disambiguate(mst, hubs, contexts, target=""):
     C = [c.lower().strip().replace(target, '') for c in contexts] #cleaned up contexts
     score_dict = dict() #memoisation for scores
-    result = list() #output of function
+    mapping_dict = {topic:[] for topic in range(1,len(H)+1)} #output of function
+    
+    #if no sense is found for a target word, we should assume that there only is one sense
+    if len(H) == 0:
+        return {0:[i for i in range(1, len(C)+1)]}
     
     for c in C:
         
         idx = C.index(c) + 1 #index based on position in list
         
-        #if no sense is found for a target word, we should assume that there only is one sense
-        if len(H) == 0:
-            result.append((1, idx, 0))
-        
-        else:
-            doc = nlp(c) #parsed context
-            texts = [tok.text for tok in doc] #tokens
-            scores = np.zeros(len(H)) #initialise with zeros for every sense
-            
-            for text in texts:
-                if text in T.nodes: #if word wasn't filtered out
-                    new_scores = list() #scores to be added to total scores
-                    
-                    for h in H: #for each hub
-                        if (text, h) in score_dict: #memoisation
-                            new_scores.append(score_dict[(text,h)])
-                        else:
-                            new_score = score(T, text, h)
-                            new_scores.append(new_score)
-                            score_dict[(text,h)] = new_score #memoisation
-                    
-                    scores = scores + np.array(new_scores)
-                else:
-                    pass
-            
-            #if the disambiguator could not detect a sense, it should return a singleton, ie. nothing
-            if np.max(scores) == 0:
-                pass
-            else:
-                #applies sense with the highest score to context
-                max_score = np.max(scores)
-                argmax_score = np.argmax(scores)
-                
-                #clusters begin at 1
-                result.append((argmax_score + 1, idx))
-    
-    return result
+        doc = nlp(c) #parsed context
+        texts = [tok.text for tok in doc] #tokens
+        scores = np.zeros(len(H)) #initialise with zeros for every sense
+        
+        for text in texts:
+            if text in T.nodes: #if word wasn't filtered out
+                new_scores = list() #scores to be added to total scores
+                
+                for h in H: #for each hub
+                    if (text, h) in score_dict: #memoisation
+                        new_scores.append(score_dict[(text,h)])
+                    else:
+                        new_score = score(T, text, h)
+                        new_scores.append(new_score)
+                        score_dict[(text,h)] = new_score #memoisation
+                
+                scores = scores + np.array(new_scores)
+            else:
+                pass
+        
+        #if the disambiguator could not detect a sense, it should return a singleton, ie. nothing
+        if np.max(scores) == 0:
+            pass
+        else:
+            #applies sense with the highest score to context
+            max_score = np.max(scores)
+            argmax_score = np.argmax(scores)
+            
+            #clusters begin at 1
+            mapping_dict[argmax_score + 1].append(idx)
+    
+    return mapping_dict
 # our main function, here the main stepps for word sense induction are called
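With this commit the return value of disambiguate() changes shape: instead of appending (sense, context) tuples to a flat list, it now returns a dict that maps each cluster/sense number to the list of context indices assigned to it, with 0 used as a catch-all pseudo-sense when no root hubs were found. A small illustration of the two shapes with made-up toy values (not taken from any real run):

# old output: a flat list of (sense, context_index) tuples
old_result = [(1, 2), (2, 1), (1, 5)]

# new output: cluster number -> list of context indices
new_result = {1: [2, 5], 2: [1]}

# fallback when no root hubs were found (len(H) == 0): one pseudo-sense 0 covering all contexts
fallback = {0: [1, 2, 3, 4, 5]}

In both versions, contexts whose score is zero for every hub are silently skipped rather than assigned to a cluster.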
@@ -326,16 +328,16 @@ def WSI(topic_id, topic_name, results):
     f.write('subTopicID\tresultID\n')
     
     #counts occurences of single words, as well as cooccurrences, saves it in dictionary
-    print('[a]', 'Counting nodes and edges.', old_target)
+    print('[a]', 'Counting nodes and edges.\t('+old_target+')')
     node_freq, edge_freq = frequencies(corpus_path, target)
     out_buffer += '[A] Nodes: {}\tEdges: {}\n'.format(str(len(node_freq)), str(len(edge_freq)))
     
     #builds graph from these dictionaries, also applies multiple filters
-    print('[a]', 'Building graph.', old_target)
+    print('[a]', 'Building graph.\t('+old_target+')')
     G = build_graph(node_freq, edge_freq)
     
     #finds root hubs (senses) within the graph + more filters for these
-    print('[a]', 'Collecting root hubs.', old_target)
+    print('[a]', 'Collecting root hubs.\t('+old_target+')')
     H = root_hubs(G, edge_freq)
     out_buffer += '[A] Root hubs:\n'
@@ -344,26 +346,29 @@ def WSI(topic_id, topic_name, results):
     for h in H:
         
         mfn = sorted(G.adj[h], key=lambda x: edge_freq[h,x] if h < x else edge_freq[x, h], reverse=True)[:6]
-        out_buffer += (' {}. {}: {}\n'.format(i, h, mfn))
+        out_buffer += (' {}. {}: {}\n'.format(i, h, ', '.join(mfn)))
         i += 1
     
     #performs minimum_spanning_tree algorithm on graph
-    print('[a]', 'Building minimum spanning tree.', old_target)
+    print('[a]', 'Building minimum spanning tree.\t('+old_target+')')
     T = components(G, H, target)
     
     #matches senses to clusters
-    print('[a]', 'Disambiguating results.', old_target)
+    print('[a]', 'Disambiguating results.\t('+old_target+')')
     D = disambiguate(T, H, results[topic_id], target)
-    out_buffer += ('[A] Mapping: '+ str(D) + '\n')
+    out_buffer += ('[A] Mapping: \n')
+    for cluster,results in D.items():
+        out_buffer += (' {}. : {}\n'.format(cluster, ', '.join([str(r) for r in results])))
     
     #prints buffer
-    print('[a]', 'Writing to file.', old_target)
+    print('[a]', 'Writing to file.\t('+old_target+')')
     print(out_buffer)
     
     #writes clustering to file
-    for d in D:
-        f.write(topic_id+'.'+str(d[0])+'\t'+topic_id+'.'+str(d[1])+'\n')
+    for cluster,results in D.items():
+        for result in results:
+            f.write(topic_id+'.'+str(cluster)+'\t'+topic_id+'.'+str(result)+'\n')
     
     f.close()
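The rewritten write loop in WSI() flattens the mapping dict into one tab-separated line per (cluster, result) pair, under the subTopicID/resultID header written earlier in the function. A sketch of the resulting file contents for a toy mapping; the topic id and output file name below are illustrative, not taken from the actual run:

D = {1: [2, 5], 2: [1, 3]}                 # toy mapping as returned by disambiguate()
topic_id = '1'                             # illustrative topic id

with open('example.absinth', 'w') as f:    # hypothetical output file name
    f.write('subTopicID\tresultID\n')
    for cluster, results in D.items():
        for result in results:
            f.write(topic_id + '.' + str(cluster) + '\t' + topic_id + '.' + str(result) + '\n')

# resulting file contents:
# subTopicID    resultID
# 1.1           1.2
# 1.1           1.5
# 1.2           1.1
# 1.2           1.3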
@@ -394,13 +399,15 @@ if __name__ == '__main__':
     # topics.txt is a list of target words
     topics = dict()
+    processed_topics = [f.replace('.absinth', '') for f in os.listdir(config.output)]
     
     with open(data_path+'topics.txt', 'r') as topics_file:
         
         for line in topics_file.readlines()[1:]:
             
             l = line.split('\t')
-            topics[l[0]] = l[1]
+            if l[1].strip() not in processed_topics:
+                topics[l[0]] = l[1]
     
     # multiprocessing
     with Pool(4) as pool: