Commit a2e527eb authored by Victor Zimmermann

Commenting, restructuring

parent 6ba4aad4
@@ -415,6 +415,59 @@ def score(graph, component, root_hub_list):
     return score_array
 
 
+def induce(topic_id, topic_name, result_list):
+    """Builds the cooccurrence graph for one topic and induces its word senses."""
+
+    statistics = dict()
+
+    #removes trailing newlines
+    old_target_string = topic_name.strip() #original target
+
+    if old_target_string.strip() in [f.replace('.absinth', '') for f in os.listdir(config.output)]:
+        return None
+
+    statistics['target'] = old_target_string
+
+    #in topics longer than two words, the leading 'the' can generally be removed without changing the sense
+    if old_target_string[:4] == 'the_' and old_target_string.count('_') >= 2:
+        target_string = old_target_string[4:]
+    else:
+        target_string = old_target_string
+
+    #counts occurrences of single words as well as cooccurrences, saves them in dictionaries
+    print('[a]', 'Counting nodes and edges.\t('+old_target_string+')')
+    node_freq_dict, edge_freq_dict = frequencies(target_string, result_list[topic_id])
+
+    #builds a graph from these dictionaries, also applies multiple filters
+    print('[a]', 'Building graph.\t('+old_target_string+')')
+    G = build_graph(node_freq_dict, edge_freq_dict)
+
+    statistics['node count'] = len(G.nodes)
+    statistics['edge count'] = len(G.edges)
+
+    #finds root hubs (senses) within the graph + more filters for these
+    print('[a]', 'Collecting root hubs.\t('+old_target_string+')')
+    H = root_hubs(G, edge_freq_dict)
+
+    #adds the sense inventory to the statistics with some common neighbors for context
+    statistics['hubs'] = dict()
+
+    for h in H:
+        mfn = sorted(G.adj[h], key=lambda x: edge_freq_dict[h, x] if h < x else edge_freq_dict[x, h], reverse=True)[:6]
+        statistics['hubs'][h] = mfn
+
+    #performs the minimum spanning tree algorithm on the graph
+    print('[a]', 'Building minimum spanning tree.\t('+old_target_string+')')
+    T = components(G, H, target_string)
+
+    return T, H, statistics
+
+
 def disambiguate(minimum_spanning_tree, root_hub_list,
                  context_list, target_string):
     """Matches contexts to senses.
@@ -493,82 +546,46 @@ def disambiguate(minimum_spanning_tree, root_hub_list,
     return mapping_dict
 
 
-# our main function, where the main steps for word sense induction are called
-def word_sense_induction(topic_id, topic_name, result_list):
-
-    #buffer for useful information
-    out_buffer = '\n'
-
-    #path for the output directory
-    output_path = config.output
-
-    #removes trailing newlines
-    old_target_string = topic_name.strip() #original target
-
-    if old_target_string.strip() in [f.replace('.absinth', '') for f in os.listdir(config.output)]:
-        return None
-
-    out_buffer += ("[A] Word sense induction for '"+old_target_string+"':\n")
-
-    #in topics longer than two words, the leading 'the' can generally be removed without changing the sense
-    if old_target_string[:4] == 'the_' and old_target_string.count('_') >= 2:
-        target_string = old_target_string[4:]
-    else:
-        target_string = old_target_string
-
-    #writes the headline for the output file
-    f = open(output_path+old_target_string+'.absinth', 'w')
-    f.write('subTopicID\tresultID\n')
-
-    #counts occurrences of single words as well as cooccurrences, saves them in dictionaries
-    print('[a]', 'Counting nodes and edges.\t('+old_target_string+')')
-    node_freq_dict, edge_freq_dict = frequencies(target_string, result_list[topic_id])
-
-    #builds a graph from these dictionaries, also applies multiple filters
-    print('[a]', 'Building graph.\t('+old_target_string+')')
-    G = build_graph(node_freq_dict, edge_freq_dict)
-
-    out_buffer += '[A] Nodes: {}\tEdges: {}\n'.format(str(len(G.nodes)), str(len(G.edges)))
-
-    #finds root hubs (senses) within the graph + more filters for these
-    print('[a]', 'Collecting root hubs.\t('+old_target_string+')')
-    H = root_hubs(G, edge_freq_dict)
-
-    out_buffer += '[A] Root hubs:\n'
+def main(topic_id, topic_name, result_list):
+    """Runs induction and disambiguation for one topic and writes the clustering."""
 
-    #adds the sense inventory to the buffer with some common neighbors for context
-    i = 1 #sense index
-    for h in H:
-        mfn = sorted(G.adj[h], key=lambda x: edge_freq_dict[h, x] if h < x else edge_freq_dict[x, h], reverse=True)[:6]
-        out_buffer += ('  {}. {}: {}\n'.format(i, h, ', '.join(mfn)))
-        i += 1
+    print('[a]', 'Inducing word senses for {}.'.format(topic_name))
+    T, H, statistics = induce(topic_id, topic_name, result_list)
 
-    #performs the minimum spanning tree algorithm on the graph
-    print('[a]', 'Building minimum spanning tree.\t('+old_target_string+')')
-    T = components(G, H, target_string)
 
     #matches senses to clusters
     print('[a]', 'Disambiguating result_list.\t('+old_target_string+')')
     D = disambiguate(T, H, result_list[topic_id], target_string)
 
-    out_buffer += ('[A] Mapping: \n')
+    #collects statistics from the result
+    cluster_count = 0
+    cluster_length_list = list()
 
     for cluster, result_list in D.items():
-        out_buffer += ('  {}. : {}\n'.format(cluster, ', '.join([str(r) for r in result_list])))
+        cluster_length = len(result_list)
+        if cluster_length != 0:
+            cluster_count += 1
+            cluster_length_list.append(cluster_length)
+
+    statistics['mean_cluster_length'] = np.mean(cluster_length_list)
+    statistics['cluster_count'] = cluster_count
 
-    #prints buffer
     print('[a]', 'Writing to file.\t('+old_target_string+')')
-    print(out_buffer)
 
+    f = open(output_path+old_target_string+'.absinth', 'w')
+    f.write('subTopicID\tresultID\n')
 
     #writes clustering to file
     for cluster, result_list in D.items():
         for result in result_list:
             f.write(topic_id+'.'+str(cluster)+'\t'+topic_id+'.'+str(result)+'\n')
 
     f.close()
 
 
 def read_dataset(data_path):
     # results.txt includes the queries for a given target word
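For reference, disambiguate returns a mapping from sense numbers to lists of result IDs, which main serializes as tab-separated subTopicID/resultID pairs prefixed with the topic ID. A self-contained sketch of that serialization step, with a hypothetical topic ID, mapping, and file name:

# Hypothetical topic ID and sense mapping as returned by disambiguate().
topic_id = '1'
D = {1: [3, 7], 2: [5]}  # sense number -> list of result IDs

with open('example.absinth', 'w') as f:
    f.write('subTopicID\tresultID\n')
    for cluster, result_id_list in D.items():
        for result in result_id_list:
            # e.g. '1.1\t1.3' assigns result 3 of topic 1 to its first sense
            f.write(topic_id + '.' + str(cluster) + '\t' + topic_id + '.' + str(result) + '\n')

The with block here is for illustration and guarantees the file is closed; the committed code opens the file explicitly and calls f.close() at the end.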
@@ -600,8 +617,9 @@ def read_dataset(data_path):
     return results, topics
 
 
-def main():
+if __name__ == '__main__':
 
     # If absinth.py is run in a test environment.
     if '-t' in sys.argv:
         data_path = config.test
@@ -619,11 +637,7 @@ def main():
     with Pool(process_count) as pool:
         parameter_list = [(topic_id, topic_name, results)
                           for topic_id, topic_name in topics.items()]
-        pool.starmap(word_sense_induction, parameter_list)
+        pool.starmap(main, parameter_list)
-
-    #for topic_id, topic_name in topics.items():
-    #    word_sense_induction(topic_id, topic_name, results)
-
-if __name__ == '__main__':
-    main()
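The entry point fans the topics out over a process pool with Pool.starmap, which unpacks each tuple in parameter_list into the positional arguments of main. A minimal self-contained sketch of the same pattern; the worker name and data below are hypothetical stand-ins, not code from absinth.py:

from multiprocessing import Pool

def process_topic(topic_id, topic_name, results):
    # Stand-in for the per-topic pipeline that main() runs in absinth.py.
    print(topic_id, topic_name, len(results[topic_id]))

if __name__ == '__main__':  # spawned workers re-import this module, so guard the driver
    topics = {'1': 'the_block', '2': 'polaroid'}         # hypothetical topics
    results = {'1': ['doc_a', 'doc_b'], '2': ['doc_c']}  # hypothetical search results

    parameter_list = [(topic_id, topic_name, results)
                      for topic_id, topic_name in topics.items()]

    # starmap unpacks each tuple into the worker's positional arguments.
    with Pool(2) as pool:
        pool.starmap(process_topic, parameter_list)

Moving the driver code under the __name__ guard, as this commit does, also matters for multiprocessing: on platforms that spawn rather than fork, every worker re-imports the module, and unguarded top-level code would run again in each child process.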