diff --git a/src/absinth.py b/src/absinth.py
index f9bbc220f05adbdc0a1c5180f2c82a0f1e2389d1..a082b431c9df5c9a06728be03c213eb9a8b951c7 100644
--- a/src/absinth.py
+++ b/src/absinth.py
@@ -415,6 +415,59 @@ def score(graph, component, root_hub_list):
 
     return score_array
 
+def induce(topic_id, topic_name, result_list):
+    """Induces word senses for one topic.
+
+    Builds a cooccurrence graph from the topic's result list, extracts
+    root hubs (senses) and returns the minimum spanning tree, the root
+    hubs and a statistics dict, or None if the topic was already processed.
+    """
+
+    statistics = dict()
+
+    #removes trailing newlines
+    old_target_string = topic_name.strip() #original target
+
+    #skips topics for which an output file already exists
+    if old_target_string in [f.replace('.absinth', '') for f in os.listdir(config.output)]:
+        return None
+
+    statistics['target'] = old_target_string
+
+    #in topics longer than two words, the leading 'the' can generally be removed without changing the sense
+    if old_target_string[:4] == 'the_' and old_target_string.count('_') >= 2:
+        target_string = old_target_string[4:]
+    else:
+        target_string = old_target_string
+
+    statistics['processed_target'] = target_string
+
+    #counts occurrences of single words as well as cooccurrences, saves them in dictionaries
+    print('[a]', 'Counting nodes and edges.\t('+old_target_string+')')
+    node_freq_dict, edge_freq_dict = frequencies(target_string, result_list[topic_id])
+
+    #builds graph from these dictionaries, also applies multiple filters
+    print('[a]', 'Building graph.\t('+old_target_string+')')
+    G = build_graph(node_freq_dict, edge_freq_dict)
+
+    statistics['node count'] = len(G.nodes)
+    statistics['edge count'] = len(G.edges)
+
+    #finds root hubs (senses) within the graph + more filters for these
+    print('[a]', 'Collecting root hubs.\t('+old_target_string+')')
+    H = root_hubs(G, edge_freq_dict)
+
+    #adds the sense inventory to the statistics, with the six most frequent neighbors of each hub for context
+    statistics['hubs'] = dict()
+    for h in H:
+        mfn = sorted(G.adj[h], key=lambda x: edge_freq_dict[h,x] if h < x else edge_freq_dict[x, h], reverse=True)[:6]
+        statistics['hubs'][h] = mfn
+
+    #performs minimum spanning tree algorithm on graph
+    print('[a]', 'Building minimum spanning tree.\t('+old_target_string+')')
+    T = components(G, H, target_string)
+
+    return T, H, statistics
+
+
 def disambiguate(minimum_spanning_tree, root_hub_list,
                  context_list, target_string):
     """Matches contexts to senses.
@@ -493,82 +546,46 @@ def disambiguate(minimum_spanning_tree, root_hub_list,
 
     return mapping_dict
 
-# our main function, here the main stepps for word sense induction are called
-def word_sense_induction(topic_id, topic_name, result_list):
-
-    #buffer for useful information
-    out_buffer = '\n'
-
-    #path for output(directory)
-    output_path = config.output
-
-    #removes trailing new_lines
-    old_target_string = topic_name.strip() #original target
-
-    if old_target_string.strip() in [f.replace('.absinth', '') for f in os.listdir(config.output)]:
-        return None
-
-    out_buffer += ("[A] Word sense induction for '"+old_target_string+"':\n")
-
-    #in topics longer than two words, the leading 'the' can generally be removed without changing the sense
-    if old_target_string[:4] == 'the_' and old_target_string.count('_') >= 2:
-
-        target_string = old_target_string[4:]
-
-    else:
-
-        target_string = old_target_string
-
-    #writes headline for output files
-    f = open(output_path+old_target_string+'.absinth', 'w')
-    f.write('subTopicID\tresultID\n')
-
-    #counts occurences of single words, as well as cooccurrences, saves it in dictionary
-    print('[a]', 'Counting nodes and edges.\t('+old_target_string+')')
-    node_freq_dict, edge_freq_dict = frequencies(target_string, result_list[topic_id])
-
-    #builds graph from these dictionaries, also applies multiple filters
-    print('[a]', 'Building graph.\t('+old_target_string+')')
-    G = build_graph(node_freq_dict, edge_freq_dict)
-    out_buffer += '[A] Nodes: {}\tEdges: {}\n'.format(str(len(G.nodes)), str(len(G.edges)))
-
-    #finds root hubs (senses) within the graph + more filters for these
-    print('[a]', 'Collecting root hubs.\t('+old_target_string+')')
-    H = root_hubs(G, edge_freq_dict)
-    out_buffer += '[A] Root hubs:\n'
-
-    #adds sense inventory to buffer with some common neighbors for context
-    i = 1 #sense index
-    for h in H:
-
-        mfn = sorted(G.adj[h], key=lambda x: edge_freq_dict[h,x] if h < x else edge_freq_dict[x, h], reverse=True)[:6]
-        out_buffer += ('  {}. {}: {}\n'.format(i, h, ', '.join(mfn)))
-        i += 1
-
-    #performs minimum_spanning_tree algorithm on graph
-    print('[a]', 'Building minimum spanning tree.\t('+old_target_string+')')
-    T = components(G, H, target_string)
-
-    #matches senses to clusters
-    print('[a]', 'Disambiguating result_list.\t('+old_target_string+')')
-    D = disambiguate(T, H, result_list[topic_id], target_string)
-    out_buffer += ('[A] Mapping: \n')
-    for cluster,result_list in D.items():
-        out_buffer += ('  {}. : {}\n'.format(cluster, ', '.join([str(r) for r in result_list])))
-
-    #prints buffer
-    print('[a]', 'Writing to file.\t('+old_target_string+')')
-    print(out_buffer)
-    #writes clustering to file
-    for cluster,result_list in D.items():
-        for result in result_list:
-            f.write(topic_id+'.'+str(cluster)+'\t'+topic_id+'.'+str(result)+'\n')
-
-    f.close()
+def main(topic_id, topic_name, result_list):
+    """Runs the pipeline for one topic: sense induction, disambiguation
+    of the result list and writing of the clustering to a file.
+    """
+
+    print('[a]', 'Inducing word senses for {}.'.format(topic_name))
+    induction_result = induce(topic_id, topic_name, result_list)
+
+    #skips topics for which an output file already exists
+    if induction_result is None:
+        return None
+
+    T, H, statistics = induction_result
+    old_target_string = statistics['target']
+    target_string = statistics['processed_target']
+
+    #matches senses to clusters
+    print('[a]', 'Disambiguating result_list.\t('+old_target_string+')')
+    D = disambiguate(T, H, result_list[topic_id], target_string)
+
+    #collects statistics from the clustering
+    cluster_count = 0
+    cluster_length_list = list()
+    for cluster_result_list in D.values():
+        cluster_length = len(cluster_result_list)
+        if cluster_length != 0:
+            cluster_count += 1
+            cluster_length_list.append(cluster_length)
+
+    statistics['mean_cluster_length'] = np.mean(cluster_length_list) #assumes numpy is imported as np
+    statistics['cluster_count'] = cluster_count
+
+    print('[a]', 'Writing to file.\t('+old_target_string+')')
+
+    #writes headline and clustering to the output file
+    f = open(config.output+old_target_string+'.absinth', 'w')
+    f.write('subTopicID\tresultID\n')
+
+    for cluster,cluster_result_list in D.items():
+        for result in cluster_result_list:
+            f.write(topic_id+'.'+str(cluster)+'\t'+topic_id+'.'+str(result)+'\n')
+
+    f.close()
 
 
 def read_dataset(data_path):
 
     # results.txt includes the queries for a given target word
@@ -600,8 +617,9 @@ def read_dataset(data_path):
 
     return results, topics
 
-def main():
-
+
+
+if __name__ == '__main__':
     # If absinth.py is run in test environment.
     if '-t' in sys.argv:
         data_path = config.test
@@ -619,11 +637,7 @@ def main():
 
     with Pool(process_count) as pool:
         parameter_list = [(topic_id, topic_name, results) for topic_id,topic_name in topics.items()]
-        pool.starmap(word_sense_induction, parameter_list)
+        pool.starmap(main, parameter_list)
 
-    #for topic_id,topic_name in topics.items():
-        #word_sense_induction(topic_id,topic_name, results)
-
-
-if __name__ == '__main__':
-    main()
+    #for topic_id,topic_name in topics.items():
+        #main(topic_id, topic_name, results)
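
Note on the induction step: the snippet below is a minimal, self-contained sketch of the hub-and-spanning-tree idea that induce() walks through, not absinth's implementation. It assumes networkx imported as nx, uses an invented toy graph in place of build_graph(), picks hubs by plain degree where root_hubs() applies further filters, and uses 1/frequency as an assumed distance transform (the project's actual edge weighting lives inside build_graph and is not shown in this diff):

    import networkx as nx

    # Toy cooccurrence graph; in absinth the counts come from edge_freq_dict.
    G = nx.Graph()
    for u, v, freq in [('bank', 'money', 12), ('bank', 'account', 9),
                       ('money', 'account', 7), ('bank', 'river', 5),
                       ('river', 'shore', 4)]:
        G.add_edge(u, v, weight=1.0 / freq)  # frequent pairs become short distances

    # Crude stand-in for root_hubs(): the two highest-degree nodes.
    hubs = sorted(G.nodes, key=G.degree, reverse=True)[:2]

    # Frequent edges survive into the minimum spanning tree.
    T = nx.minimum_spanning_tree(G)
    print(hubs, sorted(T.edges))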
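Note on the edge_freq_dict lookups: the conditional key flip in induce(), edge_freq_dict[h,x] if h < x else edge_freq_dict[x, h], exists because each undirected pair is stored only once, under its sorted key. A self-contained sketch with invented counts and a hypothetical edge_freq() helper:

    edge_freq_dict = {('account', 'bank'): 9, ('bank', 'money'): 12, ('bank', 'river'): 5}

    def edge_freq(a, b):
        # Undirected pairs are keyed in sorted order, so flip when needed.
        return edge_freq_dict[(a, b)] if a < b else edge_freq_dict[(b, a)]

    # Six most frequent neighbors of a hub, as stored in statistics['hubs'].
    mfn = sorted(['money', 'account', 'river'],
                 key=lambda x: edge_freq('bank', x), reverse=True)[:6]
    print(mfn)  # ['money', 'account', 'river']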
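Note on the __main__ guard: the diff moves the driver code under if __name__ == '__main__' because multiprocessing re-imports the module in each worker; on spawn-based platforms an unguarded Pool would be re-created by every worker. A minimal sketch of the starmap pattern used above (worker and topics are invented stand-ins for main and the dataset):

    from multiprocessing import Pool

    def worker(topic_id, topic_name):
        return topic_id + ':' + topic_name.strip()

    if __name__ == '__main__':
        topics = {'1': 'the_block\n', '2': 'bank\n'}
        with Pool(2) as pool:
            # starmap unpacks each tuple into worker's arguments.
            print(pool.starmap(worker, list(topics.items())))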