Commit a9b0962e authored by Victor Zimmermann

Further commenting.

parent a2e527eb

#!/usr/bin/env python3

import sys

import matplotlib
matplotlib.use("Agg")

print('[A] Loading ' + sys.argv[0] + '.\n')

import config
import networkx as nx # for visualisation
import numpy as np
import os # for reading files
import pprint
import random
import re
import spacy # for nlp
from multiprocessing import Pool
from nltk.corpus import stopwords
from copy import deepcopy

nlp = spacy.load('en') # standard english nlp
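
# Note: spacy.load('en') assumes the English model (or the 'en' shortcut link)
# is installed, e.g. via `python -m spacy download en` in older spaCy releases.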

def read_dataset(data_path):
    """Collects topics.txt and results.txt.

    Args:
        data_path: Path of the directory containing results.txt and topics.txt.

    Returns:
        results: Dictionary with topic ids as keys and lists of search result
            strings (title + snippet) as values.
        topics: Dictionary with topic ids as keys and target words as values.
    """

    results = dict()
    with open(data_path+'results.txt', 'r') as results_file:

        for line in results_file.readlines()[1:]:

            l = line.split('\t')
            id1, _ = l[0].split('.') # the second part of the id is ignored, as it is identical to the list index
            if id1 not in results:
                results[id1] = list()
            results[id1].append(" ".join(l[2:]).strip()) # join title and snippet; the URL is ignored

    # topics.txt is a list of target words
    topics = dict()
    with open(data_path+'topics.txt', 'r') as topics_file:

        for line in topics_file.readlines()[1:]:

            l = line.split('\t')
            topics[l[0]] = l[1].strip()

    return results, topics
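
# Illustrative input layout, inferred from the parsing in read_dataset()
# (the column names are assumptions):
#
#   results.txt:  <topicID>.<resultID> \t <url> \t <title> \t <snippet>
#   topics.txt:   <topicID> \t <target word>
#
# Both files start with a header line, which readlines()[1:] skips. The
# returned dictionaries then look roughly like:
#
#   results = {'1': ['Some title Some snippet', ...], ...}
#   topics  = {'1': 'target_word', ...}
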
def frequencies(target_string, search_result_list):
    """Counts occurrences of nodes and cooccurrences.
@@ -168,7 +201,10 @@ def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
            # Add only tokens with allowed tags to nodes.
            elif token.tag_ in allowed_tag_list:

                if config.lemma:
                    token_set.add(token.lemma_)
                else:
                    token_set.add(token.text)

        context_size = len(token_set)
@@ -416,78 +452,96 @@ def score(graph, component, root_hub_list):
def induce(topic_name, result_list):
    """Induces word senses for a given topic from corpus.

    Counts frequencies from corpus and search result list, builds graph from
    these counts (with some filters). Root hubs (senses) are collected from
    this graph.

    Args:
        topic_name: Target string.
        result_list: List of search result (context) strings.

    Returns:
        graph: Weighted undirected graph built from the frequency counts.
        root_hub_list: List of root hub strings (senses).
        stat_dict: Various statistics.
    """

    stat_dict = dict()

    # Skips topics for which an output file already exists.
    if topic_name in [output_file_name.replace('.absinth', '')
                      for output_file_name in os.listdir(config.output)]:
        return None

    else:
        stat_dict['target'] = topic_name

    # In topics longer than two words, the leading 'the' can generally be
    # removed without changing the sense.
    if topic_name[:4] == 'the_' and topic_name.count('_') > 1:
        target_string = topic_name[4:]
    else:
        target_string = topic_name

    # Counts occurrences of single words as well as cooccurrences.
    print('[a]', 'Counting nodes and edges.\t('+topic_name+')')
    node_freq_dict, edge_freq_dict = frequencies(target_string, result_list)

    # Builds graph from these dictionaries, also applies multiple filters.
    print('[a]', 'Building graph.\t('+topic_name+')')
    graph = build_graph(node_freq_dict, edge_freq_dict)

    stat_dict['node count'] = len(graph.nodes)
    stat_dict['edge count'] = len(graph.edges)

    # Finds root hubs (senses) within the graph + more filters for these.
    print('[a]', 'Collecting root hubs.\t('+topic_name+')')
    root_hub_list = root_hubs(graph, edge_freq_dict)

    # Adds sense inventory to statistics with some common neighbors for context.
    stat_dict['hubs'] = dict()

    for root_hub in root_hub_list:

        # Edge frequencies are keyed by the lexicographically smaller node first.
        by_frequency = lambda node: edge_freq_dict[root_hub, node] \
                                    if root_hub < node \
                                    else edge_freq_dict[node, root_hub]

        most_frequent_neighbor_list = sorted(graph.adj[root_hub],
                                             key=by_frequency, reverse=True)

        stat_dict['hubs'][root_hub] = most_frequent_neighbor_list[:6]

    return graph, root_hub_list, stat_dict
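
# Illustrative shape of stat_dict as returned by induce() (all values made up):
#
#   {'target': 'example_target',
#    'node count': 150,
#    'edge count': 632,
#    'hubs': {'sense_one': ['neighbor_a', 'neighbor_b', ...],
#             'sense_two': ['neighbor_c', ...]}}
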
def disambiguate(graph, root_hub_list, context_list, topic_name):
    """Matches contexts to senses.

    Builds minimum spanning tree from graph.
    Adds up scores based on tree node distance for each token in a context
    string and matches the context to the root hub with the highest score.

    Args:
        graph: Weighted undirected graph.
        root_hub_list: List of strings of root hubs (senses).
        context_list: List of sentence strings that are to be clustered.
        topic_name: String of target word, also root of MST.

    Returns:
        mapping_dict: Dictionary of root hubs (senses) as keys and context ids
            as values.
    """

    # Performs minimum spanning tree algorithm on the graph.
    print('[a]', 'Building minimum spanning tree.\t('+topic_name+')')
    minimum_spanning_tree = components(graph, root_hub_list, topic_name)

    # Removes the target word itself from the contexts before scoring.
    spaced_topic_name = topic_name.replace('_', ' ')
    context_list = [context.lower().strip().replace(spaced_topic_name, '')
                    for context in context_list]

    score_dict = dict() # memoisation for scores
@@ -505,23 +559,27 @@ def disambiguate(minimum_spanning_tree, root_hub_list,
        idx += 1 # index based on position in list

        processed_context = nlp(context)

        if config.lemma:
            token_list = [token.lemma_ for token in processed_context] # tokens
        else:
            token_list = [token.text for token in processed_context] # tokens

        score_array = np.zeros(len(root_hub_list)) # initialise with zeros for every sense

        for token in token_list:

            if token in minimum_spanning_tree.nodes: # if word wasn't filtered out

                if token in score_dict: # memoisation
                    new_score = score_dict[token]
                else:
                    new_score = score(minimum_spanning_tree,
                                      token, root_hub_list)
                    score_dict[token] = new_score # memoisation

                score_array += new_score
@@ -546,77 +604,66 @@ def disambiguate(minimum_spanning_tree, root_hub_list,

    return mapping_dict

def main(topic_id, topic_name, result_dict):
    """Calls induction and disambiguation functions, performs main task.

    The task is to both induce senses and match search results to them. This
    function calls induce() and disambiguate() to perform these subtasks. The
    result is then written to the output directory specified in config.py.

    Args:
        topic_id: Index of topic in topics.txt.
        topic_name: Target string.
        result_dict: Dictionary with topic_id as key and lists of search
            results (from results.txt) as values.

    Returns:
        None
    """

    print('[a]', 'Inducing word senses for {}.'.format(topic_name))

    # induce() returns None if the topic has already been processed.
    induction_result = induce(topic_name, result_dict[topic_id])
    if induction_result is None:
        return None
    graph, root_hub_list, stat_dict = induction_result

    # Matches contexts (search results) to the induced senses.
    print('[a]', 'Disambiguating result_list.\t('+topic_name+')')
    mapping_dict = disambiguate(graph, root_hub_list,
                                result_dict[topic_id], topic_name)

    # Collects statistics from the result.
    cluster_count = 0
    cluster_length_list = list()

    for cluster, result_list in mapping_dict.items():

        cluster_length = len(result_list)
        if cluster_length != 0:
            cluster_count += 1
            cluster_length_list.append(cluster_length)

    stat_dict['mean_cluster_length'] = np.mean(cluster_length_list)
    stat_dict['cluster_count'] = cluster_count

    print('[a]', 'Writing to file.\t('+topic_name+')')

    output_path = config.output
    output_file_name = output_path + topic_name + '.absinth'

    with open(output_file_name, 'w') as output_file:

        output_file.write('subTopicID\tresultID\n')

        for cluster_id, result_list in mapping_dict.items():
            for result_id in result_list:
                output_line = '{}.{}\t{}.{}\n'.format(topic_id, cluster_id,
                                                      topic_id, result_id)
                output_file.write(output_line)

    pprint.pprint(stat_dict)
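
# main() writes one tab-separated <topic_name>.absinth file per topic.
# Illustrative content (the ids are made up):
#
#   subTopicID\tresultID
#   1.1\t1.3
#   1.1\t1.7
#   1.2\t1.12
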
if __name__ == '__main__':

@@ -626,7 +673,7 @@ if __name__ == '__main__':
    else:
        data_path = config.dataset

    result_dict, topic_dict = read_dataset(data_path)

    # Enables manual setting of process count.
    if '-p' in sys.argv:
@@ -635,8 +682,8 @@ if __name__ == '__main__':
        process_count = 1

    with Pool(process_count) as pool:

        parameter_list = [(topic_id, topic_name, result_dict)
                          for topic_id, topic_name in topic_dict.items()]
        pool.starmap(main, parameter_list)

    #for topic_id,topic_name in topics.items():
    ...
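
# Illustrative invocation (the script file name is an assumption):
#
#   python3 absinth.py -p 4
#
# config.py is expected to provide at least `dataset` (directory containing
# topics.txt and results.txt), `output` (directory for the .absinth files)
# and `lemma` (whether to work on lemmas instead of surface forms).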