Commit 565b5233 authored by Victor Zimmermann

Reimplement components() + more comment reform.

parent 657eb8e5
@@ -44,7 +44,7 @@ def frequencies(target_string, search_result_list):
bracketed_target_string = '('+target_string+')'
# Remove unnecessary tokens from snippets
# Remove unnecessary tokens from snippets.
_search_result_list = list()
for r in search_result_list:
r = r.replace('<b>', '')
@@ -53,13 +53,12 @@ def frequencies(target_string, search_result_list):
r = r.strip()
_search_result_list.append(r)
#initialises frequencies with counts from results
# Initialise frequencies with counts from results.
node_freq_dict, edge_freq_dict = process_file(_search_result_list,
target_string,
dict(),
dict())
#names of corpus files
corpus_file_path_list = [corpus_path + f for f in os.listdir(corpus_path)]
corpus_size = len(corpus_file_path_list)
@@ -69,7 +68,7 @@ def frequencies(target_string, search_result_list):
node_count = len(node_freq_dict)
edge_count = len(edge_freq_dict)
#prints update after every 11th of the corpus is parsed
# Print a progress update after every eleventh of the corpus has been parsed.
if processed_file_count % int(corpus_size/11) == 0:
file_ratio = processed_file_count / corpus_size
@@ -78,7 +77,7 @@ def frequencies(target_string, search_result_list):
ratios = [file_ratio, max_node_ratio, max_edge_ratio]
#uses the ratio closest to 100%.
# Use ratio closest to 100%.
highest_ratio = int((max(ratios))*100)
print('[a] ~{:02d}%\tNodes: {}\tEdges: {}\t{}.'.format(highest_ratio,
@@ -86,7 +85,6 @@ def frequencies(target_string, search_result_list):
edge_count,
bracketed_target_string))
#checks maximum node values
if node_count > max_node_count:
print('[a] 100%\tNodes: {}\tEdges: {}\t{}.'.format(node_count,
edge_count,
@@ -148,11 +146,11 @@ def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
for context in context_list:
context = context.lower()
if spaced_target_string in context: #greedy pre selection, not perfect
if spaced_target_string in context: # Greedy pre-selection; not exact.
token_set = set() #set of node candidates
token_set = set()
#This replacement allows target to be treated as single entity.
# Allow the target to be treated as a single entity.
context = context.replace(spaced_target_string, target_string)
processed_context = nlp(context)
@@ -160,15 +158,15 @@ def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
for token in processed_context:
#doesn't add target word to nodes
# Do not add target word to nodes.
if token.text == target_string:
pass
#doesn't add stop words to nodes
# Do not add stop words to nodes.
elif token.text in stopword_list:
pass
#only adds tokens with allowed tags to nodes
# Add only tokens with allowed tags to nodes.
elif token.tag_ in allowed_tag_list:
token_set.add(token.text)
@@ -190,14 +188,14 @@ def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
else:
edge_freq_dict[edge] = 1
#if a file is corrupted (can't always be catched with if-else)
# If the file is corrupted (cannot always be caught with if-else), ignore it.
except UnicodeDecodeError:
pass
return node_freq_dict, edge_freq_dict
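For orientation, the counting in process_file reduces to one count per token per context for nodes and one count per sorted token pair per context for edges. A minimal standalone sketch with already tokenised contexts instead of spaCy output (the helper name and the toy contexts are illustrative, not from the repository):

from itertools import combinations

def count_frequencies(tokenised_context_list):
    """Sketch: count node and sorted-pair frequencies, one hit per context."""
    node_freq_dict, edge_freq_dict = dict(), dict()
    for token_list in tokenised_context_list:
        token_set = set(token_list)  # Each token counts once per context.
        for token in token_set:
            node_freq_dict[token] = node_freq_dict.get(token, 0) + 1
        for edge in combinations(sorted(token_set), 2):
            edge_freq_dict[edge] = edge_freq_dict.get(edge, 0) + 1
    return node_freq_dict, edge_freq_dict

# Two toy contexts for the target 'java'.
node_freq_dict, edge_freq_dict = count_frequencies(
    [['coffee', 'island', 'indonesia'], ['coffee', 'programming']])
# node_freq_dict['coffee'] == 2, edge_freq_dict[('coffee', 'island')] == 1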
#build graph from frequency dictionaries
def build_graph(node_freq_dict, edge_freq_dict):
"""Builds undirected weighted graph from dictionaries.
@@ -221,13 +219,11 @@ def build_graph(node_freq_dict, edge_freq_dict):
cooccurence_graph = nx.Graph()
#node : node frequency
for node, frequency in node_freq_dict.items():
if frequency >= min_node_freq:
cooccurence_graph.add_node(node)
#edge : edge frequency
for node_tuple, frequency in edge_freq_dict.items():
if frequency < min_edge_freq:
@@ -265,166 +261,246 @@ def build_graph(node_freq_dict, edge_freq_dict):
return cooccurence_graph
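The part of build_graph between the frequency filters and the return is not shown in this hunk. For orientation only, a hypothetical standalone version that combines the filtering above with the edge weighting defined in Véronis (2004), w(a,b) = 1 - max(p(a|b), p(b|a)); the default thresholds are invented and the weighting actually used in the elided code may differ:

import networkx as nx

def sketch_build_graph(node_freq_dict, edge_freq_dict,
                       min_node_freq=10, min_edge_freq=5):
    """Sketch: frequency filtering plus the Véronis (2004) edge weighting."""
    graph = nx.Graph()
    for node, frequency in node_freq_dict.items():
        if frequency >= min_node_freq:
            graph.add_node(node)
    for (node_a, node_b), frequency in edge_freq_dict.items():
        if frequency < min_edge_freq:
            continue
        if node_a not in graph or node_b not in graph:
            continue
        # Low weight = strong cooccurrence; w(a,b) = 1 - max(p(a|b), p(b|a)).
        weight = 1 - max(frequency / node_freq_dict[node_a],
                         frequency / node_freq_dict[node_b])
        graph.add_edge(node_a, node_b, weight=weight)
    return graph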
#Identifies senses by choosing nodes with high degrees
def root_hubs(graph, edge_freq_dict, min_neighbors=4, theshold=0.8):
def root_hubs(graph, edge_freq_dict):
"""Identifies senses (root hubs) by choosing nodes with high degrees
Selects root hubs according to the algorithm in Véronis (2004). Nodes with
high degree and neighbors with low weights (high cooccurence) are chosen
until there are no more viable candidates. A root hub candidate is every
node that is not already a hub and is not a neighbor of one.
Args:
graph: Weighted undirected graph.
edge_freq_dict: Dictionary of weights for every tuple in our graph.
Returns:
hub_list: List of root hubs, i.e. strings that are selected using the
algorithm explained above.
"""
min_neighbors = config.min_neighbors
threshold = config.threshold
G = deepcopy(graph)
V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True) # sorts according to degree
H = list() #output list
# Allow operations on graph without altering original one.
graph_copy = deepcopy(graph)
# Sort according to degree (number of neighbors).
candidate_list = sorted(graph_copy.nodes,
key=lambda node: graph_copy.degree[node],
reverse=True)
hub_list = list()
while V:
# While there are still candidates, search for root hubs.
while candidate_list:
v = V[0] #best hub candidate
candidate = candidate_list[0] #best hub candidate
if G.degree[v] >= min_neighbors:
if graph_copy.degree[candidate] >= min_neighbors:
mfn = sorted(G.adj[v], key=lambda key: edge_freq_dict[v,key] if v < key else edge_freq_dict[key, v], reverse=True)[:min_neighbors] #most frequent neighbors
by_frequency = lambda node: edge_freq_dict[candidate,node] \
if candidate < node \
else edge_freq_dict[node,candidate]
most_frequent_neighbor_list = sorted(graph_copy.adj[candidate],
key=by_frequency,
reverse=True)[:min_neighbors]
if np.mean([G.edges[v,n]['weight'] for n in mfn]) < theshold: #if the median weight of the most frequent neighbors is under threshold
# If the most frequent neighbors cooccur frequently enough with the
# candidate (mean edge weight below threshold), the candidate is approved.
if np.mean([graph_copy.edges[candidate,node]['weight']
for node in most_frequent_neighbor_list]) < threshold:
H.append(v)
# Add candidate as root hub.
hub_list.append(candidate)
#removes neighbors of new hub as hub candidates
for nbr in deepcopy(G).adj[v]:
# Remove neighbors of new hub as hub candidates.
for neighbor in deepcopy(graph_copy).adj[candidate]:
graph_copy.remove_node(neighbor)
G.remove_node(nbr)
#removes hub candidate
G.remove_node(v)
# Remove hub candidate.
graph_copy.remove_node(candidate)
#reorderd potential hubs after deletions
V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True)
# Reorder potential hubs after deletions.
candidate_list = sorted(graph_copy.nodes,
key=lambda node: graph_copy.degree[node],
reverse=True)
else:
return H
return hub_list
return H
return hub_list
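A small usage sketch of the acceptance test above on a toy graph; the node names, edge frequencies, and the stand-ins for config.min_neighbors and config.threshold are invented for illustration:

import networkx as nx
import numpy as np

min_neighbors, threshold = 2, 0.8  # Stand-ins for the config values.
edge_freq_dict = {('bean', 'coffee'): 8, ('coffee', 'cup'): 6,
                  ('coffee', 'island'): 1}
toy_graph = nx.Graph()
toy_graph.add_edge('coffee', 'bean', weight=0.1)
toy_graph.add_edge('coffee', 'cup', weight=0.2)
toy_graph.add_edge('coffee', 'island', weight=0.9)

candidate = max(toy_graph.nodes, key=lambda node: toy_graph.degree[node])
by_frequency = lambda node: edge_freq_dict[tuple(sorted((candidate, node)))]
most_frequent_neighbor_list = sorted(toy_graph.adj[candidate],
                                     key=by_frequency,
                                     reverse=True)[:min_neighbors]
accepted = (toy_graph.degree[candidate] >= min_neighbors and
            np.mean([toy_graph.edges[candidate, node]['weight']
                     for node in most_frequent_neighbor_list]) < threshold)
# accepted -> True: 'coffee' becomes a root hub; the loop above would now
# remove its neighbors from the candidate list and continue.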
#Components algorithm from Véronis (2004), converts graph for target into a MST
def components(graph, hubs, target_string):
def components(graph, root_hub_list, target_string):
"""Builds minimum spanning tree from graph and removes singletons.
G = deepcopy(graph)
H = hubs #root hubs
t = target_string
Applies components algorithm from Véronis (2004) and removes singletons.
#G.add_node(t)
#for h in H:
#G.add_edge(t,h,weight=0)
Args:
graph: Undirected weighted graph.
root_hub_list: List of strings of root hubs of graph.
target_string: Root of minimum spanning tree.
T = nx.minimum_spanning_tree(G)
Returns:
minimum_spanning_tree: Minimum spanning tree with target as
root and root hubs as direct children. Singletons removed.
"""
graph_copy = deepcopy(graph)
graph_copy.add_node(target_string)
for root_hub in root_hub_list:
graph_copy.add_edge(target_string,root_hub,weight=0)
minimum_spanning_tree = nx.minimum_spanning_tree(graph_copy)
#removes singletons
for node in deepcopy(T).nodes:
if len(T.adj[node]) == 0:
T.remove_node(node)
# Remove singletons; deepcopy allows iteration while the tree is altered.
for node in deepcopy(minimum_spanning_tree).nodes:
if len(minimum_spanning_tree.adj[node]) == 0:
minimum_spanning_tree.remove_node(node)
return T
return minimum_spanning_tree
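A minimal sketch of what components does to a toy graph (node names invented): the target is attached to its root hubs with zero-weight edges, a minimum spanning tree is built, and isolated nodes are dropped:

import networkx as nx
from copy import deepcopy

toy_graph = nx.Graph()
toy_graph.add_edge('island', 'indonesia', weight=0.2)
toy_graph.add_edge('programming', 'compiler', weight=0.3)
toy_graph.add_node('stray')  # Will end up as a singleton.

# Attach the target to its root hubs with zero-weight edges.
toy_graph.add_node('java')
for root_hub in ['island', 'programming']:
    toy_graph.add_edge('java', root_hub, weight=0)

toy_tree = nx.minimum_spanning_tree(toy_graph)
for node in deepcopy(toy_tree).nodes:  # Remove singletons.
    if len(toy_tree.adj[node]) == 0:
        toy_tree.remove_node(node)
# sorted(toy_tree.nodes) ->
# ['compiler', 'indonesia', 'island', 'java', 'programming']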
#Calculates score for a given path in a minimum spanning tree
def score(graph, from_node, to_node):
def score(graph, component, root_hub_list):
"""Calculate score for a given component in a minimum spanning tree.
#if correct tree
if nx.has_path(graph, from_node, to_node):
# calculates shortest path (approximation for path with lowest total weight)
path = nx.shortest_path(graph, from_node, to_node, 'weight')
total_weight = 0
First the root hub closest to the component is chosen. If no root hub is
reachable, an all-zero array is returned. Otherwise a score based on the
weighted distance between the component and its root hub is written to the
corresponding position of an otherwise zero-filled array.
#adds weights of every sub-path
for i in range(1, len(path)):
sub_from, sub_to = path[i-1], path[i]
total_weight += graph[sub_from][sub_to]['weight']
Args:
graph: Minimum spanning tree.
component: Node (string) from which the distances are to be calculated.
root_hub_list: List of strings of root hubs (senses) of original graph.
#the further the path, the lower the score
return 1/(1+total_weight)
else:
return 0
Returns:
score_array: Array with one score at the index of the chosen root hub
and zeroes everywhere else.
"""
root_hub_count = len(root_hub_list)
# Initialise score array.
score_array = np.zeros(root_hub_count)
# Find root of component.
distance_list = list()
for root_hub in root_hub_list:
if nx.has_path(graph, component, root_hub):
distance_list.append(1/(1+len(nx.shortest_path(graph, component, root_hub))))
else:
distance_list.append(0)
if sum(distance_list) == 0:
return score_array
root_idx = np.argmax(distance_list)
root = root_hub_list[root_idx]
shortest_path = nx.shortest_path(graph, component, root, 'weight')
total_weight = 0
# Add weights of every sub-path.
for i in range(1, len(shortest_path)):
sub_from, sub_to = shortest_path[i-1], shortest_path[i]
total_weight += graph[sub_from][sub_to]['weight']
score_array[root_idx] = 1/(1+total_weight)
return score_array
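A worked example of the scoring on the same kind of toy tree (invented nodes): the closest root hub is chosen, equivalent to the 1/(1+len(path)) comparison above, and its entry in the score array becomes 1/(1+total_weight) along the weighted shortest path:

import networkx as nx
import numpy as np

toy_tree = nx.Graph()
toy_tree.add_edge('java', 'island', weight=0)
toy_tree.add_edge('java', 'programming', weight=0)
toy_tree.add_edge('island', 'indonesia', weight=0.2)
toy_tree.add_edge('programming', 'compiler', weight=0.3)

root_hub_list = ['island', 'programming']
component = 'indonesia'

# Closest root hub by path length (both hubs are reachable in this toy tree;
# the function above returns an all-zero array when none is reachable).
root = min(root_hub_list,
           key=lambda hub: len(nx.shortest_path(toy_tree, component, hub)))
shortest_path = nx.shortest_path(toy_tree, component, root, 'weight')
total_weight = sum(toy_tree[a][b]['weight']
                   for a, b in zip(shortest_path, shortest_path[1:]))
score_array = np.zeros(len(root_hub_list))
score_array[root_hub_list.index(root)] = 1/(1+total_weight)
# score_array -> array([0.8333..., 0.]): 'indonesia' leans towards 'island'.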
# Basically Word Sense Disambiguation, matches context to sense
def disambiguate(mst, hubs, contexts, target_string):
def disambiguate(minimum_spanning_tree, root_hub_list,
context_list, target_string):
"""Matches contexts to senses.
Adds up scores for each token in a context string and matches the context
to the root hub with the highest score.
Args:
minimum_spanning_tree: Minimum spanning tree with target as root.
root_hub_list: List of strings of root hubs (senses).
context_list: List of sentence strings that are to be clustered.
target_string: String of target word, also root of MST.
Returns:
mapping_dict: Dictionary of root hubs (senses) as keys and context ids
as values.
"""
target_string = target_string.replace('_', ' ')
T = mst #minimum spanning tree
H = hubs #root hubs
C = [c.lower().strip().replace(target_string, '') for c in contexts] #cleaned up contexts
context_list = [context.lower().strip().replace(target_string, '')
for context in context_list]
score_dict = dict() #memoisation for scores
mapping_dict = {topic:[] for topic in range(1,len(H)+1)} #output of function
mapping_dict = {topic:[] for topic in range(1,len(root_hub_list)+1)}
#if no sense is found for a target word, we should assume that there only is one sense
if len(H) == 0:
if len(root_hub_list) == 0:
return {0:[i for i in range(1, len(C)+1)]}
return {0:[i for i in range(1, len(context_list)+1)]}
idx = 0
for c in C:
for context in context_list:
idx += 1 #index based on position in list
doc = nlp(c) #parsed context
texts = [tok.text for tok in doc] #tokens
processed_context = nlp(context)
text_list = [token.text for token in processed_context] #tokens
scores = np.zeros(len(H)) #initialise with zeros for every sense
score_array = np.zeros(len(root_hub_list)) #initialise with zeros for every sense
for text in texts:
for text in text_list:
if text in T.nodes: #if word wasn't filtered out
new_scores = list() #scores to be added to total scores
if text in minimum_spanning_tree.nodes: #if word wasn't filtered out
for h in H: #for each hub
if (text, h) in score_dict: #memoisation
if text in score_dict: #memoisation
new_scores.append(score_dict[(text,h)])
new_score = score_dict[text]
else:
else:
new_score = score(T, text, h)
new_scores.append(new_score)
score_dict[(text,h)] = new_score #memoisation
new_score = score(minimum_spanning_tree,
text, root_hub_list)
score_dict[text] = new_score #memoisation
scores = scores + np.array(new_scores)
score_array += new_score
else:
pass
#if the disambiguator could not detect a sense, it should return a singleton, ie. nothing
if np.max(scores) == 0:
# If the disambiguator does not detect a sense, leave the context unassigned (a singleton).
if np.max(score_array) == 0:
pass
else:
#applies sense with the highest score to context
max_score = np.max(scores)
argmax_score = np.argmax(scores)
# Apply the sense with the highest score to the context.
max_score = np.max(score_array)
argmax_score = np.argmax(score_array)
#clusters begin at 1
# Clusters begin at 1
mapping_dict[argmax_score + 1].append(idx)
return mapping_dict
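A condensed sketch of the matching step above: the per-token score arrays are summed and the context is assigned to the root hub with the highest total, clusters being numbered from 1 (scores and token names are invented):

import numpy as np

root_hub_list = ['island', 'programming']
# Hypothetical per-token score arrays, e.g. returned by score() and memoised.
score_dict = {'bean': np.array([0.8, 0.0]),
              'roast': np.array([0.6, 0.1]),
              'compiler': np.array([0.0, 0.7])}
context_token_list = ['bean', 'roast']

score_array = np.zeros(len(root_hub_list))
for token in context_token_list:
    score_array += score_dict.get(token, np.zeros(len(root_hub_list)))

if np.max(score_array) == 0:
    cluster = None  # Context stays unassigned (a singleton).
else:
    cluster = int(np.argmax(score_array)) + 1  # Clusters begin at 1.
# cluster -> 1: the context is mapped to the 'island' sense.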
# Our main function; the main steps for word sense induction are called here.
def word_sense_induction(topic_id, topic_name, results):
def word_sense_induction(topic_id, topic_name, result_list):
#buffer for useful information
out_buffer = '\n'
#path for output(directory)
output_path = './test/'#config.output
output_path = config.output
#removes trailing new_lines
old_target_string = topic_name.strip() #original target
@@ -449,7 +525,7 @@ def word_sense_induction(topic_id, topic_name, results):
#counts occurrences of single words as well as cooccurrences, saves them in dictionaries
print('[a]', 'Counting nodes and edges.\t('+old_target_string+')')
node_freq_dict, edge_freq_dict = frequencies(target_string, results[topic_id])
node_freq_dict, edge_freq_dict = frequencies(target_string, result_list[topic_id])
#builds graph from these dictionaries, also applies multiple filters
print('[a]', 'Building graph.\t('+old_target_string+')')
@@ -474,20 +550,20 @@ def word_sense_induction(topic_id, topic_name, results):
T = components(G, H, target_string)
#matches senses to clusters
print('[a]', 'Disambiguating results.\t('+old_target_string+')')
D = disambiguate(T, H, results[topic_id], target_string)
print('[a]', 'Disambiguating results.\t('+old_target_string+')')
D = disambiguate(T, H, result_list[topic_id], target_string)
out_buffer += ('[A] Mapping: \n')
for cluster,results in D.items():
out_buffer += (' {}. : {}\n'.format(cluster, ', '.join([str(r) for r in results])))
for cluster,result_list in D.items():
out_buffer += (' {}. : {}\n'.format(cluster, ', '.join([str(r) for r in result_list])))
#prints buffer
print('[a]', 'Writing to file.\t('+old_target_string+')')
print(out_buffer)
#writes clustering to file
for cluster,results in D.items():
for result in results:
for cluster,result_list in D.items():
for result in result_list:
f.write(topic_id+'.'+str(cluster)+'\t'+topic_id+'.'+str(result)+'\n')
f.close()
@@ -526,7 +602,7 @@ def read_dataset(data_path):
def main():
# If absinth.py is run in test environment
# If absinth.py is run in test environment.
if '-t' in sys.argv:
data_path = config.test
else:
@@ -534,7 +610,13 @@ def main():
results, topics = read_dataset(data_path)
with Pool(2) as pool:
# Enables manual setting of process count.
if '-p' in sys.argv:
process_count = int(sys.argv[sys.argv.index('-p') + 1])
else:
process_count = 1
with Pool(process_count) as pool:
parameter_list = [(topic_id, topic_name, results)
for topic_id,topic_name in topics.items()]
pool.starmap(word_sense_induction, parameter_list)