Commit e96d083d authored by Victor Zimmermann

Added Support for Evolutionary Graph Clustering.

parent c71dbe78

absinth.py
@@ -33,8 +33,8 @@ import pprint
import random
import re
import spacy # for nlp
from multiprocessing import Pool
-from nltk.corpus import stopwords
+from copy import deepcopy
@@ -201,7 +201,7 @@ def process_file(context_list: list, target_string: str,
spaced_target_string = target_string.replace('_', ' ')
-stopword_list = set(stopwords.words('english') + config.stop_words)
+stopword_list = config.stop_words
allowed_tag_list = config.allowed_tags
min_context_size = config.min_context_size
@@ -227,7 +227,7 @@ def process_file(context_list: list, target_string: str,
pass
# Do not add stop words to nodes.
-elif token.text in stopword_list:
+elif token.is_stop or token.text in stopword_list:
pass
# Add only tokens with allowed tags to nodes.
@@ -548,8 +548,154 @@ def induce(topic_name: str, result_list: list) -> (nx.Graph, list, dict):
return graph, root_hub_list, stat_dict
+def colour_graph(graph: nx.Graph, root_hub_list: list) -> nx.Graph:
+    """Colours graph according to root hubs.
+
+    Evolving network that iteratively colours neighbouring nodes.
+
+    Args:
+        graph: Weighted undirected graph.
+        root_hub_list: List of senses.
+
+    Returns:
+        Coloured graph.
+    """
+
+    for node in graph.nodes:
+        if node in root_hub_list:
+            graph.node[node]['sense'] = root_hub_list.index(node)
+        else:
+            graph.node[node]['sense'] = None
+
+    max_iteration_count = config.max_colour_iteration_count
+
+    iteration_count = 0
+    stable = False
+    while not stable and iteration_count <= max_iteration_count:
+
+        graph_copy = deepcopy(graph)
+        iteration_count += 1
+        stable = True
+
+        for node in graph.nodes:
+
+            # Accumulate the pull of every coloured neighbour; light
+            # edges (low weight) pull harder.
+            neighbor_weight_list = [0] * len(root_hub_list)
+            for neighbor in graph_copy[node]:
+                if graph_copy.node[neighbor]['sense'] is None:
+                    pass
+                else:
+                    neighbor_weight_list[graph_copy.node[neighbor]['sense']] \
+                        += 1 - graph_copy[node][neighbor]['weight']
+
+            if any(neighbor_weight_list):
+                old_colour = graph_copy.node[node]['sense']
+                new_colour = np.argmax(neighbor_weight_list)
+                if old_colour != new_colour:
+                    stable = False
+                    graph.node[node]['sense'] = new_colour
+            else:
+                pass
+
+    return graph
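
A minimal usage sketch may help (editorial, not part of the commit): the node names and weights below are hypothetical, and the snippet assumes it runs inside this module, where config is available, and uses the networkx 1.x graph.node attribute API that the file relies on.

    # Hypothetical toy example for colour_graph(); weights act as
    # distances, so a low weight means a strong association.
    import networkx as nx
    toy = nx.Graph()
    toy.add_edge('bank', 'money', weight=0.2)
    toy.add_edge('money', 'loan', weight=0.1)
    toy.add_edge('bank', 'river', weight=0.3)
    toy.add_edge('river', 'shore', weight=0.1)
    coloured = colour_graph(toy, ['money', 'river'])
    # 'loan' ends up with sense 0 ('money'), 'shore' with sense 1
    # ('river'); 'bank' takes the sense whose neighbours pull harder.
    print({node: coloured.node[node]['sense'] for node in coloured.nodes})
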
+def disambiguate_colour(graph: nx.Graph, root_hub_list: list,
+                        context_list: list) -> dict:
+    """Maps contexts to root hubs using a coloured graph.
+
+    This algorithm colours the graph with the iterative, evolutionary
+    colouring implemented in colour_graph() and then scores each root
+    hub for a given context based on the coloured graph.
+
+    Args:
+        graph: Undirected weighted graph.
+        root_hub_list: List of root hubs (senses).
+        context_list: List of search result strings to be clustered.
+
+    Returns:
+        A dictionary with root hub IDs as keys and lists of context
+        indices as values.
+    """
+
+    coloured_graph = colour_graph(graph, root_hub_list)
+
+    mapping_dict = {i: list() for i in range(1, len(root_hub_list)+1)}
+
+    # If no sense is found for the target word, assume a single sense.
+    if len(root_hub_list) == 0:
+        mapping_dict = {0: [i for i in range(1, len(context_list)+1)]}
+        return mapping_dict
+
+    context_id = 0
+    for context in context_list:
+
+        context_id += 1
+        score = [0] * len(root_hub_list)
+        parsed_context = nlp(context)
+
+        for token in parsed_context:
+
+            if config.lemma:
+                text = token.lemma_
+            else:
+                text = token.text
+
+            if text in coloured_graph.nodes:
+
+                text_colour = coloured_graph.node[text]['sense']
+                if text_colour is None:
+                    pass
+                else:
+                    text_root = root_hub_list[text_colour]
+                    if nx.has_path(coloured_graph, text, text_root):
+
+                        shortest_path = nx.shortest_path(coloured_graph,
+                                                         text,
+                                                         text_root,
+                                                         'weight')
+                        total_weight = 0
+
+                        # Add weights of every sub-path.
+                        for i in range(1, len(shortest_path)):
+                            sub_from = shortest_path[i-1]
+                            sub_to = shortest_path[i]
+                            total_weight += \
+                                coloured_graph[sub_from][sub_to]['weight']
+
+                        # Tokens close to their hub score higher.
+                        score[text_colour] += 1 / (1 + total_weight)
+                    else:
+                        pass
+            else:
+                pass
+
+        if any(score):
+            mapping_dict[np.argmax(score)+1].append(context_id)
+        else:
+            pass
+
+    return mapping_dict
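
For intuition (an editorial note, not part of the commit): a token coloured with sense i whose shortest weighted path to its root hub sums to w contributes 1/(1+w) to score[i], so a path weight of 0.1 adds roughly 0.91 while a path weight of 0.9 adds only about 0.53; each context is then mapped to the hub with the highest accumulated score. Continuing the hypothetical toy graph above:

    # Hypothetical call; assumes nlp and config are initialised as in
    # this module, and that sense IDs shift up by one in the mapping.
    mapping = disambiguate_colour(toy, ['money', 'river'],
                                  ['loan money bank', 'river shore'])
    # Expected shape: {1: [1], 2: [2]} -- context 1 goes to hub 1
    # ('money'), context 2 to hub 2 ('river').
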
-def disambiguate(graph: nx.Graph, root_hub_list: list,
+def disambiguate_mst(graph: nx.Graph, root_hub_list: list,
                     context_list: list, topic_name: str) -> dict:
"""Matches contexts to senses.
@@ -568,7 +714,6 @@ def disambiguate(graph: nx.Graph, root_hub_list: list,
"""
#performs minimum_spanning_tree algorithm on graph
-print('[a]', 'Building minimum spanning tree.\t('+topic_name+')')
minimum_spanning_tree = components(graph, root_hub_list, topic_name)
spaced_topic_name = topic_name.replace('_', ' ')
@@ -581,7 +726,9 @@ def disambiguate(graph: nx.Graph, root_hub_list: list,
#if no sense is found for a target word, we should assume that there is only one sense
if len(root_hub_list) == 0:
-return {0:[i for i in range(1, len(context_list)+1)]}
+mapping_dict = {0:[i for i in range(1, len(context_list)+1)]}
+return mapping_dict
idx = 0
@@ -639,8 +786,8 @@ def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
"""Calls induction and disambiguation functions, performs main task.
The task is to both induce senses and match search results to them. This
-function calls in much the same way induce() and disambiguate() to perform
-these sub tasks. The result is then written to the output directory
+function calls in much the same way induce() and disambiguate_mst() to
+perform these sub tasks. The result is then written to the output directory
specified in config.py.
Args:
@@ -657,8 +804,15 @@ def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
#matches senses to clusters
print('[a]', 'Disambiguating result_list.\t('+topic_name+')')
-mapping_dict = disambiguate(graph, root_hub_list,
-                            result_dict[topic_id], topic_name)
+if config.use_colouring:
+    print('[a]', 'Colouring graph.\t('+topic_name+')')
+    mapping_dict = disambiguate_colour(graph, root_hub_list,
+                                       result_dict[topic_id])
+else:
+    print('[a]', 'Building minimum spanning tree.\t('+topic_name+')')
+    mapping_dict = disambiguate_mst(graph, root_hub_list,
+                                    result_dict[topic_id], topic_name)
#collect statistics from result.
cluster_count = 0
@@ -696,6 +850,13 @@ def main(topic_id: int, topic_name: str, result_dict: dict) -> None:
if __name__ == '__main__':
"""Check for modifiers and call main().
+Only called when absinth.py is started manually. Checks for various
+modifiers, i.e. test environment and number of processes to run
+simultaneously.
+"""
# If absinth.py is run in test environment.
if '-t' in sys.argv:
data_path = config.test
@@ -714,6 +875,3 @@ if __name__ == '__main__':
parameter_list = [(topic_id, topic_name, result_dict)
for topic_id,topic_name in topic_dict.items()]
pool.starmap(main, parameter_list)
-#for topic_id,topic_name in topics.items():
-#word_sense_induction(topic_id,topic_name, results)

config.py
@@ -43,10 +43,16 @@ max_weight = 0.9
Choose the minimum number of neighbors and the maximum median weight of the most frequent neighbors of a node for root hubs.
- the threshold is calculated using the median of the same number of neighbors declared in min_neighbors.
'''
-min_neighbors = 5
+min_neighbors = 4
threshold = 0.8
'''
Choose whether or not the tokens should be lemmatised.
'''
-lemma = True
+lemma = False
+'''
+Options for evolutionary graph colouring.
+'''
+use_colouring = True
+max_colour_iteration_count = 50
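
How the two new options interact with the code above (a reading of the diff, not additional committed behaviour):

    # use_colouring = True  -> main() calls disambiguate_colour(), the
    #                          evolutionary graph colouring path.
    # use_colouring = False -> main() falls back to disambiguate_mst(),
    #                          the previous minimum-spanning-tree method.
    # max_colour_iteration_count caps the while loop in colour_graph():
    # if no stable colouring is reached within 50 iterations, the last
    # (possibly unconverged) colouring is returned.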