diff --git a/src/absinth.py b/src/absinth.py
index dba97025dd8e8b5c3b02712ed9db493dd278b880..601f6103cdb4a33a7f574445ca7b44102ea1ef21 100644
--- a/src/absinth.py
+++ b/src/absinth.py
@@ -33,8 +33,8 @@ import pprint
 import random
 import re
 import spacy # for nlp
+from multiprocessing import Pool
-from nltk.corpus import stopwords
 from copy import deepcopy
@@ -201,7 +201,7 @@ def process_file(context_list: list, target_string: str,
 
     spaced_target_string = target_string.replace('_', ' ')
 
-    stopword_list = set(stopwords.words('english') + config.stop_words)
+    stopword_list = config.stop_words
     allowed_tag_list = config.allowed_tags
     min_context_size = config.min_context_size
@@ -227,7 +227,7 @@ def process_file(context_list: list, target_string: str,
             pass
 
         # Do not add stop words to nodes.
-        elif token.text in stopword_list:
+        elif token.is_stop or token.text in stopword_list:
            pass
 
         # Add only tokens with allowed tags to nodes.
@@ -548,8 +548,154 @@ def induce(topic_name: str, result_list: list) -> (nx.Graph, list, dict):
 
     return graph, root_hub_list, stat_dict
 
+def colour_graph(graph: nx.Graph, root_hub_list: list) -> nx.Graph:
+    """Colours the graph according to its root hubs.
+
+    Iteratively propagates the root hubs' colours to neighbouring nodes
+    until the colouring is stable.
+
+    Args:
+        graph: Weighted undirected graph.
+        root_hub_list: List of senses.
+
+    Returns:
+        Coloured graph.
+    """
+
+    for node in graph.nodes:
+        if node in root_hub_list:
+            graph.node[node]['sense'] = root_hub_list.index(node)
+        else:
+            graph.node[node]['sense'] = None
+
+    max_iteration_count = config.max_colour_iteration_count
+
+    iteration_count = 0
+    stable = False
+    while not stable and iteration_count <= max_iteration_count:
+
+        graph_copy = deepcopy(graph)
+        iteration_count += 1
+        stable = True
+
+        for node in graph.nodes:
+
+            neighbor_weight_list = [0] * len(root_hub_list)
+
+            for neighbor in graph_copy[node]:
+
+                if graph_copy.node[neighbor]['sense'] is None:
+                    pass
+                else:
+                    neighbor_weight_list[graph_copy.node[neighbor]['sense']] \
+                        += 1 - graph_copy[node][neighbor]['weight']
+
+            if any(neighbor_weight_list):
+
+                old_colour = graph_copy.node[node]['sense']
+                new_colour = np.argmax(neighbor_weight_list)
+
+                if old_colour != new_colour:
+                    stable = False
+                    graph.node[node]['sense'] = new_colour
+
+                else:
+                    pass
+
+            else:
+
+                pass
+
+    return graph
+
+
+def disambiguate_colour(graph: nx.Graph, root_hub_list: list,
+                        context_list: list) -> dict:
+    """Clusters search results to root hubs using a coloured graph.
+
+    This algorithm colours the graph by iteratively propagating the root
+    hubs' colours (colour_graph) and calculates a score for each root hub
+    given a context based on this graph.
+
+    Args:
+        graph: Undirected weighted graph.
+        root_hub_list: List of root hubs (senses).
+        context_list: List of search result strings to be clustered.
+
+    Returns:
+        A dictionary with root hub IDs as keys and context indices as values.
+ """ + + coloured_graph = colour_graph(graph, root_hub_list) + + mapping_dict = {i:list() for i in range(1,len(root_hub_list)+1)} + + if len(root_hub_list) == 0: + + mapping_dict = {0:[i for i in range(1, len(context_list)+1)]} + + return mapping_dict + + context_id = 0 + for context in context_list: + + context_id += 1 + score = [0]*len(root_hub_list) + parsed_context = nlp(context) + + for token in parsed_context: + + if config.lemma == True: + text = token.lemma_ + else: + text = token.text + + if text in coloured_graph.nodes: + + text_colour = coloured_graph.node[text]['sense'] + + if text_colour == None: + + pass + + else: + + text_root = root_hub_list[text_colour] + + if nx.has_path(coloured_graph , text, text_root): + + + shortest_path = nx.shortest_path(coloured_graph , + text, + root_hub_list[text_colour], + 'weight') + total_weight = 0 + + # Add weights of every sub-path. + for i in range(1, len(shortest_path)): + sub_from, sub_to = shortest_path[i-1], shortest_path[i] + total_weight += \ + coloured_graph [sub_from][sub_to]['weight'] -def disambiguate(graph: nx.Graph, root_hub_list: list, + + score[text_colour] += 1/(1+total_weight) + + else: + pass + + else: + pass + + if any(score): + + mapping_dict[np.argmax(score)+1].append(context_id) + + else: + + pass + + + return mapping_dict + + +def disambiguate_mst(graph: nx.Graph, root_hub_list: list, context_list: list, topic_name: str) -> dict: """Matches contexts to senses. @@ -568,7 +714,6 @@ def disambiguate(graph: nx.Graph, root_hub_list: list, """ #performs minimum_spanning_tree algorithm on graph - print('[a]', 'Building minimum spanning tree.\t('+topic_name+')') minimum_spanning_tree = components(graph, root_hub_list, topic_name) spaced_topic_name = topic_name.replace('_', ' ') @@ -581,7 +726,9 @@ def disambiguate(graph: nx.Graph, root_hub_list: list, #if no sense is found for a target word, we should assume that there only is one sense if len(root_hub_list) == 0: - return {0:[i for i in range(1, len(context_list)+1)]} + mapping_dict = {0:[i for i in range(1, len(context_list)+1)]} + + return mapping_dict idx = 0 @@ -639,8 +786,8 @@ def main(topic_id: int, topic_name: str, result_dict: dict) -> None: """Calls induction and disambiguation functions, performs main task. The task is to both induce senses and match search results to them. This - function calls in much the same way induce() and disambiguate() to perform - these sub tasks. The result is then written to the output directory + function calls in much the same way induce() and disambiguate_mst() to + perform these sub tasks. The result is then written to the output directory specified in config.py. Args: @@ -657,8 +804,15 @@ def main(topic_id: int, topic_name: str, result_dict: dict) -> None: #matches senses to clusters print('[a]', 'Disambiguating result_list.\t('+topic_name+')') - mapping_dict = disambiguate(graph, root_hub_list, - result_dict[topic_id], topic_name) + if config.use_colouring == True: + print('[a]', 'Colouring graph.\t('+topic_name+')') + mapping_dict = disambiguate_colour(graph, root_hub_list, + result_dict[topic_id]) + else: + + print('[a]', 'Building minimum spanning tree.\t('+topic_name+')') + mapping_dict = disambiguate_mst(graph, root_hub_list, + result_dict[topic_id], topic_name) #collect statistics from result. cluster_count = 0 @@ -696,6 +850,13 @@ def main(topic_id: int, topic_name: str, result_dict: dict) -> None: if __name__ == '__main__': + """Check for modifiers and call main(). + + Only called when absinth.py is started manually. 
diff --git a/src/config.py b/src/config.py
index 5e22df01c61feb62847707c886ace5225efc4d8a..d40c5f2612036a91c03a835a5a512db029ea53ee 100644
--- a/src/config.py
+++ b/src/config.py
@@ -43,10 +43,16 @@ max_weight = 0.9
 
 Choose minimum number of neighbors and maximum median weight of the most
 frequent neighbors of a node for root hubs.
- the threshold is calculated using the median of the same number of neighbors declared in min_neighbors.
 '''
-min_neighbors = 5
+min_neighbors = 4
 threshold = 0.8
 
 '''
 Choose whether or not the tokens should be lemmatised.
 '''
-lemma = True
+lemma = False
+
+'''
+Choose whether to colour the graph for disambiguation and the maximum number of colouring iterations.
+'''
+use_colouring = True
+max_colour_iteration_count = 50
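For reference, `disambiguate_colour` scores a context token against its root hub as `1 / (1 + total_weight)`, where `total_weight` is the sum of edge weights along the weighted shortest path from token to hub, so nearby (low-weight) tokens contribute close to 1 and distant ones close to 0. A minimal sketch of that computation — graph, node names, and weights are invented for illustration:

```python
import networkx as nx

G = nx.Graph()
G.add_edge('bank', 'money', weight=0.2)
G.add_edge('money', 'deposit', weight=0.3)

token, root_hub = 'deposit', 'bank'
if nx.has_path(G, token, root_hub):
    path = nx.shortest_path(G, token, root_hub, weight='weight')
    # Sum the weights of every sub-path: deposit -> money -> bank = 0.3 + 0.2.
    total_weight = sum(G[a][b]['weight'] for a, b in zip(path, path[1:]))
    score = 1 / (1 + total_weight)
    print(path, round(score, 3))  # ['deposit', 'money', 'bank'] 0.667
```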