diff --git a/src/absinth.py b/src/absinth.py index b51a5462419dabd4e7f2a1fb4f829a5ce7616a4a..f9bbc220f05adbdc0a1c5180f2c82a0f1e2389d1 100644 --- a/src/absinth.py +++ b/src/absinth.py @@ -168,7 +168,7 @@ def process_file(context_list, target_string, node_freq_dict, edge_freq_dict): # Add only tokens with allowed tags to nodes. elif token.tag_ in allowed_tag_list: - token_set.add(token.text) + token_set.add(token.lemma_) context_size = len(token_set) @@ -209,20 +209,20 @@ def build_graph(node_freq_dict, edge_freq_dict): tokens within every context the target occurs in. Returns: - cooccurence_graph: Filtered undirected dice weighted small word - cooccurence graph for a given target entity. + cooccurrence_graph: Filtered undirected dice weighted small word + cooccurrence graph for a given target entity. """ min_node_freq = config.min_node_freq min_edge_freq = config.min_edge_freq max_weight = config.max_weight - cooccurence_graph = nx.Graph() + cooccurrence_graph = nx.Graph() for node, frequency in node_freq_dict.items(): if frequency >= min_node_freq: - cooccurence_graph.add_node(node) + cooccurrence_graph.add_node(node) for node_tuple, frequency in edge_freq_dict.items(): @@ -230,11 +230,11 @@ def build_graph(node_freq_dict, edge_freq_dict): continue - elif node_tuple[0] not in cooccurence_graph.nodes: + elif node_tuple[0] not in cooccurrence_graph.nodes: continue - elif node_tuple[1] not in cooccurence_graph.nodes: + elif node_tuple[1] not in cooccurrence_graph.nodes: continue @@ -247,25 +247,25 @@ def build_graph(node_freq_dict, edge_freq_dict): prob_0 = cooccurrence_frequency / node0_frequency prob_1 = cooccurrence_frequency / node1_frequency - #best_weight = 1 - max(prob_0, prob_1) - dice_weight = 1 - ((prob_0 + prob_1) / 2) + best_weight = 1 - max(prob_0, prob_1) + #dice_weight = 1 - ((prob_0 + prob_1) / 2) - if dice_weight <= max_weight: + if best_weight <= max_weight: - cooccurence_graph.add_edge(*node_tuple, weight=dice_weight) + cooccurrence_graph.add_edge(*node_tuple, weight=best_weight) else: pass - return cooccurence_graph + return cooccurrence_graph def root_hubs(graph, edge_freq_dict): """Identifies senses (root hubs) by choosing nodes with high degrees Selects root hubs according to the algorithm in Véronis (2004). Nodes with - high degree and neighbors with low weights (high cooccurence) are chosen + high degree and neighbors with low weights (high cooccurrence) are chosen until there are no more viable candidates. A root hub candidate is every node that is not already a hub and is not a neighbor of one.