From 6ba4aad4cb062ebd13172771b917b381e0fd4d8c Mon Sep 17 00:00:00 2001 From: zimmermann <zimmermann@cl.uni-heidelberg.de> Date: Sun, 18 Mar 2018 19:13:37 +0100 Subject: [PATCH] Spelling and lemmatisation. --- src/absinth.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/absinth.py b/src/absinth.py index b51a546..f9bbc22 100644 --- a/src/absinth.py +++ b/src/absinth.py @@ -168,7 +168,7 @@ def process_file(context_list, target_string, node_freq_dict, edge_freq_dict): # Add only tokens with allowed tags to nodes. elif token.tag_ in allowed_tag_list: - token_set.add(token.text) + token_set.add(token.lemma_) context_size = len(token_set) @@ -209,20 +209,20 @@ def build_graph(node_freq_dict, edge_freq_dict): tokens within every context the target occurs in. Returns: - cooccurence_graph: Filtered undirected dice weighted small word - cooccurence graph for a given target entity. + cooccurrence_graph: Filtered undirected dice weighted small word + cooccurrence graph for a given target entity. 
""" min_node_freq = config.min_node_freq min_edge_freq = config.min_edge_freq max_weight = config.max_weight - cooccurence_graph = nx.Graph() + cooccurrence_graph = nx.Graph() for node, frequency in node_freq_dict.items(): if frequency >= min_node_freq: - cooccurence_graph.add_node(node) + cooccurrence_graph.add_node(node) for node_tuple, frequency in edge_freq_dict.items(): @@ -230,11 +230,11 @@ def build_graph(node_freq_dict, edge_freq_dict): continue - elif node_tuple[0] not in cooccurence_graph.nodes: + elif node_tuple[0] not in cooccurrence_graph.nodes: continue - elif node_tuple[1] not in cooccurence_graph.nodes: + elif node_tuple[1] not in cooccurrence_graph.nodes: continue @@ -247,25 +247,25 @@ def build_graph(node_freq_dict, edge_freq_dict): prob_0 = cooccurrence_frequency / node0_frequency prob_1 = cooccurrence_frequency / node1_frequency - #best_weight = 1 - max(prob_0, prob_1) - dice_weight = 1 - ((prob_0 + prob_1) / 2) + best_weight = 1 - max(prob_0, prob_1) + #dice_weight = 1 - ((prob_0 + prob_1) / 2) - if dice_weight <= max_weight: + if best_weight <= max_weight: - cooccurence_graph.add_edge(*node_tuple, weight=dice_weight) + cooccurrence_graph.add_edge(*node_tuple, weight=best_weight) else: pass - return cooccurence_graph + return cooccurrence_graph def root_hubs(graph, edge_freq_dict): """Identifies senses (root hubs) by choosing nodes with high degrees Selects root hubs according to the algorithm in Véronis (2004). Nodes with - high degree and neighbors with low weights (high cooccurence) are chosen + high degree and neighbors with low weights (high cooccurrence) are chosen until there are no more viable candidates. A root hub candidate is every node that is not already a hub and is not a neighbor of one. -- GitLab