From 6ba4aad4cb062ebd13172771b917b381e0fd4d8c Mon Sep 17 00:00:00 2001
From: zimmermann <zimmermann@cl.uni-heidelberg.de>
Date: Sun, 18 Mar 2018 19:13:37 +0100
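Subject: [PATCH] Spelling and lemmatisation.

Fix the spelling of "cooccurrence", add token lemmas instead of token
texts to the token set, and weight edges with 1 - max(prob_0, prob_1)
instead of the averaged (dice) weight.
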
---
 src/absinth.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/absinth.py b/src/absinth.py
index b51a546..f9bbc22 100644
--- a/src/absinth.py
+++ b/src/absinth.py
@@ -168,7 +168,7 @@ def process_file(context_list, target_string, node_freq_dict, edge_freq_dict):
                         
                         # Add only tokens with allowed tags to nodes.
                         elif token.tag_ in allowed_tag_list:
-                            token_set.add(token.text)
+                            token_set.add(token.lemma_)
                             
                     context_size = len(token_set)
                     
@@ -209,20 +209,20 @@ def build_graph(node_freq_dict, edge_freq_dict):
             tokens within every context the target occurs in.
     
     Returns:
-        cooccurence_graph: Filtered undirected dice weighted small word 
-            cooccurence graph for a given target entity.
+        cooccurrence_graph: Filtered undirected dice weighted small word 
+            cooccurrence graph for a given target entity.
     """
     
     min_node_freq = config.min_node_freq
     min_edge_freq = config.min_edge_freq
     max_weight = config.max_weight
     
-    cooccurence_graph = nx.Graph()
+    cooccurrence_graph = nx.Graph()
     
     for node, frequency in node_freq_dict.items():
         
         if frequency >= min_node_freq:
-            cooccurence_graph.add_node(node)
+            cooccurrence_graph.add_node(node)
             
     for node_tuple, frequency in edge_freq_dict.items():
         
@@ -230,11 +230,11 @@ def build_graph(node_freq_dict, edge_freq_dict):
             
             continue
         
-        elif node_tuple[0] not in cooccurence_graph.nodes:
+        elif node_tuple[0] not in cooccurrence_graph.nodes:
             
             continue
         
-        elif node_tuple[1] not in cooccurence_graph.nodes:
+        elif node_tuple[1] not in cooccurrence_graph.nodes:
             
             continue
         
@@ -247,25 +247,25 @@ def build_graph(node_freq_dict, edge_freq_dict):
             prob_0 = cooccurrence_frequency / node0_frequency
             prob_1 = cooccurrence_frequency / node1_frequency
             
-            #best_weight = 1 - max(prob_0, prob_1)
-            dice_weight = 1 - ((prob_0 + prob_1) / 2)
+            best_weight = 1 - max(prob_0, prob_1)
+            #dice_weight = 1 - ((prob_0 + prob_1) / 2)
             
-            if dice_weight <= max_weight:
+            if best_weight <= max_weight:
                 
-                cooccurence_graph.add_edge(*node_tuple, weight=dice_weight)
+                cooccurrence_graph.add_edge(*node_tuple, weight=best_weight)
             
             else:
                 
                 pass
     
-    return cooccurence_graph
+    return cooccurrence_graph
 
 
 def root_hubs(graph, edge_freq_dict):
     """Identifies senses (root hubs) by choosing nodes with high degrees
     
     Selects root hubs according to the algorithm in Véronis (2004). Nodes with
-    high degree and neighbors with low weights (high cooccurence) are chosen
+    high degree and neighbors with low weights (high cooccurrence) are chosen
     until there are no more viable candidates. A root hub candidate is every
     node that is not already a hub and is not a neighbor of one.
     
-- 
GitLab