From 22ddc8500b40af7f1b35da0569a21343dc28ec10 Mon Sep 17 00:00:00 2001
From: opi <opitz@cl.uni-heidelberg.de>
Date: Tue, 23 Jul 2019 15:45:54 +0200
Subject: [PATCH] added comments

---
 src/data_helpers.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/data_helpers.py b/src/data_helpers.py
index ddd9c7a..faaacbb 100644
--- a/src/data_helpers.py
+++ b/src/data_helpers.py
@@ -84,6 +84,8 @@ class HasDescriptionNode(dict):
         return all([other.d[k] == self.d[k] for k in [ "text"]])
      
     def simplify_to_direct_object(self):
+
+        # Use spaCy to extract the direct object; useful for theft trials ("what was stolen?")
         logging.info("before simplification: {}".format(self.d["text"]))
         doc = nlp(self.d["text"])
         #find direct object
@@ -100,6 +102,9 @@ class HasDescriptionNode(dict):
     
 
     def get_noun_chunk_vectors(self):
+        """Use spacy to extract noun chunks and corresponding word vectors from the raw text
+
+        """
         vecs=[]
         ncs = []
         doc = nlp(self.d["text"])
@@ -135,14 +140,21 @@ class HasDescriptionNode(dict):
         return vecs,ncs
 
     def simplify(self,classifier,labelindicator):
+        """Use a predictor to extract noun chunks that are highly correlated with the label from the raw text
+            
+           E.g., when label = punishment, we check the probability of punishment given a noun chunk
+        """
         vecs,ncs = self.get_noun_chunk_vectors()
         newcopy = HasDescriptionNode(None,"None")
         if not ncs:
             return newcopy
         maxlen=max([len(x) for x in ncs])
+
+        # for each noun chunk, collect the predicted probability of the label selected by labelindicator
         preds = [(ncs[i],classifier.predict_proba([vec])[0][labelindicator]) for i,vec in enumerate(vecs)]
         
-        
+        # add heuristic weights: noun chunks that appear earlier are weighted higher,
+        # and very long noun chunks are penalized
         weights = lambda x: [(1 + (1 -len(x)/maxlen)), (1/(1+x.start))]
         coefs = [0.5,0.2]
         preds = [(x,sum([y*weights(x)[i]*coefs[i] for i in range(len(coefs))] )) for x,y in preds] 
-- 
GitLab