added comments

22ddc850 · opitz · 21294cf9 · 22ddc850
Commit 22ddc850 authored 5 years ago by opitz
--- a/src/data_helpers.py
+++ b/src/data_helpers.py
@@ -84,6 +84,8 @@ class HasDescriptionNode(dict):
        return all([other.d[k] == self.d[k] for k in [ "text"]])
     
    def simplify_to_direct_object(self):
+
+        #use spaCy to extract the direct object; useful for theft trials ("what was stolen?")
        logging.info("before simplification: {}".format(self.d["text"]))
        doc = nlp(self.d["text"])
        #find direct object
@@ -100,6 +102,9 @@ class HasDescriptionNode(dict):
    

    def get_noun_chunk_vectors(self):
+        """Use spacy to extract noun chunks and corresponding word vectors from the raw text
+
+        """
        vecs=[]
        ncs = []
        doc = nlp(self.d["text"])
@@ -135,14 +140,21 @@ class HasDescriptionNode(dict):
        return vecs,ncs

    def simplify(self,classifier,labelindicator):
+        """Uses a predictor to extract highly correlated (with label) noun chunks from the raw text
+            
+           E.g., when label = punishment, we check the probability of punishment given a noun chunk
+        """
        vecs,ncs = self.get_noun_chunk_vectors()
        newcopy = HasDescriptionNode(None,"None")
        if not ncs:
            return newcopy
        maxlen=max([len(x) for x in ncs])
+
+        #for each noun chunk, collect the predicted probability of the class at index labelindicator
        preds = [(ncs[i],classifier.predict_proba([vec])[0][labelindicator]) for i,vec in enumerate(vecs)]
        
-        
+        #add heuristic weights: noun chunks that start earlier in the text are weighted higher,
+        #and noun chunks that are long relative to the longest chunk are penalized
        weights = lambda x: [(1 + (1 -len(x)/maxlen)), (1/(1+x.start))]
        coefs = [0.5,0.2]
        preds = [(x,sum([y*weights(x)[i]*coefs[i] for i in range(len(coefs))] )) for x,y in preds]