diff --git a/src/data_helpers.py b/src/data_helpers.py index ddd9c7ae7598dcf3ccadf71c2a26e5d39cfd4c01..faaacbb424806b27d1bec90f534064c752651276 100644 --- a/src/data_helpers.py +++ b/src/data_helpers.py @@ -84,6 +84,8 @@ class HasDescriptionNode(dict): return all([other.d[k] == self.d[k] for k in [ "text"]]) def simplify_to_direct_object(self): + + #use spaCy to extract the direct object; useful for theft trials: "what was stolen?!" logging.info("before simplification: {}".format(self.d["text"])) doc = nlp(self.d["text"]) #find direct object @@ -100,6 +102,9 @@ class HasDescriptionNode(dict): def get_noun_chunk_vectors(self): + """Use spaCy to extract noun chunks and the corresponding word vectors from the raw text. + + """ vecs=[] ncs = [] doc = nlp(self.d["text"]) @@ -135,14 +140,21 @@ class HasDescriptionNode(dict): return vecs,ncs def simplify(self,classifier,labelindicator): + """Uses a predictor to extract noun chunks from the raw text that are highly correlated with the label. + + E.g., when label = punishment, we check the probability of punishment given a noun chunk. + """ vecs,ncs = self.get_noun_chunk_vectors() newcopy = HasDescriptionNode(None,"None") if not ncs: return newcopy maxlen=max([len(x) for x in ncs]) + + #for each noun chunk, collect the probability that the label equals the target label preds = [(ncs[i],classifier.predict_proba([vec])[0][labelindicator]) for i,vec in enumerate(vecs)] - + #add heuristic weights: noun chunks that appear earlier should be weighted higher #and very long noun chunks should be penalized weights = lambda x: [(1 + (1 -len(x)/maxlen)), (1/(1+x.start))] coefs = [0.5,0.2] preds = [(x,sum([y*weights(x)[i]*coefs[i] for i in range(len(coefs))] )) for x,y in preds]