Skip to content
Snippets Groups Projects
Commit 22ddc850 authored by opitz's avatar opitz
Browse files

added comments

parent 21294cf9
No related branches found
No related tags found
No related merge requests found
......@@ -84,6 +84,8 @@ class HasDescriptionNode(dict):
return all([other.d[k] == self.d[k] for k in [ "text"]])
def simplify_to_direct_object(self):
#use spacy to extract direct object, useful for theft trials: "what was stolen?!"
logging.info("before simplification: {}".format(self.d["text"]))
doc = nlp(self.d["text"])
#find direct object
......@@ -100,6 +102,9 @@ class HasDescriptionNode(dict):
def get_noun_chunk_vectors(self):
"""Use spacy to extract noun chunks and corresponding word vectors from the raw text
"""
vecs=[]
ncs = []
doc = nlp(self.d["text"])
......@@ -135,14 +140,21 @@ class HasDescriptionNode(dict):
return vecs,ncs
def simplify(self,classifier,labelindicator):
"""Uses a predictor to extract highly correlated (with label) noun chunks from the raw text
E.g., when label = punishment, we check the probability of punishment given a noun chunks
"""
vecs,ncs = self.get_noun_chunk_vectors()
newcopy = HasDescriptionNode(None,"None")
if not ncs:
return newcopy
maxlen=max([len(x) for x in ncs])
#for each noun chunk collect the probability that label= specific label
preds = [(ncs[i],classifier.predict_proba([vec])[0][labelindicator]) for i,vec in enumerate(vecs)]
#add some heuristic weights: noun chunks which appear earlier should be weighted higher
#and very long noun chunks should be punished
weights = lambda x: [(1 + (1 -len(x)/maxlen)), (1/(1+x.start))]
coefs = [0.5,0.2]
preds = [(x,sum([y*weights(x)[i]*coefs[i] for i in range(len(coefs))] )) for x,y in preds]
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment