added comments

22fe70c4 · opitz · 22ddc850 · 22fe70c4
Commit 22fe70c4 authored 5 years ago by opitz
--- a/src/graph_helpers.py
+++ b/src/graph_helpers.py
@@ -23,6 +23,11 @@ def contract_gender(G, gender="male"):
    return G,vns[0][0]
 def simplify_text_description_nodes(G,node_index_dict,mode="None",min_freq=1):
+    """function takes our graph and simplifies text description nodes
+       E.g., feloniously stealing, on the 10th December, two silver watches ----> watches
+    """
    if mode == "None":
        return G, node_index_dict
    #collect all descriptions and their neighbor category
@@ -32,32 +37,42 @@ def simplify_text_description_nodes(G,node_index_dict,mode="None",min_freq=1):
    trialnodes=[n for n in G.nodes(data=True) if isinstance(n[1]["nodeobj"],dh.TrialNode)]
    descr_nodes=[]
    mask=[]
+    # we iterate over all trials
    for i,tn in enumerate(trialnodes):
        #get corresponding cat node
        catn = [n for n in G.neighbors(tn[0]) if isinstance(G.nodes[n]["nodeobj"],dh.OffenceNode)][0]
        catn=[catn,G.nodes[catn]]
-        #print(catn)
-        #catn=[cat]
        category=catn[1]["nodeobj"].d["category"]
        descr_vectors = None
        tid = None
+        # we iterate over all neighbors of the trial
        for nb in G[tn[0]]:
            for edge_id in G[tn[0]][nb]:
+                # and grab nodes which describe an offence
                if G[tn[0]][nb][edge_id]["edge_class"] == "with-offence-description":
-                    #print(G.nodes[nb])
+                    # we collect the noun chunk vectors
                    descr_vectors,_ = G.nodes[nb]["nodeobj"].get_noun_chunk_vectors()
                    descr_nodes.append(G.nodes[nb]["nodeobj"])
-                    #descr_nodes[-1].simplify_to_direct_object()
                    tid=(tn[0],nb)  
                    Xid.append(tid)
                    for dv in descr_vectors:
+                        #put noun chunk vector into training data
                        Xvector.append(dv)
+                        #put label into training data
                        related_cat.append(category)
    if mode=="classifier":
+        #fit a classifier to learn a mapping between noun chunks and labels
        clf=LogisticRegression()
        clf.fit(Xvector,related_cat)
+    # now we can remove the text description nodes and insert their simplified forms
    for i,idx in enumerate(Xid):
        if idx[1] in G:
            G.remove_node(idx[1])
            #node_index_dict.pop(descr_nodes[i])