diff --git a/src/data_helpers.py b/src/data_helpers.py index e240acbdf79df9cbfb3188cce9a3e0a2ae8efe04..faaacbb424806b27d1bec90f534064c752651276 100644 --- a/src/data_helpers.py +++ b/src/data_helpers.py @@ -84,6 +84,8 @@ class HasDescriptionNode(dict): return all([other.d[k] == self.d[k] for k in [ "text"]]) def simplify_to_direct_object(self): + + #use spacy to extract direct object, useful for theft trials: "what was stolen?!" logging.info("before simplification: {}".format(self.d["text"])) doc = nlp(self.d["text"]) #find direct object @@ -100,6 +102,9 @@ class HasDescriptionNode(dict): def get_noun_chunk_vectors(self): + """Use spacy to extract noun chunks and corresponding word vectors from the raw text + + """ vecs=[] ncs = [] doc = nlp(self.d["text"]) @@ -135,14 +140,21 @@ class HasDescriptionNode(dict): return vecs,ncs def simplify(self,classifier,labelindicator): + """Uses a predictor to extract highly correlated (with label) noun chunks from the raw text + + E.g., when label = punishment, we check the probability of punishment given a noun chunk + """ vecs,ncs = self.get_noun_chunk_vectors() newcopy = HasDescriptionNode(None,"None") if not ncs: return newcopy maxlen=max([len(x) for x in ncs]) + + #for each noun chunk collect the probability that label= specific label preds = [(ncs[i],classifier.predict_proba([vec])[0][labelindicator]) for i,vec in enumerate(vecs)] - + #add some heuristic weights: noun chunks which appear earlier should be weighted higher + #and very long noun chunks should be punished weights = lambda x: [(1 + (1 -len(x)/maxlen)), (1/(1+x.start))] coefs = [0.5,0.2] preds = [(x,sum([y*weights(x)[i]*coefs[i] for i in range(len(coefs))] )) for x,y in preds] @@ -168,20 +180,16 @@ class HasDescriptionNode(dict): class HasCategoryNode(dict): - def __init__(self,elm,cat,subcat=False,dummy=False,text=False): + def __init__(self,elm,cat,subcat=False,dummy=False): self.d = {} if dummy: self.d["category"] ="DUMMYCAT" if subcat: self.d["subcategory"] 
="DUMMYSUBCAT" - if text: - self.d["text"] = "DUMMYTEXT" else: self.d["category"] = maybe_first_value(elm.findAll("interp",{"type":cat+"Category"})) if subcat: self.d["subcategory"] = maybe_first_value(elm.findAll("interp",{"type":subcat+"Subcategory"})) - if text: - self.d["text"] = get_text(elm) dict.__init__(self, self.d) def __eq__(self, other): diff --git a/src/graph_builder.py b/src/graph_builder.py index 91ca253b88c896cebd1b5a23f45f1d99e32caad3..46204f9686f61e6e9224e4090543521a428dc12d 100644 --- a/src/graph_builder.py +++ b/src/graph_builder.py @@ -9,15 +9,19 @@ from operator import itemgetter import graph_helpers as gh import logging -subcat=False -text=False -def build_graph(years=[],include_utterances=False): +def build_graph(years=[],include_utterances=False,subcat=False): + #a small helper function + def maybe_subcat(string): + if not subcat: + return False + else: return string + # we initialize an empty graph G = nx.MultiGraph() - + #index to keep track of nodes nodeindex=0 @@ -33,16 +37,11 @@ def build_graph(years=[],include_utterances=False): #and the year year = exact_date[:4] - + #add a trial node trialnode = dh.TrialNode(exact_date,fp+"_"+trial_index_infile) nodeindex,trialindex = gh.maybe_new_index(trialnode,node_index_dict) G.add_node(trialindex, label="Trial", nodeobj=trialnode) - - #yearnode = dh.YearNode(year) - #nodeindex,yindex = gh.maybe_new_index(yearnode,node_index_dict) - #G.add_node(yindex, label="Year", nodeobj=yearnode) - #G.add_edge(trialindex,yindex,edge_class="in-year") @@ -51,18 +50,16 @@ def build_graph(years=[],include_utterances=False): #offs = [dh.create_cat_elm(of,"offence") for of in offs] descriptions = [dh.HasDescriptionNode(of) for of in offs] logging.info("descriptions found: {}".format(descriptions)) - #[desc.simplify_to_direct_object() for desc in descriptions] - offs = [dh.OffenceNode(of,"offence",subcat=subcat,dummy=False,text=text) for of in offs] - #print(descriptions) - #asd + offs = 
[dh.OffenceNode(of,"offence",subcat=maybe_subcat("offence"),dummy=False) for of in offs] + for i,off in enumerate(offs): - #each offence has a text and a (symbolic) category + #add offence node nodeindex,offindex = gh.maybe_new_index(off,node_index_dict) G.add_node(offindex, label="Offence", nodeobj=off) G.add_edge(trialindex,offindex,edge_class="with-offence") - #nodeindex+=1 - + + # add textual description node nodeindex,descr_index = gh.maybe_new_index(descriptions[i],node_index_dict) G.add_node(descr_index, label="Description", nodeobj=descriptions[i]) G.add_edge(trialindex,descr_index,edge_class="with-offence-description") @@ -70,10 +67,10 @@ def build_graph(years=[],include_utterances=False): #similar to offences vs = div.findAll("rs",{"type":"verdictDescription"}) - vs = [dh.VerdictNode(v,"verdict",subcat=subcat,dummy=False,text=text) for v in vs] + vs = [dh.VerdictNode(v,"verdict",subcat=maybe_subcat("verdict"),dummy=False) for v in vs] if not vs: logging.warning("warning no verdict found inserting dummy") - vn=dh.VerdictNode({},"",subcat=subcat,dummy=True,text=False) + vn=dh.VerdictNode({},"",subcat=maybe_subcat("dummy"),dummy=True,text=False) nodeindex,vindex = gh.maybe_new_index(vn,node_index_dict) G.add_node(vindex, label="Verdict", nodeobj=vn) G.add_edge(trialindex,vindex,edge_class="with-verdict") @@ -86,7 +83,7 @@ def build_graph(years=[],include_utterances=False): # similar to offences puns = div.findAll("rs",{"type":"punishmentDescription"}) - puns = [dh.PunishmentNode(pun,"punishment",subcat=subcat,dummy=False,text=text) for pun in puns] + puns = [dh.PunishmentNode(pun,"punishment",subcat=maybe_subcat("punishment"),dummy=False) for pun in puns] for pun in puns: nodeindex,punindex = gh.maybe_new_index(pun,node_index_dict) @@ -101,6 +98,7 @@ def build_graph(years=[],include_utterances=False): ds = [dh.NamedEntityNode(dname) for dname in dnames] for d in ds: + #insert a defendant node and connect it to the trial nodeindex,dindex = 
gh.maybe_new_index(d,node_index_dict) G.add_node(dindex, label=d.get_fullname(), nodeobj=d) G.add_edge(trialindex,dindex,edge_class="with-defendant") @@ -112,7 +110,7 @@ def build_graph(years=[],include_utterances=False): G.add_node(victindex, label=vict.get_fullname(), nodeobj=vict) G.add_edge(trialindex,victindex,edge_class="with-victim") - #we get the text from the trial account to extract named entities and utterances + #we get the text from the trial account to extract various other named entities and utterances ps = div.findAll("p") ps = [p for p in ps if len(p.findAll("persname")) == 1 and len(p.findAll("u"))] ents = [ p.findAll("persname")[0] for p in ps] diff --git a/src/graph_helpers.py b/src/graph_helpers.py index 359f92e585e3f5689a389b0ad29bca60163e49f1..581c0d1e34628382dedc8bd1368fed3cc59c3d31 100644 --- a/src/graph_helpers.py +++ b/src/graph_helpers.py @@ -23,6 +23,11 @@ def contract_gender(G, gender="male"): return G,vns[0][0] def simplify_text_description_nodes(G,node_index_dict,mode="None",min_freq=1): """function takes our graph and simplifies text description nodes E.g., felouneously stealing, on the 10th Decembre, two silver watches ----> watches + + """ + if mode == "None": return G, node_index_dict #collect all descriptions and their neighbor category @@ -32,32 +37,42 @@ def simplify_text_description_nodes(G,node_index_dict,mode="None",min_freq=1): trialnodes=[n for n in G.nodes(data=True) if isinstance(n[1]["nodeobj"],dh.TrialNode)] descr_nodes=[] mask=[] + + # we iterate over all trials for i,tn in enumerate(trialnodes): #get corresponding cat node catn = [n for n in G.neighbors(tn[0]) if isinstance(G.nodes[n]["nodeobj"],dh.OffenceNode)][0] catn=[catn,G.nodes[catn]] - #print(catn) - #catn=[cat] category=catn[1]["nodeobj"].d["category"] descr_vectors = None tid = None + # we iterate over all neighbors of the trial for nb in G[tn[0]]: for edge_id in G[tn[0]][nb]: + + # and grab nodes which describe an offence if 
G[tn[0]][nb][edge_id]["edge_class"] == "with-offence-description": - #print(G.nodes[nb]) + + # we collect the noun chunk vectors descr_vectors,_ = G.nodes[nb]["nodeobj"].get_noun_chunk_vectors() descr_nodes.append(G.nodes[nb]["nodeobj"]) - #descr_nodes[-1].simplify_to_direct_object() tid=(tn[0],nb) Xid.append(tid) for dv in descr_vectors: + #put noun chunk vector into training data Xvector.append(dv) + #put label into training data related_cat.append(category) if mode=="classifier": + #fit a classifier to learn a mapping between noun chunks and labels clf=LogisticRegression() clf.fit(Xvector,related_cat) + + + # now we can remove the text description nodes and insert their simplified forms for i,idx in enumerate(Xid): + if idx[1] in G: G.remove_node(idx[1]) #node_index_dict.pop(descr_nodes[i]) diff --git a/src/main.py b/src/main.py index 5f9429e8fd02d5b30ed32f0feeff3c1665db7295..05a841acf01adf810822c06ee519de79d51c2a6b 100644 --- a/src/main.py +++ b/src/main.py @@ -26,6 +26,9 @@ parser.add_argument('-verbose',choices=[0,1,2],type=int,default=0, parser.add_argument('-prune_text_nodes_min_freq',type=int,default=2, help='how often a text description node has to occur to be included in the graph') + +parser.add_argument('--subcategories',dest='subcat',action='store_true', + help='do we want subcategories (e.g. 
category punishment: corporal, subcategory: whipping), there will be many more nodes with subcat enabled') args = parser.parse_args() @@ -42,7 +45,7 @@ def write(path,G): f.write(json.dumps(nx.readwrite.json_graph.node_link_data(G))) -G,index_dict = gb.build_graph(years=args.year) +G,index_dict = gb.build_graph(years=args.year,subcat=args.subcat) print("Graph info before text description node simplification:") print(nx.info(G)) gh.simplify_text_description_nodes(G,index_dict,mode=args.text_node_simplification_mode,min_freq=args.prune_text_nodes_min_freq) diff --git a/visualization/README.md b/visualization/README.md index 0ac1edb660fc82a083e4fdfb81eb8514ee7ee9e2..62f8dfde9f2b2191efa845e02aa86655545ff5de 100644 --- a/visualization/README.md +++ b/visualization/README.md @@ -6,4 +6,28 @@ This directory contains the visualization suite for the workshop ""Providing new 1. Active internet connection as some of the dependencies are loaded via CDNs 2. If you use _Firefox_, just open the file index.html in it 3. If you use a different browser, you need to set up a simple web server; with Python 3, you just need to run "python -m http.server" in this directory - * Per default, this should serve the app on port 8000 of localhost (access via 0.0.0.0:8000 or localhost:8000 in your browser) \ No newline at end of file + * Per default, this should serve the app on port 8000 of localhost (access via 0.0.0.0:8000 or localhost:8000 in your browser) + +## How to use the visualization suite +After accessing the suite in your browser, upload a graph JSON file using drag-and-drop and by clicking "Upload". + +### Available settings (settings view) +Before actually starting the visualization, you can select the following options: + +* _Visualize degree of nodes_: A node with more connections is shown larger than a node with fewer connections. (Default: false) +* _Add time slider_: Adds a slider that allows to filter graph based on dates. 
(Default: false) +* _Drop description nodes_: Do not display text description nodes (useful on very large and/or unsimplified graphs) (Default: false) +* _Minimum count of connections for persons_: Only show persons and associated nodes if those persons are connected to at least the set number of trials (Default: 1) + +### Available settings (visualization view) +After visualization is complete, you can interact with the graph in the following ways to inspect it more closely or to filter out specific parts: + +* **Settings menu** (gear symbol on the top-left): + * _Enable Physics_: Enables the physics module of _vis.js_, thus making the network move around and react to dragging of nodes (Default: false) + * _Lock network_: Other than zooming and dragging nodes, the network becomes unresponsive to user interaction. Ideal when selecting a node and inspecting its connections without accidentally unselecting it. (Default: false) + * _Node Types_: Lists all special node types (see below for details) with their associated shape and color. You can highlight each node type individually. Further, you can remove all description nodes. + * _Minimum count of connections for persons_: see above + * _Map node types_: Allows you to transform the graph by mapping two node types that have no direct connection onto each other by way of checking how many trials they are mutually associated with. + +### Special node types +... \ No newline at end of file