Commit c91d862e authored by Victor Zimmermann
Remade program with NetworkX; terrible results so far, but completely implemented

parent 6232f19b
import os  # for reading files
import sys
import spacy  # for NLP (tokenisation, POS tagging)
import networkx as nx  # for graph construction and graph algorithms
import matplotlib.pyplot as plt  # for visualisation
from copy import deepcopy
import numpy as np  # for calculations

nlp = spacy.load('en')  # standard English model
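# NB: assumes the English model is available under the shortcut 'en'
# (for spaCy 2.x: `python -m spacy download en`); adjust for other versions.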
try:
    from tqdm import tqdm  # progress bars for long loops
except ImportError:
    tqdm = lambda x: x  # fall back to a no-op wrapper
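
# Collects co-occurrence statistics for the target word: node_freq counts how
# often each context word occurs in a line containing the target, edge_freq
# counts how often two context words occur in the same such line.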
def frequencies(corpus_path, target, stop_words=['utc', 'new'],
                allowed_tags=['NN', 'NNS', 'JJ', 'JJS', 'JJR', 'NNP'],
                min_context_size=4):
    node_freq = dict()
    edge_freq = dict()
    files = [os.path.join(corpus_path, f) for f in os.listdir(corpus_path)]
    s_target = target.replace('_', ' ')  # target word with spaces

    for f in tqdm(files):
        with open(f, 'r') as lines:
            try:
                for line in lines:
                    line = line.lower()
                    if s_target in line:
                        tokens = set()
                        doc = nlp(line.replace(s_target, target))
                        if target in [t.text for t in doc]:
                            for tok in doc:
                                text = tok.text
                                tag = tok.tag_
                                if text == target:
                                    pass
                                elif text in stop_words:
                                    pass
                                elif tag in allowed_tags:
                                    tokens.add(text)
                            if len(tokens) >= min_context_size:
                                for token in tokens:
                                    if token in node_freq:
                                        node_freq[token] += 1
                                    else:
                                        node_freq[token] = 1
                                for edge in {(x, y) for x in tokens for y in tokens if x != y}:
                                    if edge in edge_freq:
                                        edge_freq[edge] += 1
                                    else:
                                        edge_freq[edge] = 1
            except UnicodeDecodeError:
                pass  # skip files that cannot be decoded

    return node_freq, edge_freq
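
# Builds the co-occurrence graph in the style of Véronis (2004): nodes are
# frequent context words, and an edge a-b is weighted 1 - max(p(a|b), p(b|a)),
# so strongly associated words are connected by low-weight edges.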
def build_graph(node_freq, edge_freq, min_node_freq=10, min_edge_freq=5, max_weight=0.9):
    G = nx.Graph()

    for key, value in tqdm(node_freq.items()):
        if value >= min_node_freq:
            G.add_node(key)

    for key, value in tqdm(edge_freq.items()):
        if value < min_edge_freq:
            continue
        if key[0] not in G.nodes or key[1] not in G.nodes:
            continue
        # w(a, b) = 1 - max(p(a|b), p(b|a))
        weight = 1 - max(value / node_freq[key[0]], value / node_freq[key[1]])
        if weight <= max_weight:
            G.add_edge(*key, weight=weight)

    return G
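
# Selects root hubs (candidate senses): high-degree nodes whose most frequent
# neighbours are strongly connected (mean edge weight below the threshold);
# each processed node and its neighbourhood are removed before the next pick.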
def root_hubs(graph, edge_freq, min_neighbors=6, threshold=0.8):
    G = deepcopy(graph)
    V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True)  # nodes by degree, descending
    H = list()

    while V:
        v = V[0]
        if G.degree[v] >= min_neighbors:
            mfn = sorted(G.adj[v], key=lambda key: edge_freq[v, key], reverse=True)[:min_neighbors]  # mfn: most frequent neighbors
            if np.mean([G.edges[v, n]['weight'] for n in mfn]) < threshold:
                H.append(v)
            for nbr in list(G.adj[v]):
                G.remove_node(nbr)
            G.remove_node(v)
            V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True)
        else:
            return H

    return H
#Components algorithm from Véronis (2004): links the target to its root hubs with zero-weight edges, then reduces the graph to a minimum spanning tree
def components(graph, hubs, target):
    G = deepcopy(graph)
    H = hubs
    t = target

    G.add_node(t)
    for h in H:
        G.add_edge(t, h, weight=0)

    T = nx.minimum_spanning_tree(G)

    # prune nodes left isolated by the spanning-tree construction
    for node in list(T.nodes):
        if len(T.adj[node]) == 0:
            T.remove_node(node)

    return T
#Uses the MST to assign each context to a root hub (sense); output should ideally be written in the evaluator format
def disambiguate(mst, hubs, contexts):
    T = mst
    H = hubs
    cluster = []

    # score vector per node: similarity to its closest hub, 0 for all others
    for v in list(T.nodes):
        weights = []
        for h in H:
            try:
                path = nx.shortest_path(T, v, h, 'weight')
                total_weight = sum(T[path[k - 1]][path[k]]['weight'] for k in range(1, len(path)))
                weights.append(1 / (1 + total_weight))
            except (nx.NetworkXNoPath, nx.NodeNotFound):
                weights.append(0)
        T.nodes[v]['s'] = np.array([w if w == max(weights) else 0 for w in weights])

    # assign each context to the hub with the highest summed score
    idx = 1  # result IDs start at 1 (a separate name so the path loop above cannot clobber it)
    for c in contexts:
        toks = [t.text for t in nlp(c)]
        vector = np.sum([T.nodes[t]['s'] if t in T.nodes else np.zeros(len(H)) for t in toks], axis=0)
        if np.max(vector) > 0:
            cluster.append((np.argmax(vector), idx))
        else:
            cluster.append((len(H), idx))  # no evidence: assign to an extra cluster
        idx += 1

    return cluster
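
# Pipeline: for every topic in the MORESQUE dataset, build the co-occurrence
# graph from the Wikipedia dump, find root hubs, compute the MST, and assign
# each search result to a hub; clusters are written in the WSI-Evaluator format.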
if __name__ == '__main__':
    data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE'
    #corpus_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/test'
    corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI'

    # results.txt: tab-separated; first column holds "<topicID>.<resultID>"
    results = dict()
    with open(data_path+'/results.txt', 'r') as results_file:
        for line in results_file.readlines()[1:]:
            l = line.split('\t')
            id1, _ = l[0].split('.')
            if id1 not in results:
                results[id1] = list()
            results[id1].append(" ".join(l[2:]))

    # topics.txt: tab-separated; maps topic IDs to target words
    topics = dict()
    with open(data_path+'/topics.txt', 'r') as topics_file:
        for line in topics_file.readlines()[1:]:
            l = line.split('\t')
            topics[l[0]] = l[1]

    with open('/home/students/zimmermann/Courses/ws17/fsem/absinth/results/test.txt', 'w') as clusters:
        clusters.write('subTopicID\tresultID\n')
        for key, value in tqdm(topics.items()):
            target = value.strip()
            print(target)
            node_freq, edge_freq = frequencies(corpus_path, target)
            G = build_graph(node_freq, edge_freq)
            H = root_hubs(G, edge_freq)
            T = components(G, H, target)
            D = disambiguate(T, H, results[key])
            print(D)
            for d in D:
                clusters.write(key+'.'+str(d[0])+'\t'+key+'.'+str(d[1])+'\n')
#target = sys.argv[1]
#node_freq, edge_freq = frequencies(corpus_path, target)
#G = build_graph(node_freq, edge_freq) #initialises graph
#H = root_hubs(G, edge_freq)
#T = components(G, H, target)
#print(node_freq)
#for node in deepcopy(T).nodes:
#    if len(T.adj[node]) == 0:
#        T.remove_node(node)
#nx.draw(T, with_labels=True)
#plt.show()
#G.view()
#print(G.find_path('english', 'kennel'))
#G.draw() #draws graph