Commit c24c8230 authored by Victor Zimmermann

Added max constraints on edges and nodes, as well as several print statements.

parent c91d862e
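The main change caps the size of the node and edge frequency dictionaries and prints progress while the corpus is read. A minimal, self-contained sketch of that pattern (hypothetical count_with_caps helper, not the committed code; the commit divides the file list into roughly 23 reporting intervals):

    from itertools import combinations

    def count_with_caps(token_lists, max_nodes=10000, max_edges=1000000):
        """Count node and edge frequencies, stopping early once a cap is hit."""
        node_freq, edge_freq = {}, {}
        report_every = max(1, len(token_lists) // 23)  # ~23 progress lines, as in the commit
        for i, tokens in enumerate(token_lists):
            if i % report_every == 0:
                done = max(i / max(1, len(token_lists)),
                           len(node_freq) / max_nodes,
                           len(edge_freq) / max_edges)
                print('~ {}%\tNodes: {}\tEdges: {}.'.format(
                    int(done * 100), len(node_freq), len(edge_freq)))
            if len(node_freq) > max_nodes or len(edge_freq) > max_edges:
                return node_freq, edge_freq        # caps reached: stop reading
            for tok in set(tokens):
                node_freq[tok] = node_freq.get(tok, 0) + 1
            for edge in combinations(sorted(set(tokens)), 2):
                edge_freq[edge] = edge_freq.get(edge, 0) + 1
        return node_freq, edge_freq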
import os                        # for reading files
import sys
+print('[A] Loading ' + sys.argv[0] + '.\n')
import spacy                     # for nlp
import networkx as nx            # for visualisation
import matplotlib.pyplot as plt  # for visualisation
from copy import deepcopy
import numpy as np               # for calculations

nlp = spacy.load('en')           # standard english nlp

try:
    from tqdm import tqdm        # for counting seconds
except:
    tqdm = lambda x: x


-def frequencies(corpus_path, target, stop_words=['utc', 'new'], allowed_tags=['NN','NNS','JJ','JJS','JJR','NNP'], min_context_size = 4):
+def frequencies(corpus_path, target, stop_words=['utc', 'new', 'other'], allowed_tags=['NN','NNS','JJ','JJS','JJR','NNP'], min_context_size = 4, max_nodes=10000, max_edges=1000000):

    node_freq = dict()
    edge_freq = dict()

@@ -19,7 +16,21 @@ def frequencies(corpus_path, target, stop_words=['utc', 'new'], allowed_tags=['N
    files = [corpus_path+'/'+f for f in os.listdir(corpus_path)]
    s_target = target.replace('_', ' ')  # target word with spaces

-    for f in tqdm(files[:]):
+    i = 0
+    for f in files[:]:
+
+        if i % int(len(files[:])/23) == 0:
+            file_ratio = i/len(files[:])
+            max_node_ratio = len(node_freq)/max_nodes
+            max_edge_ratio = len(edge_freq)/max_edges
+            ratios = [file_ratio, max_node_ratio, max_edge_ratio]
+            print('~ {}%\tNodes: {}\tEdges: {}.'.format(int((max(ratios))*100), len(node_freq), len(edge_freq)))
+
+        if len(node_freq) > max_nodes:
+            return node_freq, edge_freq
+
+        if len(edge_freq) > max_edges:
+            return node_freq, edge_freq

        with open(f, 'r') as lines:

@@ -58,7 +69,7 @@ def frequencies(corpus_path, target, stop_words=['utc', 'new'], allowed_tags=['N
                else:
                    node_freq[token] = 1

-            for edge in {(x,y) for x in tokens for y in tokens if x != y}:
+            for edge in {(x,y) for x in tokens for y in tokens if x < y}:

                if edge in edge_freq:
                    edge_freq[edge] += 1

@@ -69,7 +80,9 @@ def frequencies(corpus_path, target, stop_words=['utc', 'new'], allowed_tags=['N
            pass
            #print('Failed to decode:', f)

+        i += 1

    return node_freq, edge_freq
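Note the switch from `x != y` to `x < y` in the edge set: each undirected co-occurrence edge is now stored once, under the lexicographically ordered pair, instead of twice. Every later lookup must therefore normalise the key order, which is exactly what the changed mfn line in root_hubs below does. A tiny illustration (hypothetical helper, not part of the commit):

    def lookup_edge(edge_freq, a, b):
        # edges are keyed by the sorted pair, so order the arguments first
        return edge_freq[(a, b)] if a < b else edge_freq[(b, a)]

    edge_freq = {('cat', 'dog'): 3}               # stored once, under the sorted key
    assert lookup_edge(edge_freq, 'dog', 'cat') == 3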
@@ -77,12 +90,12 @@ def build_graph(node_freq, edge_freq, min_node_freq=10, min_edge_freq=5, max_wei
    G = nx.Graph()

-    for key, value in tqdm(node_freq.items()):
+    for key, value in node_freq.items():
        if value >= min_node_freq:
            G.add_node(key)

-    for key, value in tqdm(edge_freq.items()):
+    for key, value in edge_freq.items():
        if value < min_edge_freq:
            continue

@@ -97,7 +110,7 @@ def build_graph(node_freq, edge_freq, min_node_freq=10, min_edge_freq=5, max_wei
    return G


-def root_hubs(graph, edge_freq, min_neighbors=6, theshold=0.8):
+def root_hubs(graph, edge_freq, min_neighbors=4, theshold=0.8):

    G = deepcopy(graph)
    V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True)  # sort nodes by degree, descending

@@ -109,7 +122,7 @@ def root_hubs(graph, edge_freq, min_neighbors=6, theshold=0.8):
        if G.degree[v] >= min_neighbors:

-            mfn = sorted(G.adj[v], key=lambda key: edge_freq[v,key], reverse=True)[:min_neighbors]  # mfn: most frequent neighbors
+            mfn = sorted(G.adj[v], key=lambda key: edge_freq[v,key] if v < key else edge_freq[key, v], reverse=True)[:min_neighbors]  # mfn: most frequent neighbors

            if np.mean([G.edges[v,n]['weight'] for n in mfn]) < theshold:

@@ -131,13 +144,14 @@ def root_hubs(graph, edge_freq, min_neighbors=6, theshold=0.8):
# Components algorithm from Véronis (2004); converts the graph for the target into an MST
def components(graph, hubs, target):

    G = deepcopy(graph)
    H = hubs
    t = target

-    G.add_node(t)
-    for h in H:
-        G.add_edge(t,h,weight=0)
+    #G.add_node(t)
+    #for h in H:
+        #G.add_edge(t,h,weight=0)

    T = nx.minimum_spanning_tree(G)

@@ -150,14 +164,14 @@ def components(graph, hubs, target):
# Uses the MST to disambiguate contexts; should ideally write to the evaluator format
def disambiguate(mst, hubs, contexts):

    T = mst
    H = hubs
-    i = 1
    cluster = []

    for v in list(T.nodes):

-        weights = []
+        scores = []

        for h in H:

@@ -168,27 +182,29 @@ def disambiguate(mst, hubs, contexts):
                for i in range(1, len(path)):
                    total_weight += T[path[i-1]][path[i]]['weight']

-                weights.append(1/(1+total_weight))
+                scores.append(1/(1+total_weight))

            except:
-                weights.append(0)
+                scores.append(0)

-        T.nodes[v]['s'] = np.array([w if w == max(weights) else 0 for w in weights])
+        T.nodes[v]['s'] = np.array([s if s == max(scores) else 0 for s in scores])

    for c in contexts:

        toks = [t.text for t in nlp(c)]
        vector = np.sum([T.nodes[t]['s'] if t in T.nodes else np.zeros(len(H)) for t in toks], axis=0)
+        idx = contexts.index(c) + 1

        try:
-            cluster.append((np.argmax(vector), i))
+            cluster.append((np.argmax(vector), idx))
        except:
-            cluster.append((len(H), i))
-        i += 1
+            cluster.append((len(H), idx))

    return cluster
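The rename from weights to scores matches what the list holds: one closeness score per hub, 1/(1 + total_weight) of the MST path, zeroed for all but the best hub. The second change replaces the hand-kept counter i, which the inner `for i in range(1, len(path))` loop silently clobbered, with an index derived from the context list. A toy check of the scoring rule (made-up path weights):

    import numpy as np

    # A token two cheap edges from hub 0 and one expensive edge from hub 1:
    path_weights = [0.2 + 0.3, 4.0]               # summed MST edge weights to each hub
    scores = [1 / (1 + w) for w in path_weights]  # [0.667, 0.2]: nearer hub scores higher
    print(np.argmax(scores))                      # -> 0, so the token votes for hub 0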

if __name__ == '__main__':

    data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE'

@@ -196,59 +212,60 @@ if __name__ == '__main__':
    corpus_path = '/proj/absinth/wikipedia.txt.dump.20140615-en.SZTAKI'

    results = dict()

    with open(data_path+'/results.txt', 'r') as results_file:
        for line in results_file.readlines()[1:]:
            l = line.split('\t')
            id1, _ = l[0].split('.')
            if id1 not in results:
                results[id1] = list()
            results[id1].append(" ".join(l[2:]))

    topics = dict()

    with open(data_path+'/topics.txt', 'r') as topics_file:
        for line in topics_file.readlines()[1:]:
            l = line.split('\t')
            topics[l[0]] = l[1]
-    with open('/home/students/zimmermann/Courses/ws17/fsem/absinth/results/test.txt', 'w') as clusters:
-        clusters.write('subTopicID\tresultID\n')
-        for key, value in tqdm(topics.items()):
-            target = value.strip()
-            print(target)
-            node_freq, edge_freq = frequencies(corpus_path, target)
-            G = build_graph(node_freq, edge_freq)
-            H = root_hubs(G, edge_freq)
-            T = components(G, H, target)
-            D = disambiguate(T, H, results[key])
-            print(D)
-            for d in D:
-                clusters.write(key+'.'+str(d[0])+'\t'+key+'.'+str(d[1])+'\n')
-    #target = sys.argv[1]
-    #node_freq, edge_freq = frequencies(corpus_path, target)
-    #G = build_graph(node_freq, edge_freq) #initialises graph
-    #H = root_hubs(G, edge_freq)
-    #T = components(G, H, target)
-    #print(node_freq)
+    for key, value in topics.items():
+
+        target = value.strip()
+        print("[A] Processing '"+target+"'.\n")
+
+        f = open('/home/students/zimmermann/Courses/ws17/fsem/absinth/results/'+target+'.absinth', 'w')
+        f.write('subTopicID\tresultID\n')
+
+        print('[A] Counting Tokens...')
+        node_freq, edge_freq = frequencies(corpus_path, target)
+
+        print('\n[A] Building Graph.\n')
+        G = build_graph(node_freq, edge_freq)
+
+        print('[A] Collecting Root Hubs...')
+        H = root_hubs(G, edge_freq)
+        print('Root Hubs:', H, '\n')
+
+        print('[A] Building Minimum Spanning Tree.\n')
+        T = components(G, H, target)
+
+        #for node in deepcopy(T).nodes:
+            #if len(T.adj[node]) == 0:
+                #T.remove_node(node)
+
+        print('[A] Disambiguating Results...')
+        D = disambiguate(T, H, results[key])
+        print('Mapping:', D, '\n')
+
+        print('[A] Writing to file '+target+'.absinth.\n\n')
+        for d in D:
+            f.write(key+'.'+str(d[0])+'\t'+key+'.'+str(d[1])+'\n')
+        f.close()

    #nx.draw(T, with_labels=True)
    #plt.show()
    #G.view()
    #print(G.find_path('english', 'kennel'))
    #G.draw()  # draws graph
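Taken together, the rewritten main block runs the whole pipeline once per topic and writes one <target>.absinth file per query instead of a single test.txt. A condensed sketch of that per-topic flow (assumes corpus_path, target, key and results from the surrounding __main__ block):

    node_freq, edge_freq = frequencies(corpus_path, target)  # capped co-occurrence counts
    G = build_graph(node_freq, edge_freq)                     # filtered co-occurrence graph
    H = root_hubs(G, edge_freq)                               # candidate sense hubs
    T = components(G, H, target)                              # minimum spanning tree
    D = disambiguate(T, H, results[key])                      # (sense, result-index) pairs

    with open(target + '.absinth', 'w') as out:
        out.write('subTopicID\tresultID\n')
        for sense, idx in D:
            out.write('{}.{}\t{}.{}\n'.format(key, sense, key, idx))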