Added comments, fixed some bugs, and made other minor updates.

d4b72c96 · Victor Zimmermann · 2de73dc7 · d4b72c96
Commit d4b72c96 authored 7 years ago by Victor Zimmermann
--- a/src/absinth.py
+++ b/src/absinth.py
@@ -12,6 +12,7 @@ from multiprocessing import Pool
 nlp = spacy.load('en') # standard english nlp


+#counts occurrences of nodes and cooccurrences
 def frequencies(corpus_path, target):
    
    stop_words = set(stopwords.words('english') + config.stop_words)
@@ -20,16 +21,16 @@ def frequencies(corpus_path, target):
    max_nodes = config.max_nodes
    max_edges = config.max_edges
    
-    node_freq = dict()
-    edge_freq = dict()
+    node_freq = dict() #counts (potential) nodes
+    edge_freq = dict() #counts (potential) edges
    
-    files = [corpus_path + f for f in os.listdir(corpus_path)]
+    files = [corpus_path + f for f in os.listdir(corpus_path)] #file names of corpus files
    s_target = target.replace('_', ' ') #target word with spaces
    
-    i = 0
+    i = 0 #for update print statements
    for f in files:
        
-        if i % int(len(files)/10) == 0:
+        if i % int(len(files)/10) == 0: #prints update after every 10th of the corpus is parsed
            
            file_ratio = i/len(files[:])
            max_node_ratio = len(node_freq)/max_nodes
@@ -37,48 +38,56 @@ def frequencies(corpus_path, target):
            
            ratios = [file_ratio, max_node_ratio, max_edge_ratio]
            
+            #uses the ratio closest to 100%.
            percentage = int((max(ratios))*100)
            
            print('[a] ~{:02d}%\tNodes: {}\tEdges: {}.'.format(percentage, len(node_freq), len(edge_freq)), target)
        
+        #checks maximum node values
        if len(node_freq) > max_nodes:
            return node_freq, edge_freq
        
+        #checks maximum edge values
        if len(edge_freq) > max_edges:
            return node_freq, edge_freq
        
-        with open(f, 'r') as lines:
+        with open(f, 'r') as lines: #parses single file
            
            try:
                
-                for line in lines:
+                for line in lines: #parses single paragraph
                    
                    line = line.lower()
                    
-                    if s_target in line:
+                    if s_target in line: #greedy pre selection, not perfect
                        
-                        tokens = set()
-                        doc = nlp(line.replace(s_target, target))
+                        tokens = set() #set of node candidates
+                        doc = nlp(line.replace(s_target, target)) #nlp processing
                        
-                        if target in [t.text for t in doc]:
+                        if target in [t.text for t in doc]: #better selection
                            
                            for tok in doc:
                                
-                                text = tok.text
-                                tag = tok.tag_
+                                text = tok.text #string value
+                                tag = tok.tag_ #pos tag
                                
+                                #doesn't add target word to nodes
                                if text == target:
                                    pass
                                
+                                #doesn't add stop words to nodes
                                elif text in stop_words:
                                    pass
                                
+                                #only adds tokens with allowed tags to nodes
                                elif tag in allowed_tags:
                                    tokens.add(tok.text)
                                    
+                            #if there are enough (good) tokens in paragraph
                            if len(tokens) >= min_context_size:
                                for token in tokens:
                                    
+                                    #updates counts for nodes
                                    if token in node_freq:
                                        node_freq[token] += 1
                                    else:
@@ -86,11 +95,13 @@ def frequencies(corpus_path, target):
                                
                                for edge in {(x,y) for x in tokens for y in tokens if x < y}:
                                    
+                                    #updates counts for edges
                                    if edge in edge_freq:
                                        edge_freq[edge] += 1
                                    else:
                                        edge_freq[edge] = 1
            
+            #if a file is corrupted (can't always be caught with if-else)
            except UnicodeDecodeError:
                
                pass
@@ -98,10 +109,13 @@ def frequencies(corpus_path, target):
        
        i += 1
    
+    #update print
    print('[a] 100%\tNodes: {}\tEdges: {}.'.format(len(node_freq), len(edge_freq)), target)
+    
    return node_freq, edge_freq


+#build graph from frequency dictionaries
 def build_graph(node_freq, edge_freq):
    
    min_node_freq = config.min_node_freq
@@ -110,11 +124,13 @@ def build_graph(node_freq, edge_freq):
    
    G = nx.Graph()
    
+    #node : node frequency
    for key, value in node_freq.items():
        
        if value >= min_node_freq:
            G.add_node(key)
            
+    #edge : edge frequency
    for key, value in edge_freq.items():
        
        if value < min_edge_freq:
@@ -130,33 +146,37 @@ def build_graph(node_freq, edge_freq):
    return G


+#Identifies senses by choosing nodes with high degrees
 def root_hubs(graph, edge_freq, min_neighbors=4, theshold=0.8):
    
    min_neighbors = config.min_neighbors
    threshold = config.threshold
    
    G = deepcopy(graph)
-    V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True) # -1 to sort descending (...3 -> 2 -> 1...)
-    H = list()
+    V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True) # sorts according to degree
+    H = list() #output list
    
    while V:
        
-        v = V[0]
+        v = V[0] #best hub candidate
        
        if G.degree[v] >= min_neighbors:
        
-            mfn = sorted(G.adj[v], key=lambda key: edge_freq[v,key] if v < key else edge_freq[key, v], reverse=True)[:min_neighbors] #mfn: most frequent neighbors
+            mfn = sorted(G.adj[v], key=lambda key: edge_freq[v,key] if v < key else edge_freq[key, v], reverse=True)[:min_neighbors] #most frequent neighbors
            
-            if np.mean([G.edges[v,n]['weight'] for n in mfn]) < theshold:
+            if np.mean([G.edges[v,n]['weight'] for n in mfn]) < theshold: #if the mean weight of the most frequent neighbors is under threshold
                
                H.append(v)
            
+                #removes neighbors of new hub as hub candidates
                for nbr in deepcopy(G).adj[v]:
                
                    G.remove_node(nbr)
                
+            #removes hub candidate
            G.remove_node(v)
            
+            #reorders potential hubs after deletions
            V = sorted(G.nodes, key=lambda key: G.degree[key], reverse=True)
        
        else:
@@ -170,7 +190,7 @@ def root_hubs(graph, edge_freq, min_neighbors=4, theshold=0.8):
 def components(graph, hubs, target):
    
    G = deepcopy(graph)
-    H = hubs
+    H = hubs #root hubs
    t = target
    
    #G.add_node(t)
@@ -179,6 +199,7 @@ def components(graph, hubs, target):
        
    T = nx.minimum_spanning_tree(G)
    
+    #removes singletons
    for node in deepcopy(T).nodes:
        if len(T.adj[node]) == 0:
            T.remove_node(node)
@@ -186,17 +207,22 @@ def components(graph, hubs, target):
    return T


+#Calculates score for a given path in a minimum spanning tree
 def score(graph, from_node, to_node):
    
+    #if correct tree
    if nx.has_path(graph, from_node, to_node):
                
+        # calculates shortest path (approximation for path with lowest total weight)
        path = nx.shortest_path(graph, from_node, to_node, 'weight')
        total_weight = 0
    
+        #adds weights of every sub-path
        for i in range(1, len(path)):
            sub_from, sub_to = path[i-1], path[i]
            total_weight += graph[sub_from][sub_to]['weight']
    
+        #the further the path, the lower the score
        return 1/(1+total_weight)
        
    else:
@@ -204,47 +230,52 @@ def score(graph, from_node, to_node):
        return 0


+# Basically Word Sense Disambiguation, matches context to sense
 def disambiguate(mst, hubs, contexts, target=""):
    
    target = target.replace('_', ' ')
-    T = mst
-    H = hubs
-    C = [c.lower().strip().replace(target, '') for c in contexts]
+    T = mst #minimum spanning tree
+    H = hubs #root hubs
+    C = [c.lower().strip().replace(target, '') for c in contexts] #cleaned up contexts
    
-    score_dict = dict()
-    result = list()
+    score_dict = dict() #memoisation for scores
+    result = list() #output of function

    for c in C:
        
-        idx = C.index(c) + 1
+        idx = C.index(c) + 1 #index based on position in list
        
        #if no sense is found for a target word, we should assume that there only is one sense
-        if len(H) == 0:
+        if len(H) == 0: 
            
-            result.append((1, idx))
+            result.append((1, idx, 0))
        
        else:
            
-            doc = nlp(c)
-            texts = [tok.text for tok in doc]
+            doc = nlp(c) #parsed context
+            texts = [tok.text for tok in doc] #tokens
            
            scores = np.zeros(len(H)) #initialise with zeros for every sense
            
            for text in texts:
                
-                if text in T.nodes:
+                if text in T.nodes: #if word wasn't filtered out
                    
-                    new_scores = list()
+                    new_scores = list() #scores to be added to total scores
                    
-                    for h in H:
-                        if (text, h) in score_dict:
+                    for h in H: #for each hub
+                        
+                        if (text, h) in score_dict: #memoisation
+                            
                            new_scores.append(score_dict[(text,h)])
+                        
                        else:
+                            
                            new_score = score(T, text, h)
                            new_scores.append(new_score)
-                            score_dict[(text,h)] = new_scores
+                            score_dict[(text,h)] = new_score #memoisation
                        
-                    scores = np.add(scores, new_scores)
+                    scores = scores + np.array(new_scores)
                
                else:
                
@@ -257,22 +288,32 @@ def disambiguate(mst, hubs, contexts, target=""):
            
            else:
                
-                result.append((np.argmax(scores)+1, idx))
+                #applies sense with the highest score to context
+                max_score = np.max(scores)
+                argmax_score = np.argmax(scores)
+                
+                #clusters begin at 1
+                result.append((argmax_score + 1, idx))

    return result


+# our main function, here the main steps for word sense induction are called
 def WSI(topic_id, topic_name, results):
    
+    #buffer for useful information
    out_buffer = '\n'
    
+    #paths for input (corpus) and output(directory)
    corpus_path = config.corpus
    output_path = config.output
            
+    #removes trailing new_lines
    old_target = topic_name.strip() #original target
    out_buffer += ("[A] Word sense induction for '"+old_target+"':\n")
    
-    if old_target[:4] == 'the_' and old_target.count('_') >= 2: #hard coded 'the'-protection
+    #in topics longer than two words, the leading 'the' can generally be removed without changing the sense
+    if old_target[:4] == 'the_' and old_target.count('_') >= 2:
        
        target = old_target[4:]
        
@@ -280,37 +321,46 @@ def WSI(topic_id, topic_name, results):
        
        target = old_target
    
+    #writes headline for output files
    f = open(output_path+target+'.absinth', 'w')
    f.write('subTopicID\tresultID\n')
    
+    #counts occurrences of single words, as well as cooccurrences, and saves them in dictionaries
    print('[a]', 'Counting nodes and edges.', old_target)
    node_freq, edge_freq = frequencies(corpus_path, target)
-    out_buffer += '[A] Nodes: {}\tEdges:{}\n'.format(str(len(node_freq)), str(len(edge_freq)))
+    out_buffer += '[A] Nodes: {}\tEdges: {}\n'.format(str(len(node_freq)), str(len(edge_freq)))
    
+    #builds graph from these dictionaries, also applies multiple filters
    print('[a]', 'Building graph.', old_target)
    G = build_graph(node_freq, edge_freq)
    
+    #finds root hubs (senses) within the graph + more filters for these
    print('[a]', 'Collecting root hubs.', old_target)
    H = root_hubs(G, edge_freq)
    out_buffer += '[A] Root hubs:\n'
    
-    i = 1
+    #adds sense inventory to buffer with some common neighbors for context
+    i = 1 #sense index
    for h in H:
        
        mfn = sorted(G.adj[h], key=lambda x: edge_freq[h,x] if h < x else edge_freq[x, h], reverse=True)[:6]
        out_buffer += (' {}. {}: {}\n'.format(i, h, mfn))
        i += 1
    
+    #performs minimum_spanning_tree algorithm on graph
    print('[a]', 'Building minimum spanning tree.', old_target)
    T = components(G, H, target)

+    #matches senses to clusters
    print('[a]', 'Disambiguating results.', old_target)
    D = disambiguate(T, H, results[topic_id], target)
    out_buffer += ('[A] Mapping: '+ str(D) + '\n')
    
+    #prints buffer
    print('[a]', 'Writing to file.', old_target)
    print(out_buffer)
    
+    #writes clustering to file
    for d in D:
        
        f.write(topic_id+'.'+str(d[0])+'\t'+topic_id+'.'+str(d[1])+'\n')
@@ -320,8 +370,13 @@ def WSI(topic_id, topic_name, results):

 if __name__ == '__main__':
    
-    data_path = config.dataset
+    # If absinth.py is run in test environment
+    if '-t' in sys.argv:
+        data_path = config.test
+    else:
+        data_path = config.dataset
    
+    # results.txt includes the queries for a given target word
    results = dict()
    
    with open(data_path+'results.txt', 'r') as results_file:
@@ -329,14 +384,15 @@ if __name__ == '__main__':
        for line in results_file.readlines()[1:]:
            
            l = line.split('\t')
-            id1, _ = l[0].split('.')
+            id1, _ = l[0].split('.') #the second part of the id is ignored, as it is identical to the list index
            
            if id1 not in results:
                results[id1]=list()
                
-            results[id1].append(" ".join(l[2:]))
-            
+            results[id1].append(" ".join(l[2:])) # here I join title and snippet, the URL is ignored
            
+    
+    # topics.txt is a list of target words
    topics = dict()
    
    with open(data_path+'topics.txt', 'r') as topics_file:
@@ -346,7 +402,10 @@ if __name__ == '__main__':
            l = line.split('\t')
            topics[l[0]] = l[1]
    
+    # multiprocessing
    with Pool(4) as pool:
+        # calls WSI() for four topics at a time
        pool.starmap(WSI, [(key, value, results) for key,value in topics.items()])
+        
    #for key, value in topics.items():
    #    WSI(key, value, results)