Commit b5581eb6 authored by Victor Zimmermann

Redone output, shuffle corpus, disambiguation output is now a dict

parent 642789bc
@@ -8,13 +8,15 @@ import numpy as np # for calculations
 import config
 import spacy # for nlp
 from multiprocessing import Pool
+import random
 
 nlp = spacy.load('en') # standard english nlp
 
 #counts occurences of nodes and cooccurrences
 def frequencies(corpus_path, target):
     
+    random.seed(1)
     
     stop_words = set(stopwords.words('english') + config.stop_words)
     allowed_tags = config.allowed_tags
     min_context_size = config.min_context_size
@@ -24,13 +26,15 @@ def frequencies(corpus_path, target):
     node_freq = dict() #counts (potential) nodes
     edge_freq = dict() #counts (potential) edges
     
-    files = [corpus_path + f for f in os.listdir(corpus_path)] #file names of corpus files
     s_target = target.replace('_', ' ') #target word with spaces
+    files = [corpus_path + f for f in os.listdir(corpus_path)] #file names of corpus files
+    random.shuffle(files)
     
     i = 0 #for update print statements
     for f in files:
         
-        if i % int(len(files)/10) == 0: #prints update after every 10th of the corpus is parsed
+        if i % int(len(files)/11) == 0: #prints update after every 10th of the corpus is parsed
             
             file_ratio = i/len(files[:])
             max_node_ratio = len(node_freq)/max_nodes
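The added random.seed(1) / random.shuffle(files) pair randomises the order in which corpus files are read while keeping that order reproducible, since the shuffle is seeded with a constant. A minimal sketch of that behaviour, with placeholder file names (presumably the point is that the subset of files seen before the max_nodes/max_edges cut-off is a random sample rather than whatever os.listdir happens to return):

import random

files = ['corpus/a.txt', 'corpus/b.txt', 'corpus/c.txt']  # placeholder names

random.seed(1)           # constant seed -> the same pseudo-random order on every run
random.shuffle(files)    # shuffles the list in place
print(files)             # identical output on repeated runs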
@@ -41,7 +45,7 @@ def frequencies(corpus_path, target):
             #uses the ratio closest to 100%.
             percentage = int((max(ratios))*100)
             
-            print('[a] ~{:02d}%\tNodes: {}\tEdges: {}.'.format(percentage, len(node_freq), len(edge_freq)), target)
+            print('[a] ~{:02d}%\tNodes: {}\tEdges: {}.'.format(percentage, len(node_freq), len(edge_freq))+'\t('+target+')')
         
         #checks maximum node values
         if len(node_freq) > max_nodes:
@@ -239,63 +243,61 @@ def disambiguate(mst, hubs, contexts, target=""):
     C = [c.lower().strip().replace(target, '') for c in contexts] #cleaned up contexts
     score_dict = dict() #memoisation for scores
-    result = list() #output of function
+    mapping_dict = {topic:[] for topic in range(1,len(H)+1)} #output of function
+    
+    #if no sense is found for a target word, we should assume that there only is one sense
+    if len(H) == 0:
+        return {0:[i for i in range(1, len(C)+1)]}
     
     for c in C:
         
         idx = C.index(c) + 1 #index based on position in list
         
-        #if no sense is found for a target word, we should assume that there only is one sense
-        if len(H) == 0:
-            result.append((1, idx, 0))
-        
-        else:
-            doc = nlp(c) #parsed context
-            texts = [tok.text for tok in doc] #tokens
-            scores = np.zeros(len(H)) #initialise with zeros for every sense
-            
-            for text in texts:
-                if text in T.nodes: #if word wasn't filtered out
-                    new_scores = list() #scores to be added to total scores
-                    
-                    for h in H: #for each hub
-                        if (text, h) in score_dict: #memoisation
-                            new_scores.append(score_dict[(text,h)])
-                        else:
-                            new_score = score(T, text, h)
-                            new_scores.append(new_score)
-                            score_dict[(text,h)] = new_score #memoisation
-                    
-                    scores = scores + np.array(new_scores)
-                else:
-                    pass
-            
-            #if the disambiguator could not detect a sense, it should return a singleton, ie. nothing
-            if np.max(scores) == 0:
-                pass
-            else:
-                #applies sense with the highest score to context
-                max_score = np.max(scores)
-                argmax_score = np.argmax(scores)
-                
-                #clusters begin at 1
-                result.append((argmax_score + 1, idx))
-    
-    return result
+        doc = nlp(c) #parsed context
+        texts = [tok.text for tok in doc] #tokens
+        scores = np.zeros(len(H)) #initialise with zeros for every sense
+        
+        for text in texts:
+            if text in T.nodes: #if word wasn't filtered out
+                new_scores = list() #scores to be added to total scores
+                
+                for h in H: #for each hub
+                    if (text, h) in score_dict: #memoisation
+                        new_scores.append(score_dict[(text,h)])
+                    else:
+                        new_score = score(T, text, h)
+                        new_scores.append(new_score)
+                        score_dict[(text,h)] = new_score #memoisation
+                
+                scores = scores + np.array(new_scores)
+            else:
+                pass
+        
+        #if the disambiguator could not detect a sense, it should return a singleton, ie. nothing
+        if np.max(scores) == 0:
+            pass
+        else:
+            #applies sense with the highest score to context
+            max_score = np.max(scores)
+            argmax_score = np.argmax(scores)
+            
+            #clusters begin at 1
+            mapping_dict[argmax_score + 1].append(idx)
+    
+    return mapping_dict
 # our main function, here the main stepps for word sense induction are called
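With this commit the return value of disambiguate() changes shape: instead of appending (sense, context) tuples to a flat list, it now returns a dict that maps each cluster/sense number to the list of context indices assigned to it, with 0 used as a catch-all pseudo-sense when no root hubs were found. A small illustration of the two shapes with made-up toy values (not taken from any real run):

# old output: a flat list of (sense, context_index) tuples
old_result = [(1, 2), (2, 1), (1, 5)]

# new output: cluster number -> list of context indices
new_result = {1: [2, 5], 2: [1]}

# fallback when no root hubs were found (len(H) == 0): one pseudo-sense 0 covering all contexts
fallback = {0: [1, 2, 3, 4, 5]}

In both versions, contexts whose score is zero for every hub are silently skipped rather than assigned to a cluster.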
@@ -326,16 +328,16 @@ def WSI(topic_id, topic_name, results):
     f.write('subTopicID\tresultID\n')
     
     #counts occurences of single words, as well as cooccurrences, saves it in dictionary
-    print('[a]', 'Counting nodes and edges.', old_target)
+    print('[a]', 'Counting nodes and edges.\t('+old_target+')')
     node_freq, edge_freq = frequencies(corpus_path, target)
     out_buffer += '[A] Nodes: {}\tEdges: {}\n'.format(str(len(node_freq)), str(len(edge_freq)))
     
     #builds graph from these dictionaries, also applies multiple filters
-    print('[a]', 'Building graph.', old_target)
+    print('[a]', 'Building graph.\t('+old_target+')')
     G = build_graph(node_freq, edge_freq)
     
     #finds root hubs (senses) within the graph + more filters for these
-    print('[a]', 'Collecting root hubs.', old_target)
+    print('[a]', 'Collecting root hubs.\t('+old_target+')')
     H = root_hubs(G, edge_freq)
     out_buffer += '[A] Root hubs:\n'
@@ -344,26 +346,29 @@ def WSI(topic_id, topic_name, results):
     for h in H:
         
         mfn = sorted(G.adj[h], key=lambda x: edge_freq[h,x] if h < x else edge_freq[x, h], reverse=True)[:6]
-        out_buffer += (' {}. {}: {}\n'.format(i, h, mfn))
+        out_buffer += (' {}. {}: {}\n'.format(i, h, ', '.join(mfn)))
         i += 1
     
     #performs minimum_spanning_tree algorithm on graph
-    print('[a]', 'Building minimum spanning tree.', old_target)
+    print('[a]', 'Building minimum spanning tree.\t('+old_target+')')
     T = components(G, H, target)
     
     #matches senses to clusters
-    print('[a]', 'Disambiguating results.', old_target)
+    print('[a]', 'Disambiguating results.\t('+old_target+')')
     D = disambiguate(T, H, results[topic_id], target)
-    out_buffer += ('[A] Mapping: '+ str(D) + '\n')
+    out_buffer += ('[A] Mapping: \n')
+    for cluster,results in D.items():
+        out_buffer += (' {}. : {}\n'.format(cluster, ', '.join([str(r) for r in results])))
     
     #prints buffer
-    print('[a]', 'Writing to file.', old_target)
+    print('[a]', 'Writing to file.\t('+old_target+')')
     print(out_buffer)
     
     #writes clustering to file
-    for d in D:
-        f.write(topic_id+'.'+str(d[0])+'\t'+topic_id+'.'+str(d[1])+'\n')
+    for cluster,results in D.items():
+        for result in results:
+            f.write(topic_id+'.'+str(cluster)+'\t'+topic_id+'.'+str(result)+'\n')
     
     f.close()
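The rewritten write loop in WSI() flattens the mapping dict into one tab-separated line per (cluster, result) pair, under the subTopicID/resultID header written earlier in the function. A sketch of the resulting file contents for a toy mapping; the topic id and output file name below are illustrative, not taken from the actual run:

D = {1: [2, 5], 2: [1, 3]}                 # toy mapping as returned by disambiguate()
topic_id = '1'                             # illustrative topic id

with open('example.absinth', 'w') as f:    # hypothetical output file name
    f.write('subTopicID\tresultID\n')
    for cluster, results in D.items():
        for result in results:
            f.write(topic_id + '.' + str(cluster) + '\t' + topic_id + '.' + str(result) + '\n')

# resulting file contents:
# subTopicID    resultID
# 1.1           1.2
# 1.1           1.5
# 1.2           1.1
# 1.2           1.3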
@@ -394,13 +399,15 @@ if __name__ == '__main__':
     # topics.txt is a list of target words
     topics = dict()
+    processed_topics = [f.replace('.absinth', '') for f in os.listdir(config.output)]
     
     with open(data_path+'topics.txt', 'r') as topics_file:
         
         for line in topics_file.readlines()[1:]:
             
             l = line.split('\t')
-            topics[l[0]] = l[1]
+            if l[1].strip() not in processed_topics:
+                topics[l[0]] = l[1]
     
     # multiprocessing
     with Pool(4) as pool: