Skip to content
Snippets Groups Projects
Commit 94c9807f authored by Victor Zimmermann's avatar Victor Zimmermann
Browse files

Checks whether a topic has already been processed, which allows multiple processes to run in parallel.

parent 2c81f0fe
No related branches found
No related tags found
No related merge requests found
...@@ -114,7 +114,7 @@ def frequencies(corpus_path, target): ...@@ -114,7 +114,7 @@ def frequencies(corpus_path, target):
i += 1 i += 1
#update print #update print
print('[a] 100%\tNodes: {}\tEdges: {}.'.format(len(node_freq), len(edge_freq)), target) print('[a] 100%\tNodes: {}\tEdges: {}.'.format(len(node_freq), len(edge_freq))+'\t('+target+')')
return node_freq, edge_freq return node_freq, edge_freq
...@@ -312,6 +312,10 @@ def WSI(topic_id, topic_name, results): ...@@ -312,6 +312,10 @@ def WSI(topic_id, topic_name, results):
#removes trailing new_lines #removes trailing new_lines
old_target = topic_name.strip() #original target old_target = topic_name.strip() #original target
if old_target.strip() in [f.replace('.absinth', '') for f in os.listdir(config.output)]:
return None
out_buffer += ("[A] Word sense induction for '"+old_target+"':\n") out_buffer += ("[A] Word sense induction for '"+old_target+"':\n")
#in topics longer than two words, the leading 'the' can generally be removed without changing the sense #in topics longer than two words, the leading 'the' can generally be removed without changing the sense
...@@ -330,11 +334,11 @@ def WSI(topic_id, topic_name, results): ...@@ -330,11 +334,11 @@ def WSI(topic_id, topic_name, results):
#counts occurences of single words, as well as cooccurrences, saves it in dictionary #counts occurences of single words, as well as cooccurrences, saves it in dictionary
print('[a]', 'Counting nodes and edges.\t('+old_target+')') print('[a]', 'Counting nodes and edges.\t('+old_target+')')
node_freq, edge_freq = frequencies(corpus_path, target) node_freq, edge_freq = frequencies(corpus_path, target)
out_buffer += '[A] Nodes: {}\tEdges: {}\n'.format(str(len(node_freq)), str(len(edge_freq)))
#builds graph from these dictionaries, also applies multiple filters #builds graph from these dictionaries, also applies multiple filters
print('[a]', 'Building graph.\t('+old_target+')') print('[a]', 'Building graph.\t('+old_target+')')
G = build_graph(node_freq, edge_freq) G = build_graph(node_freq, edge_freq)
out_buffer += '[A] Nodes: {}\tEdges: {}\n'.format(str(len(G.nodes)), str(len(G.edges)))
#finds root hubs (senses) within the graph + more filters for these #finds root hubs (senses) within the graph + more filters for these
print('[a]', 'Collecting root hubs.\t('+old_target+')') print('[a]', 'Collecting root hubs.\t('+old_target+')')
...@@ -399,15 +403,13 @@ if __name__ == '__main__': ...@@ -399,15 +403,13 @@ if __name__ == '__main__':
# topics.txt is a list of target words # topics.txt is a list of target words
topics = dict() topics = dict()
processed_topics = [f.replace('.absinth', '') for f in os.listdir(config.output)]
with open(data_path+'topics.txt', 'r') as topics_file: with open(data_path+'topics.txt', 'r') as topics_file:
for line in topics_file.readlines()[1:]: for line in topics_file.readlines()[1:]:
l = line.split('\t') l = line.split('\t')
if l[1].strip() not in processed_topics: topics[l[0]] = l[1]
topics[l[0]] = l[1]
# multiprocessing # multiprocessing
with Pool(4) as pool: with Pool(4) as pool:
...@@ -415,4 +417,4 @@ if __name__ == '__main__': ...@@ -415,4 +417,4 @@ if __name__ == '__main__':
pool.starmap(WSI, [(key, value, results) for key,value in topics.items()]) pool.starmap(WSI, [(key, value, results) for key,value in topics.items()])
#for key, value in topics.items(): #for key, value in topics.items():
# WSI(key, value, results) #WSI(key, value, results)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment