From 94c9807fa9fa49badd7a40dc3a6e93dc4e4d5dcd Mon Sep 17 00:00:00 2001
From: Victor Zimmermann <zimmermann@cl.uni-heidelberg.de>
Date: Tue, 13 Mar 2018 16:57:47 +0100
Subject: [PATCH] Checks if topic already processed, allows parallel
 processes.

---
 src/absinth.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/absinth.py b/src/absinth.py
index 00f49f7..9715e07 100644
--- a/src/absinth.py
+++ b/src/absinth.py
@@ -114,7 +114,7 @@ def frequencies(corpus_path, target):
         i += 1

     #update print
-    print('[a] 100%\tNodes: {}\tEdges: {}.'.format(len(node_freq), len(edge_freq)), target)
+    print('[a] 100%\tNodes: {}\tEdges: {}.'.format(len(node_freq), len(edge_freq))+'\t('+target+')')

     return node_freq, edge_freq

@@ -312,6 +312,10 @@ def WSI(topic_id, topic_name, results):

     #removes trailing new_lines
     old_target = topic_name.strip() #original target
+
+    if old_target.strip() in [f.replace('.absinth', '') for f in os.listdir(config.output)]:
+        return None
+
     out_buffer += ("[A] Word sense induction for '"+old_target+"':\n")

     #in topics longer than two words, the leading 'the' can generally be removed without changing the sense
@@ -330,11 +334,11 @@ def WSI(topic_id, topic_name, results):
     #counts occurences of single words, as well as cooccurrences, saves it in dictionary
     print('[a]', 'Counting nodes and edges.\t('+old_target+')')
     node_freq, edge_freq = frequencies(corpus_path, target)
-    out_buffer += '[A] Nodes: {}\tEdges: {}\n'.format(str(len(node_freq)), str(len(edge_freq)))

     #builds graph from these dictionaries, also applies multiple filters
     print('[a]', 'Building graph.\t('+old_target+')')
     G = build_graph(node_freq, edge_freq)
+    out_buffer += '[A] Nodes: {}\tEdges: {}\n'.format(str(len(G.nodes)), str(len(G.edges)))

     #finds root hubs (senses) within the graph + more filters for these
     print('[a]', 'Collecting root hubs.\t('+old_target+')')
@@ -399,15 +403,13 @@ if __name__ == '__main__':

     # topics.txt is a list of target words
     topics = dict()
-    processed_topics = [f.replace('.absinth', '') for f in os.listdir(config.output)]

     with open(data_path+'topics.txt', 'r') as topics_file:

         for line in topics_file.readlines()[1:]:

             l = line.split('\t')
-            if l[1].strip() not in processed_topics:
-                topics[l[0]] = l[1]
+            topics[l[0]] = l[1]

     # multiprocessing
     with Pool(4) as pool:
@@ -415,4 +417,4 @@ if __name__ == '__main__':
         pool.starmap(WSI, [(key, value, results) for key,value in topics.items()])

     #for key, value in topics.items():
-    #    WSI(key, value, results)
+    #WSI(key, value, results)
--
GitLab
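
Note on the approach: the guard added to WSI() above skips any topic whose
'<target>.absinth' file already exists in config.output, so several
absinth.py processes can be pointed at the same topics.txt and divide the
work between them. One caveat: an os.listdir() check followed later by the
file write is not atomic, so two processes that pick up the same topic at
nearly the same moment can both pass the check and duplicate work. Below is
a minimal sketch of a stricter variant that claims a topic by creating its
output file atomically; output_dir, topics and process_topic are
hypothetical stand-ins, not names from this repository.

    import os

    output_dir = 'output'                # stands in for config.output
    topics = {'1': 'cube root\n'}        # stands in for the topics.txt dict

    def process_topic(topic_id, topic_name):
        target = topic_name.strip()
        out_path = os.path.join(output_dir, target + '.absinth')
        try:
            # O_CREAT|O_EXCL fails if the file already exists, so exactly
            # one process wins the claim; there is no test-then-write gap.
            fd = os.open(out_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
        except FileExistsError:
            return None                  # another process owns this topic
        with os.fdopen(fd, 'w') as out_file:
            out_file.write("[A] Word sense induction for '"+target+"':\n")
            # ... induction work would go here ...

    if __name__ == '__main__':
        os.makedirs(output_dir, exist_ok=True)
        for key, value in topics.items():
            process_topic(key, value)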