Skip to content
Snippets Groups Projects
Commit 94c9807f authored by Victor Zimmermann's avatar Victor Zimmermann
Browse files

Checks whether a topic has already been processed, which allows multiple processes to run in parallel.

parent 2c81f0fe
No related branches found
No related tags found
No related merge requests found
...@@ -114,7 +114,7 @@ def frequencies(corpus_path, target): ...@@ -114,7 +114,7 @@ def frequencies(corpus_path, target):
i += 1 i += 1
#update print #update print
print('[a] 100%\tNodes: {}\tEdges: {}.'.format(len(node_freq), len(edge_freq)), target) print('[a] 100%\tNodes: {}\tEdges: {}.'.format(len(node_freq), len(edge_freq))+'\t('+target+')')
return node_freq, edge_freq return node_freq, edge_freq
...@@ -312,6 +312,10 @@ def WSI(topic_id, topic_name, results): ...@@ -312,6 +312,10 @@ def WSI(topic_id, topic_name, results):
#removes trailing new_lines #removes trailing new_lines
old_target = topic_name.strip() #original target old_target = topic_name.strip() #original target
if old_target.strip() in [f.replace('.absinth', '') for f in os.listdir(config.output)]:
return None
out_buffer += ("[A] Word sense induction for '"+old_target+"':\n") out_buffer += ("[A] Word sense induction for '"+old_target+"':\n")
#in topics longer than two words, the leading 'the' can generally be removed without changing the sense #in topics longer than two words, the leading 'the' can generally be removed without changing the sense
...@@ -330,11 +334,11 @@ def WSI(topic_id, topic_name, results): ...@@ -330,11 +334,11 @@ def WSI(topic_id, topic_name, results):
#counts occurences of single words, as well as cooccurrences, saves it in dictionary #counts occurences of single words, as well as cooccurrences, saves it in dictionary
print('[a]', 'Counting nodes and edges.\t('+old_target+')') print('[a]', 'Counting nodes and edges.\t('+old_target+')')
node_freq, edge_freq = frequencies(corpus_path, target) node_freq, edge_freq = frequencies(corpus_path, target)
out_buffer += '[A] Nodes: {}\tEdges: {}\n'.format(str(len(node_freq)), str(len(edge_freq)))
#builds graph from these dictionaries, also applies multiple filters #builds graph from these dictionaries, also applies multiple filters
print('[a]', 'Building graph.\t('+old_target+')') print('[a]', 'Building graph.\t('+old_target+')')
G = build_graph(node_freq, edge_freq) G = build_graph(node_freq, edge_freq)
out_buffer += '[A] Nodes: {}\tEdges: {}\n'.format(str(len(G.nodes)), str(len(G.edges)))
#finds root hubs (senses) within the graph + more filters for these #finds root hubs (senses) within the graph + more filters for these
print('[a]', 'Collecting root hubs.\t('+old_target+')') print('[a]', 'Collecting root hubs.\t('+old_target+')')
...@@ -399,15 +403,13 @@ if __name__ == '__main__': ...@@ -399,15 +403,13 @@ if __name__ == '__main__':
# topics.txt is a list of target words # topics.txt is a list of target words
topics = dict() topics = dict()
processed_topics = [f.replace('.absinth', '') for f in os.listdir(config.output)]
with open(data_path+'topics.txt', 'r') as topics_file: with open(data_path+'topics.txt', 'r') as topics_file:
for line in topics_file.readlines()[1:]: for line in topics_file.readlines()[1:]:
l = line.split('\t') l = line.split('\t')
if l[1].strip() not in processed_topics: topics[l[0]] = l[1]
topics[l[0]] = l[1]
# multiprocessing # multiprocessing
with Pool(4) as pool: with Pool(4) as pool:
...@@ -415,4 +417,4 @@ if __name__ == '__main__': ...@@ -415,4 +417,4 @@ if __name__ == '__main__':
pool.starmap(WSI, [(key, value, results) for key,value in topics.items()]) pool.starmap(WSI, [(key, value, results) for key,value in topics.items()])
#for key, value in topics.items(): #for key, value in topics.items():
# WSI(key, value, results) #WSI(key, value, results)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment