Skip to content
Snippets Groups Projects
Commit deb3f84a authored by Victor Zimmermann
Browse files

Add method to search for paragraphs containing word.

parent 882d2cfd
No related branches found
No related tags found
No related merge requests found
import os
import nltk
# Directory holding the MORESQUE dataset; topics.txt and results.txt are read
# from here at import time.
data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE'
# Directory whose entries get_paragraphs() opens and scans line by line.
wiki_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/test'


def _read_tsv_rows(path):
    """Return the tab-separated fields of every line of *path* but the first.

    The first line is dropped (presumably a header row -- confirm against
    the dataset). Only the newline character is stripped from each line
    before splitting on tabs. The file is opened in a ``with`` block so
    the handle is closed as soon as the rows have been read.
    """
    # NOTE(review): the platform default text encoding is used, as before.
    with open(path, 'r') as table:
        return [line.strip('\n').split('\t') for line in table.readlines()[1:]]


# Each entry is the list of tab-separated fields of one line.
topics = _read_tsv_rows(data_path + '/topics.txt')
results = _read_tsv_rows(data_path + '/results.txt')
def get_paragraphs(word):
    """Collect the tokenized lines under ``wiki_path`` that contain *word*.

    Every regular file directly inside ``wiki_path`` is read line by line.
    Each line is lower-cased; if it contains the target with underscores
    written as spaces, those occurrences are rewritten to the underscore
    form and the line is tokenized with ``nltk.word_tokenize``. A line is
    kept only if the underscore form then appears as a token of its own.

    Args:
        word: Target term, with ``_`` joining the parts of a multi-word
            expression (e.g. ``'the_block'``). Matching is
            case-insensitive; the term is lower-cased before use.

    Returns:
        A list with one token list per matching line, in the order the
        lines were encountered.
    """
    # Lines are lower-cased before matching, so the target has to be
    # lower-cased as well or a term with capitals could never match.
    word = word.lower()
    space_word = word.replace('_', ' ')
    paragraphs = []

    for name in os.listdir(wiki_path):
        path = os.path.join(wiki_path, name)
        # Skip sub-directories and other non-file entries; open() would
        # raise on them.
        if not os.path.isfile(path):
            continue
        with open(path, 'r') as source:
            for line in source:
                line = line.lower()
                # Cheap substring test first, so only candidate lines
                # are tokenized.
                if space_word not in line:
                    continue
                # Write the expression with underscores so that it can
                # come out of the tokenizer as one token.
                tokens = nltk.word_tokenize(line.replace(space_word, word))
                # A substring hit inside longer words does not produce
                # the exact token and is rejected here.
                if word in tokens:
                    paragraphs.append(tokens)
                    # Echo each match as it is found.
                    print(tokens)

    return paragraphs
# Placeholder for iterating over `topics`; currently disabled.
#for topic in topics:
# Look up a single hard-coded target and print the matching token lists.
matches = get_paragraphs('the_block')
print(matches)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment