import os
import nltk

# Dataset directory (holds topics.txt and results.txt) and the directory of
# text files that get_paragraphs() scans.
data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE'
wiki_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/test'

# Each table is loaded as a list of rows, one list of tab-separated fields per
# line. The first line of each file is skipped (presumably a header row --
# confirm against the dataset). The files are opened in `with` blocks so the
# handles are closed deterministically instead of being left to the garbage
# collector.
with open(os.path.join(data_path, 'topics.txt'), 'r') as topics_file:
    topics = topics_file.readlines()[1:]
topics = [line.strip('\n').split('\t') for line in topics]

with open(os.path.join(data_path, 'results.txt'), 'r') as results_file:
    results = results_file.readlines()[1:]
results = [line.strip('\n').split('\t') for line in results]

def get_paragraphs(word):
    """Collect tokenized lines from the files in ``wiki_path`` that contain *word*.

    Every regular file directly inside the module-level ``wiki_path``
    directory is read line by line. Each line is lower-cased; if it contains
    *word* with its underscores replaced by spaces, those occurrences are
    rewritten to the underscore form before the line is passed to
    ``nltk.word_tokenize``. The tokenized line is kept only when *word* then
    appears as a token of its own.

    Args:
        word: Target expression with underscores in place of spaces, e.g.
            ``'the_block'``. Lines are lower-cased before matching, so a
            *word* containing upper-case characters never matches.

    Returns:
        A list of token lists, one per matching line, in the order the files
        are returned by ``os.listdir`` and the lines appear in each file.
        Each kept token list is also printed to stdout as it is found.
    """
    space_word = word.replace('_', ' ')
    paragraphs = []
    for name in os.listdir(wiki_path):
        path = os.path.join(wiki_path, name)
        # Skip subdirectories and other non-regular entries: open() would
        # raise on them and abort the whole scan.
        if not os.path.isfile(path):
            continue
        with open(path, 'r') as source:
            for line in source:
                line = line.lower()
                if space_word not in line:
                    continue
                # Glue the multi-word expression together with underscores
                # before tokenizing, then check it came out as one token.
                tokens = nltk.word_tokenize(line.replace(space_word, word))
                if word in tokens:
                    paragraphs.append(tokens)
                    # Per-match output retained from the original behavior.
                    print(tokens)
    return paragraphs

# Trial run on one hard-coded target expression; a loop over every entry in
# `topics` is not wired up yet.
print(get_paragraphs(word='the_block'))