"""Collect tokenized corpus lines that mention a given topic word.

On import/run this module reads ``topics.txt`` and ``results.txt`` from
``data_path`` and then prints the paragraphs found for one hard-coded
example word.
"""
import os

import nltk

data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE'
wiki_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/test'


def _read_rows(path):
    """Return the tab-separated fields of every line in *path* but the first.

    The first line is dropped (presumably a column header -- confirm against
    the data set). Only the trailing newline is stripped from each line
    before it is split on tabs.
    """
    # ``with`` guarantees the handle is closed; the original left it open.
    with open(path, 'r') as source:
        lines = source.readlines()[1:]
    return [line.strip('\n').split('\t') for line in lines]


topics = _read_rows(os.path.join(data_path, 'topics.txt'))
results = _read_rows(os.path.join(data_path, 'results.txt'))


def get_paragraphs(word):
    """Return the tokenized lines under ``wiki_path`` that contain *word*.

    *word* uses underscores in place of spaces (e.g. ``'the_block'``). Every
    regular file directly inside ``wiki_path`` is read line by line; each
    line is lowercased, every occurrence of the space-separated phrase is
    replaced by the underscore form, and the line is tokenized with
    ``nltk.word_tokenize``. A line is kept only if the underscore form shows
    up as a token of its own.

    Args:
        word: Target expression with ``_`` between its words. It is
            lowercased before matching because the lines are lowercased too.

    Returns:
        A list of token lists, one per matching line, in the order the files
        are listed by ``os.listdir`` and the lines appear in each file.
    """
    # Lines are lowercased below, so an uppercase character in the target
    # could never match; normalise the target the same way.
    target = word.lower()
    phrase = target.replace('_', ' ')

    paragraphs = []
    for name in os.listdir(wiki_path):
        path = os.path.join(wiki_path, name)
        # os.listdir also yields sub-directories; open() would fail on them.
        if not os.path.isfile(path):
            continue
        with open(path, 'r') as source:
            for line in source:
                line = line.lower()
                # Cheap substring test first, so only candidate lines are
                # tokenized.
                if phrase not in line:
                    continue
                # Join the phrase with underscores before tokenizing so it
                # can come out as one token (relies on word_tokenize not
                # splitting on '_' -- confirm).
                tokens = nltk.word_tokenize(line.replace(phrase, target))
                # The substring may have matched inside longer words; keep
                # the line only if the target is a token of its own.
                if target in tokens:
                    paragraphs.append(tokens)
                    # Echo each hit as it is found (kept from the original).
                    print(tokens)
    return paragraphs


# TODO: presumably meant to loop over every entry in ``topics``; for now only
# one hard-coded example is queried.
print(get_paragraphs('the_block'))