import os
import nltk

data_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/WSI-Evaluator/datasets/MORESQUE'
wiki_path = '/home/students/zimmermann/Courses/ws17/fsem/absinth/test'


def _read_tsv_rows(path):
    """Return the tab-split rows of *path*, skipping its first line.

    The first line is dropped unconditionally (presumably a header row --
    confirm against the dataset files).
    """
    # Use a context manager so the handle is closed deterministically rather
    # than whenever the garbage collector gets around to it.
    with open(path, 'r') as source:
        lines = source.readlines()[1:]
    return [line.strip('\n').split('\t') for line in lines]


topics = _read_tsv_rows(os.path.join(data_path, 'topics.txt'))
results = _read_tsv_rows(os.path.join(data_path, 'results.txt'))


def get_paragraphs(word):
    """Collect tokenized lines from the files in ``wiki_path`` that contain *word*.

    *word* is given in underscore-joined form (e.g. ``'the_block'``). Each
    line of each file is lowercased; if the space-separated form of *word*
    occurs in it, that occurrence is rewritten to the underscore form so the
    tokenizer keeps it as one token. The line is kept only if *word* then
    appears as a token of its own.

    Because lines are lowercased before matching, a *word* containing
    uppercase characters never matches.

    Each kept token list is also printed to stdout as it is found.

    Args:
        word: Target expression with underscores in place of spaces.

    Returns:
        A list of token lists, one per matching line.
    """
    space_word = word.replace('_', ' ')
    paragraphs = []

    for name in os.listdir(wiki_path):
        path = os.path.join(wiki_path, name)
        # os.listdir also returns subdirectories; opening one would raise
        # IsADirectoryError, so only regular files are scanned.
        if not os.path.isfile(path):
            continue

        with open(path, 'r') as source:
            for line in source:
                line = line.lower()
                if space_word not in line:
                    continue

                # Join the multi-word expression so it survives tokenization
                # as a single token.
                new_line = line.replace(space_word, word)
                tokens = nltk.word_tokenize(new_line)

                # The substring test above can hit inside a longer word;
                # requiring an exact token filters those cases out.
                if word in tokens:
                    paragraphs.append(tokens)
                    print(tokens)

    return paragraphs


#for topic in topics:
print(get_paragraphs('the_block'))