#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File: lib/collect_all_wiki_in_one.py

"""
@author: Tatjana Chernenko, Utaemon Toyota
@usage: python3 collect_all_wiki_in_one.py input_directory_path output_txt_file_path
@course: Formale Semantik WS 2017/18
@description: Collects all Wikipedia Dump Texts into one large text file.
"""

from pathlib import Path
import shutil
import sys

# Defaults used when the script is started without command line arguments.
rootdir_glob = "/proj/toyota/plain2/"
target_file = "/proj/toyota/all_plain_text2.txt"


def collect_all_files_in_one(input_path=rootdir_glob, output_path=target_file):
    """Append the content of every file below *input_path* to *output_path*.

    The directory tree is searched recursively. Files are processed in sorted
    path order so that repeated runs produce the same concatenation. The
    output file is opened in append mode, i.e. existing content is kept and
    the collected text is added after it.

    Args:
        input_path: Directory that is searched recursively for input files.
        output_path: Text file the content of all input files is appended to.

    Raises:
        OSError: If a file cannot be opened, read or written.
        UnicodeDecodeError: If an input file is not valid UTF-8.
    """
    rootdir = Path(input_path)
    destination = Path(output_path).resolve()
    # Build the list before the output file is opened/created, and leave the
    # output file itself out in case it lives inside the input directory;
    # otherwise it would be read while it is being appended to.
    sources = sorted(
        candidate
        for candidate in rootdir.glob("**/*")
        if candidate.is_file() and candidate.resolve() != destination
    )
    # Open the output once instead of once per input file.
    with open(str(output_path), "a", encoding="utf-8") as output:
        for source in sources:
            with open(str(source), "r", encoding="utf-8") as text_file:
                # Stream in chunks so large dump files are never held in
                # memory as a whole.
                shutil.copyfileobj(text_file, output)


if __name__ == "__main__":
    if len(sys.argv) == 1:
        collect_all_files_in_one()
    elif len(sys.argv) == 3:
        collect_all_files_in_one(sys.argv[1], sys.argv[2])
    else:
        print("@usage: python3 collect_all_wiki_in_one.py input_directory_path output_txt_file_path")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File: lib/preprocess_wikitext.py

"""
@author: Tatjana Chernenko, Utaemon Toyota
@usage: python3 preprocess_wikitext.py input_directory_path output_directory_path
@course: Formale Semantik WS 2017/18
@description: For each extracted Wikipedia text the program preprocesses the text, removing e.g. all
              comments in brackets, and creates new files with each sentence on one line.
"""

import re
from pathlib import Path
import os
import errno
import sys

# Defaults used when the script is started without command line arguments.
rootdir_glob = "/home/utaemon/Semantik_Projekt/results/"
plain_path = "/home/utaemon/Semantik_Projekt/plain/"

# Patterns are compiled once at import time; raw strings avoid the invalid
# escape sequences ("\?", "\.") that non-raw literals would contain.
_TRAILING_PUNCT = re.compile(r"(\?|:|!|\.|,|;)$")
_REPEATED_APOSTROPHES = re.compile(r"''+")
_ELLIPSIS = re.compile(r"\.\.+")
_SPACE_BEFORE_PUNCT = re.compile(r"\s+(\?|:|!|\.|,|;)")
_MULTI_SPACE = re.compile(r"\s\s+")
_SENTENCE_END = re.compile(r"(\?|:|!|\.|;)\s")


def remove_apostrophs(text):
    """Strip quoting apostrophes and double quotes from every token of *text*.

    For each whitespace separated token a single trailing punctuation mark
    (one of ``? : ! . , ;``) is set aside, runs of two or more apostrophes
    are reduced to one, all leading and trailing ``'`` and ``"`` characters
    are removed, and the punctuation mark is appended again.

    Args:
        text: The text to clean.

    Returns:
        The cleaned tokens, each followed by a single space (so a non-empty
        result ends with a space).
    """
    cleaned = []
    for token in text.split():
        trailing = ""
        if _TRAILING_PUNCT.search(token):
            # Keep the punctuation aside so quotes directly before it
            # (e.g. ``'word'.``) can still be stripped.
            trailing = token[-1]
            token = token[:-1]
        token = _REPEATED_APOSTROPHES.sub("'", token)
        token = token.strip("'\"")
        cleaned.append(token + trailing + " ")
    return "".join(cleaned)


def eliminate_brackets(text, patternstart, patternend):
    """Remove bracketed passages, including nested ones, from *text*.

    Args:
        text: The text to process.
        patternstart: Regular expression matching an opening bracket
            character, e.g. ``r"\\("``. It is tested against single characters.
        patternend: Regular expression matching a closing bracket character,
            e.g. ``r"\\)"``. It is tested against single characters.

    Returns:
        The text outside of brackets. Every token that is kept, and every
        token that contained a bracket character (even if nothing of it
        remains), contributes its remaining characters plus one space, so
        the result can contain runs of spaces.

    Note:
        A closing bracket without a matching opening bracket is dropped and
        otherwise ignored. An opening bracket that is never closed still
        suppresses everything up to the end of *text*.
    """
    opening = re.compile(patternstart)
    closing = re.compile(patternend)
    depth = 0
    pieces = []
    for token in text.split():
        if opening.search(token) or closing.search(token):
            kept = []
            for char in token:
                if opening.search(char):
                    depth += 1
                elif closing.search(char):
                    # Never let the depth go negative: a stray closing
                    # bracket (e.g. "1) item") would otherwise make every
                    # following token look like bracket content.
                    if depth > 0:
                        depth -= 1
                elif depth == 0:
                    kept.append(char)
            pieces.append("".join(kept) + " ")
        elif depth == 0:
            pieces.append(token + " ")
    return "".join(pieces)


def get_plain_text(file_path):
    """Read an extracted Wikipedia file and return its cleaned plain text.

    Lines starting with ``<doc``, the line directly following such a line
    (presumably the article title, going by the original variable name
    ``get_title``) and lines starting with ``</doc>`` are skipped. The
    remaining text is passed through ``remove_apostrophs`` and
    ``eliminate_brackets`` (round brackets); afterwards runs of two or more
    dots are deleted, whitespace in front of punctuation is removed and
    repeated whitespace is collapsed to a single space.

    Args:
        file_path: Path of the file to read (``str`` or ``Path``).

    Returns:
        The cleaned text as one string.
    """
    kept_lines = []
    skip_next_line = False
    with open(str(file_path), encoding="utf-8") as text_file:
        for line in text_file:
            if line.startswith("<doc"):
                skip_next_line = True
            elif skip_next_line:
                skip_next_line = False
            elif not line.startswith("</doc>"):
                kept_lines.append(line + " ")
    text = "".join(kept_lines)
    text = remove_apostrophs(text)
    text = eliminate_brackets(text, r"\(", r"\)")
    text = _ELLIPSIS.sub("", text)
    text = _SPACE_BEFORE_PUNCT.sub(r"\1", text)
    text = _MULTI_SPACE.sub(" ", text)
    return text


def split_lines(function):
    """Put every sentence of the given text on its own line.

    Args:
        function: The text to split (the parameter name is kept for
            backward compatibility).

    Returns:
        The text in which every ``?``, ``:``, ``!``, ``.`` or ``;`` that is
        followed by a whitespace character has that whitespace character
        replaced by a newline.
    """
    return _SENTENCE_END.sub(r"\1\n", function)


#--------------write file

def write_plain_file(input_path=rootdir_glob, target_path=plain_path):
    """Preprocess every file below *input_path* and write it to *target_path*.

    Each input file ``.../<dir>/<name>`` is written to
    ``<target_path>/<dir>/<name>``; missing directories are created. The path
    of each processed file is printed as progress output.

    Args:
        input_path: Directory that is searched recursively for input files.
        target_path: Directory below which the preprocessed files are stored.
            A trailing path separator is not required.

    Raises:
        OSError: If a file or directory cannot be read, created or written.
    """
    target_root = Path(target_path)
    sources = sorted(f for f in Path(input_path).glob("**/*") if f.is_file())
    for source in sources:
        print(source)
        # Only the immediate parent directory name is mirrored in the target.
        destination = target_root / source.parent.name / source.name
        destination.parent.mkdir(parents=True, exist_ok=True)
        plain_text = split_lines(get_plain_text(source))
        with open(str(destination), "w", encoding="utf-8") as out_file:
            out_file.write(plain_text + "\n")


if __name__ == "__main__":
    if len(sys.argv) == 1:
        write_plain_file()
    elif len(sys.argv) == 3:
        write_plain_file(sys.argv[1], sys.argv[2])
    else:
        print("@usage: python3 preprocess_wikitext.py input_directory_path output_directory_path")