upload Performances Table and Code for new sent2vec models

4a5a9846 · toyota · d5e986a7 · 4a5a9846 · 4a5a9846 · 4a5a9846
Commit 4a5a9846 authored 7 years ago by toyota
--- a/Performances_Table.pdf
+++ b/Performances_Table.pdf
--- a/lib/collect_all_wiki_in_one.py
+++ b/lib/collect_all_wiki_in_one.py
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+@author: Tatjana Chernenko, Utaemon Toyota
+@usage: python3 collect_all_wiki_in_one.py input_directory_path output_txt_file_path
+@course: Formale Semantik WS 2017/18
+@description: Collects all Wikipedia Dump Texts into one large text file.
+"""
+from pathlib import Path
+import sys
+# Default input directory and output file, used when the script is started
+# without command-line arguments.
+rootdir_glob = "/proj/toyota/plain2/"
+target_file = "/proj/toyota/all_plain_text2.txt"
+def collect_all_files_in_one(input_path = rootdir_glob, output_path = target_file):
+    """Append the contents of every file below ``input_path`` to ``output_path``.
+
+    The input directory is searched recursively and the files are merged in
+    sorted path order, so repeated runs produce the same result. The output
+    file is opened in append mode: existing content is kept and new content
+    is added at the end. If no input file is found, nothing is written and
+    the output file is not created.
+
+    Args:
+        input_path: Directory that is searched recursively for input files.
+        output_path: Path of the text file that receives the merged content.
+
+    Raises:
+        OSError: If a file cannot be opened, read or written.
+        UnicodeDecodeError: If an input file is not valid UTF-8.
+    """
+    import shutil  # local import keeps this function self-contained
+    destination = Path(output_path).resolve()
+    # Skip the output file itself in case it lives inside the input tree;
+    # otherwise a second run would append the merged file to itself.
+    sources = sorted(
+        f for f in Path(input_path).glob('**/*')
+        if f.is_file() and f.resolve() != destination
+    )
+    if not sources:
+        return
+    # Open the output once instead of once per input file.
+    with open(output_path, "a", encoding="utf-8") as merged:
+        for source in sources:
+            with open(str(source), "r", encoding="utf-8") as handle:
+                # Stream in chunks so large dump files are never held in memory whole.
+                shutil.copyfileobj(handle, merged)
+if __name__ == "__main__":
+    # No arguments: run with the module-level default paths.
+    if len(sys.argv) == 1:
+        collect_all_files_in_one()
+    # Two arguments: input directory followed by output text file.
+    elif len(sys.argv) == 3:
+        collect_all_files_in_one(sys.argv[1], sys.argv[2])
+    else:
+        # Any other argument count is a usage error: sys.exit() with a string
+        # prints it to stderr and terminates with a non-zero exit status.
+        sys.exit("@usage: python3 collect_all_wiki_in_one.py input_directory_path output_txt_file_path")
\ No newline at end of file
--- a/lib/preprocess_wikitext.py
+++ b/lib/preprocess_wikitext.py
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+@author: Tatjana Chernenko, Utaemon Toyota
+@usage: python3 preprocess_wikitext.py input_directory_path output_txt_file_path
+@course: Formale Semantik WS 2017/18
+@description: Preprocesses each extracted Wikipedia text, e.g. by removing all comments in brackets,
+    and creates new files with one sentence per line.
+"""
+import re
+from pathlib import Path
+import os
+import errno
+import sys
+# Default input directory, used when no command-line arguments are given.
+rootdir_glob = "/home/utaemon/Semantik_Projekt/results/"
+# Default output directory, used when no command-line arguments are given.
+plain_path = "/home/utaemon/Semantik_Projekt/plain/"
+def remove_apostrophs(text):
+    """Remove superfluous apostrophes and double quotes from every token.
+
+    The text is split on whitespace. For each token, runs of two or more
+    apostrophes are collapsed into one, and all leading and trailing
+    apostrophes and double quotes are stripped. A single trailing punctuation
+    mark (one of ``? : ! . , ;``) is set aside first and re-attached
+    afterwards, so quotes directly in front of it are removed as well.
+
+    Args:
+        text: The text to clean.
+
+    Returns:
+        The cleaned tokens, each followed by exactly one space (the result
+        therefore ends with a space unless ``text`` has no tokens). A token
+        that consists only of quote characters contributes just its space.
+    """
+    sentence_punct = ("?", ":", "!", ".", ",", ";")
+    cleaned = []
+    for token in text.split():
+        # Set trailing punctuation aside so it does not shield quotes from stripping.
+        trailing = ""
+        if token.endswith(sentence_punct):
+            trailing = token[-1]
+            token = token[:-1]
+        # Collapse runs such as '' or ''' into a single apostrophe.
+        token = re.sub(r"''+", "'", token)
+        # Drop every apostrophe / double quote at either end of the token.
+        token = token.strip("'\"")
+        cleaned.append(token + trailing + " ")
+    return "".join(cleaned)
+def eliminate_brackets(text, patternstart, patternend):
+    """Remove bracketed passages, including nested ones, from ``text``.
+
+    The text is split on whitespace and scanned token by token while the
+    current nesting depth is tracked. Characters are kept only at depth 0;
+    the bracket characters themselves are always dropped. A closing bracket
+    without a matching opening bracket is dropped without changing the depth,
+    so a stray closing bracket (as in ``1) first item``) cannot suppress the
+    rest of the text.
+
+    Args:
+        text: The text to filter.
+        patternstart: Regular expression for the opening bracket. It is tested
+            against single characters, so it should match one character.
+        patternend: Regular expression for the closing bracket, with the same
+            single-character restriction.
+
+    Returns:
+        The remaining text, with one space appended after every token that
+        was processed at depth 0 or that contained a bracket (so removed
+        passages may leave several consecutive spaces).
+    """
+    opening = re.compile(patternstart)
+    closing = re.compile(patternend)
+    depth = 0
+    kept = []
+    for token in text.split():
+        if not (opening.search(token) or closing.search(token)):
+            # Bracket-free token: keep it only when outside of all brackets.
+            if depth == 0:
+                kept.append(token + " ")
+            continue
+        visible = []
+        for char in token:
+            if opening.search(char):
+                depth += 1
+            elif closing.search(char):
+                # Ignore unmatched closing brackets instead of going negative.
+                if depth > 0:
+                    depth -= 1
+            elif depth == 0:
+                visible.append(char)
+        kept.append("".join(visible) + " ")
+    return "".join(kept)
+def get_plain_text(file_path):
+    """Read one extracted file and return its cleaned running text.
+
+    Lines starting with ``<doc`` or ``</doc>`` are dropped, as is the line
+    directly after each ``<doc`` line (presumably the article title). The
+    remaining lines are joined and then cleaned: quote characters are removed
+    via ``remove_apostrophs``, passages in round brackets via
+    ``eliminate_brackets``, runs of two or more dots are deleted, whitespace
+    in front of punctuation is removed and repeated whitespace is collapsed
+    to a single space.
+
+    Args:
+        file_path: Path (string or path object) of the file to read.
+
+    Returns:
+        The cleaned text as one string.
+
+    Raises:
+        OSError: If the file cannot be opened or read.
+        UnicodeDecodeError: If the file is not valid UTF-8.
+    """
+    body_lines = []
+    skip_next = False
+    with open(str(file_path), encoding="utf-8") as handle:
+        for line in handle:
+            if line.startswith("<doc"):
+                # The line after the opening tag is skipped as well.
+                skip_next = True
+            elif skip_next:
+                skip_next = False
+            elif not line.startswith("</doc>"):
+                body_lines.append(line + " ")
+    text = "".join(body_lines)
+    text = remove_apostrophs(text)
+    text = eliminate_brackets(text, r"\(", r"\)")
+    # Delete ellipses (two or more consecutive dots).
+    text = re.sub(r"\.\.+", "", text)
+    # Pull punctuation back onto the preceding word.
+    text = re.sub(r'\s+(\?|:|!|\.|,|;)', r'\1', text)
+    text = re.sub(r"\s\s+", " ", text)
+    return text
+def split_lines(function):
+    """Put every sentence of the given text on its own line.
+
+    The text is split wherever one of ``? : ! . ;`` is followed by a
+    whitespace character. The punctuation mark stays attached to its
+    sentence, the whitespace character is replaced by a newline. Text after
+    the last split point is appended without a trailing newline.
+
+    Args:
+        function: The text to split (despite its name, a plain string).
+
+    Returns:
+        The text with one sentence per line.
+    """
+    # With one capture group, re.split alternates: sentence, delimiter, sentence, ...
+    pieces = re.split(r"(\?|:|!|\.|;)\s", function)
+    lines = []
+    for position, piece in enumerate(pieces):
+        is_delimiter = position % 2 == 1
+        lines.append(piece + "\n" if is_delimiter else piece)
+    return "".join(lines)
+#--------------write file
+def write_plain_file(input_path = rootdir_glob, target_path=plain_path):
+    """Preprocess every file below ``input_path`` and write it below ``target_path``.
+
+    The input directory is searched recursively. Each file is cleaned with
+    ``get_plain_text``, split into one sentence per line with ``split_lines``
+    and written to ``target_path/<parent directory name>/<file name>``.
+    Missing output directories are created; existing output files are
+    overwritten. The path of every processed file is printed as progress
+    output.
+
+    Args:
+        input_path: Directory that is searched recursively for input files.
+        target_path: Root directory for the output; a trailing slash is
+            optional.
+
+    Raises:
+        OSError: If a file or directory cannot be read, created or written.
+    """
+    target_root = Path(target_path)
+    # Collect the inputs up front so files written below cannot be picked up
+    # by the search when the target lies inside the input tree.
+    sources = [f for f in Path(input_path).glob('**/*') if f.is_file()]
+    for source in sources:
+        print (source)
+        # Mirror only the immediate parent directory, not the whole input tree.
+        destination = target_root / source.parent.name / source.name
+        destination.parent.mkdir(parents=True, exist_ok=True)
+        plain_text = get_plain_text(source)
+        plain_text = split_lines(plain_text)
+        with open(str(destination), "w", encoding="utf-8") as out:
+            out.write(plain_text + "\n")
+if __name__ == "__main__":
+    # No arguments: run with the module-level default paths.
+    if len(sys.argv) == 1:
+        write_plain_file()
+    # Two arguments: input directory followed by output directory.
+    elif len(sys.argv) == 3:
+        write_plain_file(sys.argv[1], sys.argv[2])
+    else:
+        # Any other argument count is a usage error: sys.exit() with a string
+        # prints it to stderr and terminates with a non-zero exit status.
+        sys.exit("@usage: python3 preprocess_wikitext.py input_directory_path output_txt_file_path")
\ No newline at end of file