From e9fa53463ae2640ba705b0d2553cf7611ddf8d2a Mon Sep 17 00:00:00 2001
From: toyota <toyota@cl.uni-heidelberg.de>
Date: Fri, 30 Mar 2018 18:14:46 +0200
Subject: [PATCH] add readme lib

---
 lib/README.md              | 110 +++++++++++++++++++++++++++++++++++--
 lib/preprocess_wikitext.py |  13 ++---
 2 files changed, 111 insertions(+), 12 deletions(-)

diff --git a/lib/README.md b/lib/README.md
index 00e705e..b904bd5 100644
--- a/lib/README.md
+++ b/lib/README.md
@@ -1,13 +1,113 @@
-This is an
-### Input files:
-### Output files:
-### Create a folder structure:
-### RUN THE SYSTEM:

# CHERTOY - Creating a language model with sent2vec

This directory provides the pre-processing steps required to build our own sent2vec models, which are used in the experiments. The two language models we built are a uni-gram and a bi-gram model over the English Wikipedia 2017 corpus.

## RUNNING INSTRUCTIONS

## Pre-Processing the Wikipedia Dump

Download the Wikipedia dump:
- Wikipedia dumps for the English language are provided at https://meta.wikimedia.org/wiki/Data_dump_torrents#English_Wikipedia
- For our models we used enwiki-20170820-pages-articles-multistream.xml.bz2 (14.1 GiB)

Dependencies:
- wikiExtractor: http://attardi.github.io/wikiextractor
- fasttext: https://github.com/facebookresearch/fastText
- sent2vec: https://github.com/epfml/sent2vec

First of all, the compressed dump has to be unpacked to obtain the XML file.
- extracted file: enwiki-20170820-pages-articles-multistream.xml (21.0 GB)

The plain text is then extracted from the XML using wikiExtractor:

WikiExtractor.py -o OUTPUT-DIRECTORY INPUT-XML-FILE

_Example_
WikiExtractor.py -o /wikitext enwiki-20170820-pages-articles-multistream.xml

WikiExtractor creates several directories AA, AB, AC, ..., CH with a total size of 6.2 GB. Each directory contains 100 text files (except CH, which contains 82).
Each article begins with a tag such as <doc id="12" url="https://en.wikipedia.org/wiki?curid=12" title="Anarchism">, and the text still contains remarks in parentheses.
Using preprocess_wikitext.py we delete all doc tags, parentheses together with their content, and quote characters such as ' or ", which yields plain Wikipedia text with one sentence per line.

_Usage_
python3 preprocess_wikitext.py input_directory_path output_txt_file_path

_Example_
python3 preprocess_wikitext.py /home/utaemon/Semantik_Projekt/results/ /home/utaemon/Semantik_Projekt/plain/

The output files are organized in the same directory structure (AA, AB, ...) as the input files.
To collect all texts into one file, collect_all_wiki_in_one.py can be used (a minimal sketch of this step follows below). In our case the resulting file has a total size of 4.1 GB.

_Usage_
python3 collect_all_wiki_in_one.py input_directory_path output_txt_file_path

_Example_
python3 collect_all_wiki_in_one.py /home/utaemon/Semantik_Projekt/plain/ /home/utaemon/Semantik_Projekt/all_plain_texts.txt
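The collection step is plain concatenation of the per-directory text files. The snippet below is only an illustrative sketch of what this step does; the helper name `collect_plain_texts` is made up here, and the actual implementation lives in collect_all_wiki_in_one.py:

```python
# Illustrative sketch of the collection step: concatenate all plain-text
# files produced by preprocess_wikitext.py into one training file.
# The real pipeline uses collect_all_wiki_in_one.py from this repository.
import sys
from pathlib import Path

def collect_plain_texts(input_dir, output_file):
    # Traverse the AA, AB, ... sub-directories in a deterministic order.
    files = sorted(f for f in Path(input_dir).glob("**/*") if f.is_file())
    with open(output_file, "w", encoding="utf-8") as out:
        for path in files:
            with open(path, encoding="utf-8") as fin:
                for line in fin:          # input is one sentence per line
                    line = line.strip()
                    if line:              # drop empty lines
                        out.write(line + "\n")

if __name__ == "__main__":
    collect_plain_texts(sys.argv[1], sys.argv[2])
```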
## Create a new sent2vec model

Move to the sent2vec directory. There you can run the following commands in the terminal.

For uni-grams:
./fasttext sent2vec -input /proj/toyota/all_plain_texts.txt -output /proj/toyota/wiki_model_unigram -minCount 1 -dim 700 -epoch 10 -lr 0.2 -wordNgrams 1 -loss ns -neg 10 -thread 20 -t 0.000005 -dropoutK 4 -minCountLabel 20 -bucket 4000000

For bi-grams:
./fasttext sent2vec -input /proj/toyota/all_plain_texts.txt -output /proj/toyota/wiki_model_bigram -minCount 1 -dim 700 -epoch 10 -lr 0.2 -wordNgrams 2 -loss ns -neg 10 -thread 20 -t 0.000005 -dropoutK 4 -minCountLabel 20 -bucket 4000000

In our case this trains a model over 321 million words with a vocabulary of 4,518,148 words.

### Output models:
Both models are provided under /proj/toyota on the server of the Institute of Computational Linguistics Heidelberg.

Uni-gram model:
wiki_model_unigram.bin (25.4 GB)

Bi-gram model:
wiki_model_bigram.bin (36.6 GB)
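Once trained, a model can be loaded to embed new sentences. The following is a minimal sketch, assuming the Python bindings shipped with the sent2vec repository (https://github.com/epfml/sent2vec) are installed; the example sentences are arbitrary, and inputs should be pre-processed like the training data (plain text, one sentence per line):

```python
# Minimal usage sketch, assuming the sent2vec Python bindings are installed.
import sent2vec

model = sent2vec.Sent2vecModel()
model.load_model("/proj/toyota/wiki_model_unigram.bin")  # or wiki_model_bigram.bin

# Embed a batch of (already pre-processed) sentences.
embeddings = model.embed_sentences([
    "anarchism is a political philosophy .",
    "the quick brown fox jumps over the lazy dog .",
])
print(embeddings.shape)  # (2, 700): one 700-dimensional vector per sentence (-dim 700)
```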
### LICENSES

#### wikiExtractor
GNU GENERAL PUBLIC LICENSE
Version 3, 29 June 2007

Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.

#### fasttext
BSD License

For fastText software

Copyright (c) 2016-present, Facebook, Inc. All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

 * Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

 * Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

 * Neither the name Facebook nor the names of its contributors may be used to
   endorse or promote products derived from this software without specific
   prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#### sent2vec (for code and pre-trained models)
Matteo Pagliardini, Prakhar Gupta, Martin Jaggi: Unsupervised Learning of Sentence Embeddings using Compositional n-Gram Features. NAACL 2018.

diff --git a/lib/preprocess_wikitext.py b/lib/preprocess_wikitext.py
index 3f1c49d..e2f55fa 100644
--- a/lib/preprocess_wikitext.py
+++ b/lib/preprocess_wikitext.py
@@ -3,9 +3,9 @@
 """
 @author: Tatjana Chernenko, Utaemon Toyota
-@usage: python3 preprocess_wikitext.py input_directory_path output_txt_file_path
+@usage: python3 preprocess_wikitext.py input_directory_path output_dir_file_path
 @course: Formale Semantik WS 2017/18
-@description: For each extractet Wikipedia text the programm will preprocess the text and removing e.g. all comments in brackets
+@description: For each extracted Wikipedia text the program will preprocess the text, removing e.g. all comments in parentheses
 and creating new files with each sentence in one line.
 """
@@ -18,10 +18,10 @@ import sys
 rootdir_glob = "/home/utaemon/Semantik_Projekt/results/"
 plain_path = "/home/utaemon/Semantik_Projekt/plain/"

-def remove_apostrophs(text):
+def remove_quotes(text):
     new_text = ""
     for token in text.split():
-        # --- removes all unnecessary apostrophs
+        # --- removes all unnecessary quotes
         temp_punct = ""
         if re.findall("(\?|:|!|\.|,|;)$", token):
             temp_punct = token[-1]
@@ -77,7 +77,7 @@ def get_plain_text(file_path):
                 continue
             else:
                 text += line + " "
-    text = remove_apostrophs(text)
+    text = remove_quotes(text)
     text = eliminate_brackets(text, "\(", "\)")
     text = re.sub("\.\.+","", text)
     text = re.sub(r'\s+(\?|:|!|\.|,|;)', r'\1', text)
@@ -100,7 +100,6 @@ def split_lines(function):

 #--------------write file

-
 def write_plain_file(input_path = rootdir_glob, target_path=plain_path):
     file_list = [f for f in Path(input_path).glob('**/*') if f.is_file()]
     for file in file_list:
@@ -127,4 +126,4 @@ if __name__ == "__main__":
     elif (len(sys.argv)) == 3:
         write_plain_file(sys.argv[1],sys.argv[2])
     else:
-        print("@usage: python3 input_directory_path output_txt_file_path")
\ No newline at end of file
+        print("@usage: python3 input_directory_path output_txt_file_path")
--
GitLab