From e9fa53463ae2640ba705b0d2553cf7611ddf8d2a Mon Sep 17 00:00:00 2001
From: toyota <toyota@cl.uni-heidelberg.de>
Date: Fri, 30 Mar 2018 18:14:46 +0200
Subject: [PATCH] add readme lib

---
 lib/README.md              | 110 +++++++++++++++++++++++++++++++++++--
 lib/preprocess_wikitext.py |  13 ++---
 2 files changed, 111 insertions(+), 12 deletions(-)

diff --git a/lib/README.md b/lib/README.md
index 00e705e..b904bd5 100644
--- a/lib/README.md
+++ b/lib/README.md
@@ -1,13 +1,113 @@
-This is an
-### Input files:
-### Output files:
-### Create a folder structure:
-### RUN THE SYSTEM:

# CHERTOY - Creating a language model with sent2vec

This directory provides the pre-processing steps required to build our own sent2vec models, which are used in the experiments. The two language models we built are a uni-gram and a bi-gram model over the English Wikipedia 2017 corpus.

## RUNNING INSTRUCTIONS

## Pre-Processing the Wikipedia Dump

Download the Wikipedia dump:
- Wikipedia dumps for the English language are provided at https://meta.wikimedia.org/wiki/Data_dump_torrents#English_Wikipedia
- For our models we used enwiki-20170820-pages-articles-multistream.xml.bz2 (14.1 GiB)

Dependencies:
- wikiExtractor: http://attardi.github.io/wikiextractor
- fasttext: https://github.com/facebookresearch/fastText
- sent2vec: https://github.com/epfml/sent2vec

First of all, the compressed dump has to be unpacked to obtain the XML file.
- extracted file: enwiki-20170820-pages-articles-multistream.xml (21.0 GB)

The plain text is then extracted from the XML using wikiExtractor:

WikiExtractor.py -o OUTPUT-DIRECTORY INPUT-XML-FILE

_Example_
WikiExtractor.py -o /wikitext enwiki-20170820-pages-articles-multistream.xml

WikiExtractor creates several directories AA, AB, AC, ..., CH with a total size of 6.2 GB. Each directory contains 100 text files (except CH, which contains 82).
Each article begins with a tag such as <doc id="12" url="https://en.wikipedia.org/wiki?curid=12" title="Anarchism">, and the text still contains remarks in parentheses.
Using preprocess_wikitext.py we delete all doc tags, parentheses together with their content, and quote characters such as ' or ", which yields plain Wikipedia text with one sentence per line.

_Usage_
python3 preprocess_wikitext.py input_directory_path output_txt_file_path

_Example_
python3 preprocess_wikitext.py /home/utaemon/Semantik_Projekt/results/ /home/utaemon/Semantik_Projekt/plain/

The output files are organized in the same directory structure (AA, AB, ...) as the input files.
To collect all texts into one file, collect_all_wiki_in_one.py can be used (a minimal sketch of this step follows below). In our case the resulting file has a total size of 4.1 GB.

_Usage_
python3 collect_all_wiki_in_one.py input_directory_path output_txt_file_path

_Example_
python3 collect_all_wiki_in_one.py /home/utaemon/Semantik_Projekt/plain/ /home/utaemon/Semantik_Projekt/all_plain_texts.txt
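The collection step is plain concatenation of the per-directory text files. The snippet below is only an illustrative sketch of what this step does; the helper name `collect_plain_texts` is made up here, and the actual implementation lives in collect_all_wiki_in_one.py:

```python
# Illustrative sketch of the collection step: concatenate all plain-text
# files produced by preprocess_wikitext.py into one training file.
# The real pipeline uses collect_all_wiki_in_one.py from this repository.
import sys
from pathlib import Path

def collect_plain_texts(input_dir, output_file):
    # Traverse the AA, AB, ... sub-directories in a deterministic order.
    files = sorted(f for f in Path(input_dir).glob("**/*") if f.is_file())
    with open(output_file, "w", encoding="utf-8") as out:
        for path in files:
            with open(path, encoding="utf-8") as fin:
                for line in fin:          # input is one sentence per line
                    line = line.strip()
                    if line:              # drop empty lines
                        out.write(line + "\n")

if __name__ == "__main__":
    collect_plain_texts(sys.argv[1], sys.argv[2])
```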
## Create a new sent2vec model

Move to the sent2vec directory. There you can run the following commands in the terminal.

For uni-grams:
./fasttext sent2vec -input /proj/toyota/all_plain_texts.txt -output /proj/toyota/wiki_model_unigram -minCount 1 -dim 700 -epoch 10 -lr 0.2 -wordNgrams 1 -loss ns -neg 10 -thread 20 -t 0.000005 -dropoutK 4 -minCountLabel 20 -bucket 4000000

For bi-grams:
./fasttext sent2vec -input /proj/toyota/all_plain_texts.txt -output /proj/toyota/wiki_model_bigram -minCount 1 -dim 700 -epoch 10 -lr 0.2 -wordNgrams 2 -loss ns -neg 10 -thread 20 -t 0.000005 -dropoutK 4 -minCountLabel 20 -bucket 4000000

In our case this trains a model over 321 million words with a vocabulary of 4,518,148 words.

### Output models:
Both models are provided under /proj/toyota on the server of the Institute of Computational Linguistics Heidelberg.

Uni-gram model:
wiki_model_unigram.bin (25.4 GB)

Bi-gram model:
wiki_model_bigram.bin (36.6 GB)
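Once trained, a model can be loaded to embed new sentences. The following is a minimal sketch, assuming the Python bindings shipped with the sent2vec repository (https://github.com/epfml/sent2vec) are installed; the example sentences are arbitrary, and inputs should be pre-processed like the training data (plain text, one sentence per line):

```python
# Minimal usage sketch, assuming the sent2vec Python bindings are installed.
import sent2vec

model = sent2vec.Sent2vecModel()
model.load_model("/proj/toyota/wiki_model_unigram.bin")  # or wiki_model_bigram.bin

# Embed a batch of (already pre-processed) sentences.
embeddings = model.embed_sentences([
    "anarchism is a political philosophy .",
    "the quick brown fox jumps over the lazy dog .",
])
print(embeddings.shape)  # (2, 700): one 700-dimensional vector per sentence (-dim 700)
```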
### LICENSES

#### wikiExtractor
GNU GENERAL PUBLIC LICENSE
Version 3, 29 June 2007

Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.

#### fasttext
BSD License

For fastText software

Copyright (c) 2016-present, Facebook, Inc. All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

 * Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

 * Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

 * Neither the name Facebook nor the names of its contributors may be used to
   endorse or promote products derived from this software without specific
   prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#### sent2vec (for code and pre-trained models)
Matteo Pagliardini, Prakhar Gupta, Martin Jaggi: Unsupervised Learning of Sentence Embeddings using Compositional n-Gram Features. NAACL 2018.

diff --git a/lib/preprocess_wikitext.py b/lib/preprocess_wikitext.py
index 3f1c49d..e2f55fa 100644
--- a/lib/preprocess_wikitext.py
+++ b/lib/preprocess_wikitext.py
@@ -3,9 +3,9 @@
 """
 @author: Tatjana Chernenko, Utaemon Toyota
-@usage: python3 preprocess_wikitext.py input_directory_path output_txt_file_path
+@usage: python3 preprocess_wikitext.py input_directory_path output_dir_file_path
 @course: Formale Semantik WS 2017/18
-@description: For each extractet Wikipedia text the programm will preprocess the text and removing e.g. all comments in brackets
+@description: For each extracted Wikipedia text the program will preprocess the text, removing e.g. all comments in parentheses
 and creating new files with each sentence in one line.
 """
@@ -18,10 +18,10 @@ import sys
 rootdir_glob = "/home/utaemon/Semantik_Projekt/results/"
 plain_path = "/home/utaemon/Semantik_Projekt/plain/"

-def remove_apostrophs(text):
+def remove_quotes(text):
     new_text = ""
     for token in text.split():
-        # --- removes all unnecessary apostrophs
+        # --- removes all unnecessary quotes
         temp_punct = ""
         if re.findall("(\?|:|!|\.|,|;)$", token):
             temp_punct = token[-1]
@@ -77,7 +77,7 @@ def get_plain_text(file_path):
                 continue
             else:
                 text += line + " "
-    text = remove_apostrophs(text)
+    text = remove_quotes(text)
     text = eliminate_brackets(text, "\(", "\)")
     text = re.sub("\.\.+","", text)
     text = re.sub(r'\s+(\?|:|!|\.|,|;)', r'\1', text)
@@ -100,7 +100,6 @@ def split_lines(function):

 #--------------write file

-
 def write_plain_file(input_path = rootdir_glob, target_path=plain_path):
     file_list = [f for f in Path(input_path).glob('**/*') if f.is_file()]
     for file in file_list:
@@ -127,4 +126,4 @@ if __name__ == "__main__":
     elif (len(sys.argv)) == 3:
         write_plain_file(sys.argv[1],sys.argv[2])
     else:
-        print("@usage: python3 input_directory_path output_txt_file_path")
\ No newline at end of file
+        print("@usage: python3 input_directory_path output_txt_file_path")
--
GitLab