trunk: changes (many "stolen" from babel/s5b) to the VoxForge recipe in order...

trunk: changes (many "stolen" from babel/s5b) to the VoxForge recipe, in the hope of making it more OS X compatible. Not tested yet, so use with caution. The setup now uses SRILM instead of MITLM and, in general, relies more on the installation scripts in tools/. git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4874 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8

trunk: changes (many "stolen" from babel/s5b) to the VoxForge recipe in order...
1d9b858a · Vassil Panayotov · c5b33798 · 1d9b858a · 1d9b858a · 1d9b858a
Commit 1d9b858a authored Feb 11, 2015 by Vassil Panayotov
--- a/egs/voxforge/s5/local/voxforge_data_prep.sh
+++ b/egs/voxforge/s5/local/voxforge_data_prep.sh
@@ -47,7 +47,7 @@ fi

 utils/shuffle_list.pl <$loctmp/speakers_all.txt | head -n $nspk_test | sort -u >$loctmp/speakers_test.txt

-gawk 'NR==FNR{spk[$0]; next} !($0 in spk)' \
+awk 'NR==FNR{spk[$0]; next} !($0 in spk)' \
    $loctmp/speakers_test.txt $loctmp/speakers_all.txt |\
  sort -u > $loctmp/speakers_train.txt

@@ -55,14 +55,14 @@ wc -l $loctmp/speakers_all.txt
 wc -l $loctmp/speakers_{train,test}.txt

 # expand speaker names to their respective directories
-ls -d ${DATA}/*/ |\
+find ${DATA}/ -mindepth 1 -maxdepth 1 -type d |\
 while read d; do  basename $d; done |\
- gawk 'BEGIN {FS="-"} NR==FNR{arr[$1]; next;} ($1 in arr)' \
+ awk 'BEGIN {FS="-"} NR==FNR{arr[$1]; next;} ($1 in arr)' \
  $loctmp/speakers_test.txt - | sort > $loctmp/dir_test.txt

-ls -d ${DATA}/*/ |\
+find ${DATA}/ -mindepth 1 -maxdepth 1 -type d |\
 while read d; do  basename $d; done |\
- gawk 'BEGIN {FS="-"} NR==FNR{arr[$1]; next;} ($1 in arr)' \
+ awk 'BEGIN {FS="-"} NR==FNR{arr[$1]; next;} ($1 in arr)' \
  $loctmp/speakers_train.txt - | sort > $loctmp/dir_train.txt

 logdir=exp/data_prep
@@ -129,11 +129,11 @@ for s in test train; do
 done < $loctmp/dir_${s}.txt

 # filter out the audio for which there is no proper transcript
- gawk 'NR==FNR{trans[$1]; next} ($1 in trans)' FS=" " \
+ awk 'NR==FNR{trans[$1]; next} ($1 in trans)' FS=" " \
   ${loctmp}/${s}_trans.txt.unsorted ${loctmp}/${s}_wav.scp.unsorted |\
   sort -k1 > ${locdata}/${s}_wav.scp
 
- gawk 'NR==FNR{trans[$1]; next} ($1 in trans)' FS=" " \
+ awk 'NR==FNR{trans[$1]; next} ($1 in trans)' FS=" " \
   ${loctmp}/${s}_trans.txt.unsorted $loctmp/${s}.utt2spk.unsorted |\
   sort -k1 > ${locdata}/${s}.utt2spk
 
@@ -142,7 +142,7 @@ for s in test train; do
 echo "--- Preparing ${s}.spk2utt ..."
 cat $locdata/${s}_trans.txt |\
  cut -f1 -d' ' |\
-  gawk 'BEGIN {FS="-"}
+  awk 'BEGIN {FS="-"}
        {names[$1]=names[$1] " " $0;}
        END {for (k in names) {print k, names[k];}}' | sort -k1 > $locdata/${s}.spk2utt
 done;
@@ -153,7 +153,7 @@ if [ "${trans_err}" -ge 1 ]; then
  echo " Check ${logdir}/make_trans.log for details!" 
 fi

-gawk '{spk[$1]=$2;} END{for (s in spk) print s " " spk[s]}' \
+awk '{spk[$1]=$2;} END{for (s in spk) print s " " spk[s]}' \
  $locdata/spk2gender.tmp | sort -k1 > $locdata/spk2gender

 echo "*** Initial VoxForge data preparation finished!"
--- a/egs/voxforge/s5/local/voxforge_prepare_dict.sh
+++ b/egs/voxforge/s5/local/voxforge_prepare_dict.sh
@@ -3,6 +3,8 @@
 # Copyright 2012 Vassil Panayotov
 # Apache 2.0

+. path.sh || exit 1
+
 locdata=data/local
 locdict=$locdata/dict

@@ -21,33 +23,17 @@ perl $locdict/cmudict/scripts/make_baseform.pl \
  sed -e 's:^\([^\s(]\+\)([0-9]\+)\(\s\+\)\(.*\):\1\2\3:' > $locdict/cmudict-plain.txt

 echo "--- Searching for OOV words ..."
-gawk 'NR==FNR{words[$1]; next;} !($1 in words)' \
+awk 'NR==FNR{words[$1]; next;} !($1 in words)' \
  $locdict/cmudict-plain.txt $locdata/vocab-full.txt |\
  egrep -v '<.?s>' > $locdict/vocab-oov.txt

-gawk 'NR==FNR{words[$1]; next;} ($1 in words)' \
+awk 'NR==FNR{words[$1]; next;} ($1 in words)' \
  $locdata/vocab-full.txt $locdict/cmudict-plain.txt |\
  egrep -v '<.?s>' > $locdict/lexicon-iv.txt

 wc -l $locdict/vocab-oov.txt
 wc -l $locdict/lexicon-iv.txt

-pyver=`python --version 2>&1 | sed -e 's:.*\([2-3]\.[0-9]\+\).*:\1:g'`
-if [ ! -f tools/g2p/lib/python${pyver}/site-packages/g2p.py ]; then
-  echo "--- Downloading Sequitur G2P ..."
-  echo "NOTE: it assumes that you have Python, NumPy and SWIG installed on your system!"
-  wget -P tools http://www-i6.informatik.rwth-aachen.de/web/Software/g2p-r1668.tar.gz
-  tar xf tools/g2p-r1668.tar.gz -C tools
-  cd tools/g2p
-  echo '#include <cstdio>' >> Utility.hh # won't compile on my system w/o this "patch"
-  python setup.py install --prefix=.
-  cd ../..
-  if [ ! -f tools/g2p/lib/python${pyver}/site-packages/g2p.py ]; then
-    echo "Sequitur G2P is not found - installation failed?"
-    exit 1
-  fi
-fi
-
 if [ ! -f conf/g2p_model ]; then
  echo "--- Downloading a pre-trained Sequitur G2P model ..."
  wget http://sourceforge.net/projects/kaldi/files/sequitur-model4 -O conf/g2p_model
@@ -57,10 +43,24 @@ if [ ! -f conf/g2p_model ]; then
  fi
 fi

+if [[ "$(uname)" == "Darwin" ]]; then
+  command -v greadlink >/dev/null 2>&1 || \
+    { echo "Mac OS X detected and 'greadlink' not found - please install using macports or homebrew"; exit 1; }
+  alias readlink=greadlink
+fi
+
+sequitur=$KALDI_ROOT/tools/sequitur
+export PATH=$PATH:$sequitur/bin
+export PYTHONPATH=$PYTHONPATH:`readlink -f $sequitur/lib/python*/site-packages`
+
+if ! g2p=`which g2p.py` ; then
+  echo "The Sequitur was not found !"
+  echo "Go to $KALDI_ROOT/tools and execute extras/install_sequitur.sh"
+  exit 1
+fi
+
 echo "--- Preparing pronunciations for OOV words ..."
-export PYTHONPATH=`readlink -f tools/g2p/lib/python*/site-packages`
-python tools/g2p/g2p.py \
-  --model=conf/g2p_model --apply $locdict/vocab-oov.txt > $locdict/lexicon-oov.txt
+g2p.py --model=conf/g2p_model --apply $locdict/vocab-oov.txt > $locdict/lexicon-oov.txt

 cat $locdict/lexicon-oov.txt $locdict/lexicon-iv.txt |\
  sort > $locdict/lexicon.txt

--- a/egs/voxforge/s5/local/voxforge_prepare_lm.sh
+++ b/egs/voxforge/s5/local/voxforge_prepare_lm.sh
@@ -3,6 +3,8 @@
 # Copyright 2012 Vassil Panayotov
 # Apache 2.0

+. path.sh || exit 1
+
 echo "=== Building a language model ..."

 locdata=data/local
@@ -26,26 +28,26 @@ cut -f2- -d' ' < $locdata/train_trans.txt |\
   sed -e 's:[ ]\+: :g' |\
   sort -u > $loctmp/corpus.txt

-if [ ! -f "tools/mitlm-svn/bin/estimate-ngram" ]; then
-  echo "--- Downloading and compiling MITLM toolkit ..."
-  mkdir -p tools
-  command -v svn >/dev/null 2>&1 ||\
-    { echo "SVN client is needed but not found" ; exit 1; }
-  svn checkout -r103 http://mitlm.googlecode.com/svn/trunk/ tools/mitlm-svn
-  cd tools/mitlm-svn/
-  F77=gfortran ./autogen.sh
-  ./configure --prefix=`pwd`
-  make
-  make install
-  cd ../..
-fi

-echo "--- Estimating the LM ..."
-if [ ! -f "tools/mitlm-svn/bin/estimate-ngram" ]; then
-  echo "estimate-ngram not found! MITLM compilation failed?";
-  exit 1;
+loc=`which ngram-count`;
+if [ -z $loc ]; then
+  if uname -a | grep 64 >/dev/null; then # some kind of 64 bit...
+    sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64 
+  else
+    sdir=$KALDI_ROOT/tools/srilm/bin/i686
+  fi
+  if [ -f $sdir/ngram-count ]; then
+    echo Using SRILM tools from $sdir
+    export PATH=$PATH:$sdir
+  else
+    echo You appear to not have SRILM tools installed, either on your path,
+    echo or installed in $sdir.  See tools/install_srilm.sh for installation
+    echo instructions.
+    exit 1
  fi
-tools/mitlm-svn/bin/estimate-ngram -t $loctmp/corpus.txt -o $order \
- -write-vocab $locdata/vocab-full.txt -wl $locdata/lm.arpa
+fi
+
+ngram-count -order $order -write-vocab $locdata/vocab-full.txt -wbdiscount \
+  -text $loctmp/corpus.txt -lm $locdata/lm.arpa

 echo "*** Finished building the LM model!"
--- a/egs/voxforge/s5/run.sh
+++ b/egs/voxforge/s5/run.sh
@@ -56,7 +56,7 @@ local/voxforge_map_anonymous.sh ${selected} || exit 1
 # Initial normalization of the data
 local/voxforge_data_prep.sh --nspk_test ${nspk_test} ${selected} || exit 1

-# Download MITLM and prepare an ARPA LM
+# Prepare ARPA LM and vocabulary using SRILM
 local/voxforge_prepare_lm.sh --order ${lm_order} || exit 1

 # Prepare the lexicon and various phone lists