Skip to content
Snippets Groups Projects
Commit 1d9b858a authored by Vassil Panayotov's avatar Vassil Panayotov
Browse files

trunk: changes(many "stolen" from babel/s5b) to the VoxForge recipe in order...

trunk: changes (many "stolen" from babel/s5b) to the VoxForge recipe, intended to make it more OS X compatible.
       Not tested yet, so use with caution. The setup now uses SRILM instead of MITLM and, in general, relies more
       on the installation scripts from tools/.


git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@4874 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent c5b33798
Branches
No related tags found
No related merge requests found
......@@ -47,7 +47,7 @@ fi
utils/shuffle_list.pl <$loctmp/speakers_all.txt | head -n $nspk_test | sort -u >$loctmp/speakers_test.txt
gawk 'NR==FNR{spk[$0]; next} !($0 in spk)' \
awk 'NR==FNR{spk[$0]; next} !($0 in spk)' \
$loctmp/speakers_test.txt $loctmp/speakers_all.txt |\
sort -u > $loctmp/speakers_train.txt
......@@ -55,14 +55,14 @@ wc -l $loctmp/speakers_all.txt
wc -l $loctmp/speakers_{train,test}.txt
# expand speaker names to their respective directories
ls -d ${DATA}/*/ |\
find ${DATA}/ -mindepth 1 -maxdepth 1 -type d |\
while read d; do basename $d; done |\
gawk 'BEGIN {FS="-"} NR==FNR{arr[$1]; next;} ($1 in arr)' \
awk 'BEGIN {FS="-"} NR==FNR{arr[$1]; next;} ($1 in arr)' \
$loctmp/speakers_test.txt - | sort > $loctmp/dir_test.txt
ls -d ${DATA}/*/ |\
find ${DATA}/ -mindepth 1 -maxdepth 1 -type d |\
while read d; do basename $d; done |\
gawk 'BEGIN {FS="-"} NR==FNR{arr[$1]; next;} ($1 in arr)' \
awk 'BEGIN {FS="-"} NR==FNR{arr[$1]; next;} ($1 in arr)' \
$loctmp/speakers_train.txt - | sort > $loctmp/dir_train.txt
logdir=exp/data_prep
......@@ -129,11 +129,11 @@ for s in test train; do
done < $loctmp/dir_${s}.txt
# filter out the audio for which there is no proper transcript
gawk 'NR==FNR{trans[$1]; next} ($1 in trans)' FS=" " \
awk 'NR==FNR{trans[$1]; next} ($1 in trans)' FS=" " \
${loctmp}/${s}_trans.txt.unsorted ${loctmp}/${s}_wav.scp.unsorted |\
sort -k1 > ${locdata}/${s}_wav.scp
gawk 'NR==FNR{trans[$1]; next} ($1 in trans)' FS=" " \
awk 'NR==FNR{trans[$1]; next} ($1 in trans)' FS=" " \
${loctmp}/${s}_trans.txt.unsorted $loctmp/${s}.utt2spk.unsorted |\
sort -k1 > ${locdata}/${s}.utt2spk
......@@ -142,7 +142,7 @@ for s in test train; do
echo "--- Preparing ${s}.spk2utt ..."
cat $locdata/${s}_trans.txt |\
cut -f1 -d' ' |\
gawk 'BEGIN {FS="-"}
awk 'BEGIN {FS="-"}
{names[$1]=names[$1] " " $0;}
END {for (k in names) {print k, names[k];}}' | sort -k1 > $locdata/${s}.spk2utt
done;
......@@ -153,7 +153,7 @@ if [ "${trans_err}" -ge 1 ]; then
echo " Check ${logdir}/make_trans.log for details!"
fi
gawk '{spk[$1]=$2;} END{for (s in spk) print s " " spk[s]}' \
awk '{spk[$1]=$2;} END{for (s in spk) print s " " spk[s]}' \
$locdata/spk2gender.tmp | sort -k1 > $locdata/spk2gender
echo "*** Initial VoxForge data preparation finished!"
......@@ -3,6 +3,8 @@
# Copyright 2012 Vassil Panayotov
# Apache 2.0
. path.sh || exit 1
locdata=data/local
locdict=$locdata/dict
......@@ -21,33 +23,17 @@ perl $locdict/cmudict/scripts/make_baseform.pl \
sed -e 's:^\([^\s(]\+\)([0-9]\+)\(\s\+\)\(.*\):\1\2\3:' > $locdict/cmudict-plain.txt
echo "--- Searching for OOV words ..."
gawk 'NR==FNR{words[$1]; next;} !($1 in words)' \
awk 'NR==FNR{words[$1]; next;} !($1 in words)' \
$locdict/cmudict-plain.txt $locdata/vocab-full.txt |\
egrep -v '<.?s>' > $locdict/vocab-oov.txt
gawk 'NR==FNR{words[$1]; next;} ($1 in words)' \
awk 'NR==FNR{words[$1]; next;} ($1 in words)' \
$locdata/vocab-full.txt $locdict/cmudict-plain.txt |\
egrep -v '<.?s>' > $locdict/lexicon-iv.txt
wc -l $locdict/vocab-oov.txt
wc -l $locdict/lexicon-iv.txt
pyver=`python --version 2>&1 | sed -e 's:.*\([2-3]\.[0-9]\+\).*:\1:g'`
if [ ! -f tools/g2p/lib/python${pyver}/site-packages/g2p.py ]; then
echo "--- Downloading Sequitur G2P ..."
echo "NOTE: it assumes that you have Python, NumPy and SWIG installed on your system!"
wget -P tools http://www-i6.informatik.rwth-aachen.de/web/Software/g2p-r1668.tar.gz
tar xf tools/g2p-r1668.tar.gz -C tools
cd tools/g2p
echo '#include <cstdio>' >> Utility.hh # won't compile on my system w/o this "patch"
python setup.py install --prefix=.
cd ../..
if [ ! -f tools/g2p/lib/python${pyver}/site-packages/g2p.py ]; then
echo "Sequitur G2P is not found - installation failed?"
exit 1
fi
fi
if [ ! -f conf/g2p_model ]; then
echo "--- Downloading a pre-trained Sequitur G2P model ..."
wget http://sourceforge.net/projects/kaldi/files/sequitur-model4 -O conf/g2p_model
......@@ -57,10 +43,24 @@ if [ ! -f conf/g2p_model ]; then
fi
fi
# The code below calls "readlink -f". On Mac OS X, route those calls to
# GNU coreutils' greadlink (the stock readlink there may not support -f),
# and bail out early if greadlink is not installed.
if [[ "$(uname)" == "Darwin" ]]; then
  command -v greadlink >/dev/null 2>&1 || \
    { echo "Mac OS X detected and 'greadlink' not found - please install using macports or homebrew"; exit 1; }
  # Use a shell function instead of an alias: bash does not expand aliases
  # in non-interactive shells (i.e. scripts) unless expand_aliases is set,
  # so "alias readlink=greadlink" would silently have no effect here.
  readlink() { greadlink "$@"; }
fi
# Make the Sequitur G2P installation under $KALDI_ROOT/tools visible to the
# shell (PATH) and to Python (PYTHONPATH), then verify that g2p.py can be found.
# NOTE(review): KALDI_ROOT is presumably exported by path.sh - confirm.
sequitur=$KALDI_ROOT/tools/sequitur
export PATH=$PATH:$sequitur/bin
# The python* glob is intentionally left unquoted so that it matches whatever
# Python version directory Sequitur was installed into; only the directory
# prefix is quoted. ${PYTHONPATH:+...} avoids a leading empty entry when
# PYTHONPATH was previously unset or empty.
export PYTHONPATH=${PYTHONPATH:+$PYTHONPATH:}$(readlink -f "$sequitur"/lib/python*/site-packages)
# "command -v" is the POSIX way to look up a command; "which" is an external
# tool whose output and exit status vary between systems.
if ! g2p=$(command -v g2p.py); then
  echo "The Sequitur was not found !"
  echo "Go to $KALDI_ROOT/tools and execute extras/install_sequitur.sh"
  exit 1
fi
echo "--- Preparing pronunciations for OOV words ..."
export PYTHONPATH=`readlink -f tools/g2p/lib/python*/site-packages`
python tools/g2p/g2p.py \
--model=conf/g2p_model --apply $locdict/vocab-oov.txt > $locdict/lexicon-oov.txt
g2p.py --model=conf/g2p_model --apply $locdict/vocab-oov.txt > $locdict/lexicon-oov.txt
cat $locdict/lexicon-oov.txt $locdict/lexicon-iv.txt |\
sort > $locdict/lexicon.txt
......
......@@ -3,6 +3,8 @@
# Copyright 2012 Vassil Panayotov
# Apache 2.0
. path.sh || exit 1
echo "=== Building a language model ..."
locdata=data/local
......@@ -26,26 +28,26 @@ cut -f2- -d' ' < $locdata/train_trans.txt |\
sed -e 's:[ ]\+: :g' |\
sort -u > $loctmp/corpus.txt
if [ ! -f "tools/mitlm-svn/bin/estimate-ngram" ]; then
echo "--- Downloading and compiling MITLM toolkit ..."
mkdir -p tools
command -v svn >/dev/null 2>&1 ||\
{ echo "SVN client is needed but not found" ; exit 1; }
svn checkout -r103 http://mitlm.googlecode.com/svn/trunk/ tools/mitlm-svn
cd tools/mitlm-svn/
F77=gfortran ./autogen.sh
./configure --prefix=`pwd`
make
make install
cd ../..
fi
echo "--- Estimating the LM ..."
if [ ! -f "tools/mitlm-svn/bin/estimate-ngram" ]; then
echo "estimate-ngram not found! MITLM compilation failed?";
exit 1;
loc=`which ngram-count`;
if [ -z $loc ]; then
if uname -a | grep 64 >/dev/null; then # some kind of 64 bit...
sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64
else
sdir=$KALDI_ROOT/tools/srilm/bin/i686
fi
if [ -f $sdir/ngram-count ]; then
echo Using SRILM tools from $sdir
export PATH=$PATH:$sdir
else
echo You appear to not have SRILM tools installed, either on your path,
echo or installed in $sdir. See tools/install_srilm.sh for installation
echo instructions.
exit 1
fi
tools/mitlm-svn/bin/estimate-ngram -t $loctmp/corpus.txt -o $order \
-write-vocab $locdata/vocab-full.txt -wl $locdata/lm.arpa
fi
ngram-count -order $order -write-vocab $locdata/vocab-full.txt -wbdiscount \
-text $loctmp/corpus.txt -lm $locdata/lm.arpa
echo "*** Finished building the LM model!"
......@@ -56,7 +56,7 @@ local/voxforge_map_anonymous.sh ${selected} || exit 1
# Initial normalization of the data
local/voxforge_data_prep.sh --nspk_test ${nspk_test} ${selected} || exit 1
# Download MITLM and prepare an ARPA LM
# Prepare ARPA LM and vocabulary using SRILM
local/voxforge_prepare_lm.sh --order ${lm_order} || exit 1
# Prepare the lexicon and various phone lists
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please to comment