Skip to content
Snippets Groups Projects
Commit 51a66d3c authored by Dan Povey's avatar Dan Povey
Browse files

Various fixes in s5 scripts.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@873 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 92d97826
No related branches found
No related tags found
No related merge requests found
#!/bin/bash
. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
## This relates to the queue.
if false; then ##TEMP
# This is a shell script, but it's recommended that you run the commands one by
# one by copying and pasting into the shell.
local/wsj_data_prep.sh /mnt/matylda2/data/WSJ?/??-{?,??}.? || exit 1;
# local/wsj_data_prep.sh /mnt/matylda2/data/WSJ?/??-{?,??}.? || exit 1;
#local/wsj_data_prep.sh /export/corpora5/LDC/LDC{93S6,94S13}B/??-{?,??}.? || exit 1;
local/wsj_data_prep.sh /export/corpora5/LDC/LDC{93S6,94S13}B/??-{?,??}.? || exit 1;
local/wsj_prepare_dict.sh || exit 1;
......@@ -28,9 +33,6 @@ local/wsj_format_data.sh || exit 1;
# local/wsj_train_rnnlms.sh
# ) &
. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
## This relates to the queue.
# Now make MFCC features.
# mfccdir should be some place with a largish disk where you
# want to store MFCC features.
......@@ -66,19 +68,13 @@ steps/align_si.sh --nj 10 --cmd "$train_cmd" \
steps/train_deltas.sh --cmd "$train_cmd" \
2000 10000 data/train_si84_half data/lang exp/mono0a_ali exp/tri1 || exit 1;
# TEMP
steps/train_deltas.sh --cmd "$train_cmd" \
2000 10000 data/train_si84_half data/lang.clustsil exp/mono0a_ali exp/tri1.clustsil
utils/mkgraph.sh data/lang_test_tgpr exp/tri1.clustsil exp/tri1.clustsil/graph_tgpr || exit 1;
steps/decode_si.sh --nj 10 --cmd "$decode_cmd" \
exp/tri1.clustsil/graph_tgpr data/test_dev93 exp/tri1.clustsil/decode_tgpr_dev93 || exit 1;
wait; # or the mono mkgraph.sh might be writing
# data/lang_test_tgpr/tmp/LG.fst which will cause this to fail.
utils/mkgraph.sh data/lang_test_tgpr exp/tri1 exp/tri1/graph_tgpr || exit 1;
fi ##TEMP
steps/decode_si.sh --nj 10 --cmd "$decode_cmd" \
exp/tri1/graph_tgpr data/test_dev93 exp/tri1/decode_tgpr_dev93 || exit 1;
steps/decode_si.sh --nj 8 --cmd "$decode_cmd" \
......@@ -153,8 +149,10 @@ steps/align_si.sh --nj 10 --cmd "$train_cmd" \
# Train and test MMI (and boosted MMI) on tri2b system.
steps/make_denlats_si.sh --nj 10 --cmd "$train_cmd" \
data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri2b_denlats_si84 || exit 1;
steps/make_denlats.sh --nj 10 --cmd "$train_cmd" \
data/train_si84 data/lang exp/tri2b exp/tri2b_denlats_si84 || exit 1;
# I AM HERE
steps/train_lda_etc_mmi.sh --nj 10 --cmd "$train_cmd" \
data/train_si84 data/lang exp/tri2b_ali_si84 \
......@@ -169,7 +167,7 @@ utils/decode.sh --cmd "$decode_cmd" steps/decode_lda_mllt.sh \
exp/tri2b/graph_tgpr data/test_eval92 exp/tri2b_mmi_b0.1/decode_tgpr_eval92 || exit 1;
(
# HERE-- new
# HERE-- new
steps/train_lda_etc_dmmi.sh --nj 10 --cmd "$train_cmd" \
data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri2b_denlats_si84 \
exp/tri2b exp/tri2b_dmmi_-1.0_0.1
......
......@@ -21,7 +21,7 @@ retry_beam=40
# End configuration options.
[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh
. parse_options.sh || exit 1;
if [ $# != 4 ]; then
echo "usage: steps/align_si.sh <data-dir> <lang-dir> <src-dir> <align-dir>"
......@@ -48,10 +48,9 @@ sdata=$data/split$nj
cp $srcdir/{tree,final.mdl,final.occs} $dir || exit 1;
if [ -z $feat_type ]; then
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "align_si.sh: feature type is $feat_type"
fi
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "align_si.sh: feature type is $feat_type"
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
......
#!/bin/bash
# Copyright 2012 Daniel Povey.  Apache 2.0.

# Create denominator lattices for MMI/MPE training.
#
# The training data is decoded with a graph whose grammar is a unigram LM
# estimated from the training transcripts ($data/text).  Features are the
# speaker-independent delta or LDA features of the source model; if
# --transform-dir is given, the per-job transforms found there
# (<transform-dir>/<job>.trans) are applied on top of them.
#
# Output: $dir/lat.<job>.gz (one gzipped lattice archive per job), plus the
# decoding graph in $dir/dengraph and the number of jobs in $dir/num_jobs.

# Begin configuration.
nj=4
cmd=run.pl
sub_split=1
beam=13.0
lattice_beam=7.0
acwt=0.1
max_active=5000
transform_dir=
max_mem=20000000 # This will stop the processes getting too large
# (the default is 50M, but this can result in the process getting up to 2G;
# the units are not quite "real" units due to severe inaccuracies in the
# way that program measures how much memory it is using).
# End configuration.

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

if [ $# != 4 ]; then
  echo "Usage: $0 [options] <data-dir> <lang-dir> <src-dir> <exp-dir>"
  echo " e.g.: $0 data/train data/lang exp/tri1 exp/tri1_denlats"
  echo "Works for (delta|lda) features, and (with --transform-dir option) such features"
  echo " plus transforms."
  echo ""
  echo "Main options (for others, see top of script file)"
  echo " --config <config-file> # config containing options"
  echo " --nj <nj> # number of parallel jobs"
  echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo " --sub-split <n-split> # e.g. 40; use this for "
  echo " # large databases so your jobs will be smaller and"
  echo " # will (individually) finish reasonably soon."
  echo " --transform-dir <transform-dir> # directory to find fMLLR transforms."
  exit 1;
fi

data=$1
lang=$2
srcdir=$3
dir=$4

sdata=$data/split$nj
mkdir -p "$dir/log"
# Re-split the data only if the split directory is missing or older than feats.scp.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh "$data" "$nj" || exit 1;
echo "$nj" > "$dir/num_jobs"

oov=$(cat "$lang/oov.int") || exit 1;

# Work on a private copy of the lang directory, because G.fst is overwritten
# below.  "cp -r $lang $dir/" creates $dir/<basename of $lang>, so derive the
# copy's name from $lang instead of assuming it is literally called "lang".
cp -r "$lang" "$dir/" || exit 1;
new_lang=$dir/$(basename "$lang")

# Compute grammar FST which corresponds to unigram decoding graph.
utils/sym2int.pl --map-oov "$oov" -f 2- "$lang/words.txt" < "$data/text" | \
  awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \
  utils/make_unigram_grammar.pl | fstcompile > "$new_lang/G.fst" \
  || exit 1;

# mkgraph.sh expects a whole directory "lang", so put everything in one directory...
# it gets L_disambig.fst and G.fst (among other things) from $new_lang, and
# final.mdl from $srcdir; the output HCLG.fst goes in $dir/dengraph.
if [ -s "$dir/dengraph/HCLG.fst" ]; then
  echo "Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation."
else
  utils/mkgraph.sh "$new_lang" "$srcdir" "$dir/dengraph" || exit 1;
fi

# Unless feat_type was supplied from outside, infer it from the presence of
# an LDA matrix in the source directory.
if [ -z "$feat_type" ]; then
  if [ -f "$srcdir/final.mat" ]; then feat_type=lda; else feat_type=delta; fi
  echo "$0: feature type is $feat_type"
fi

# "JOB" in the feature pipeline is a placeholder that $cmd replaces with the
# job index when it launches each job.
case $feat_type in
  delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
  lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
    cp "$srcdir/final.mat" "$dir"
    ;;
  *) echo "Invalid feature type $feat_type" && exit 1;;
esac

if [ ! -z "$transform_dir" ]; then # add transforms to features...
  echo "Using fMLLR transforms from $transform_dir"
  # Each of these conditions makes the transforms unusable, so stop rather
  # than generate lattices from wrongly-transformed features.
  if [ ! -f "$transform_dir/1.trans" ]; then
    echo "Expected $transform_dir/1.trans to exist."
    exit 1;
  fi
  trans_nj=$(cat "$transform_dir/num_jobs") || exit 1;
  if [ "$trans_nj" -ne "$nj" ]; then
    echo "Mismatch in number of jobs with $transform_dir"
    exit 1;
  fi
  if [ -f "$srcdir/final.mat" ] && ! cmp "$transform_dir/final.mat" "$srcdir/final.mat"; then
    echo "LDA transforms differ between $srcdir and $transform_dir"
    exit 1;
  fi
  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk $transform_dir/JOB.trans ark:- ark:- |"
fi

if [ "$sub_split" -eq 1 ]; then
  # Simple case: one decoding job per data split.
  $cmd JOB=1:$nj $dir/log/decode_den.JOB.log \
    gmm-latgen-faster --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
    --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl \
    $dir/dengraph/HCLG.fst "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
else
  # Large-data case: process the $nj splits one after another, decoding each
  # as $sub_split smaller jobs and then merging the pieces into lat.$n.gz.
  for n in $(seq "$nj"); do
    if [ -f "$dir/.done.$n" ]; then
      echo "Not processing subset $n as already done (delete $dir/.done.$n if not)";
    else
      # Note: .done.$n is only created after the subset has been decoded and
      # merged successfully, so that a failed run is redone next time.
      sdata2=$data/split$nj/$n/split$sub_split;
      if [ ! -d "$sdata2" ] || [ "$sdata2" -ot "$sdata/$n/feats.scp" ]; then
        split_data.sh --per-utt "$sdata/$n" "$sub_split" || exit 1;
      fi
      mkdir -p "$dir/log/$n"
      mkdir -p "$dir/part"
      # Point the feature pipeline at the sub-split directories.  The
      # transforms were estimated per top-level job, so pin JOB.trans to this
      # subset's $n.trans before JOB is reused as the sub-job index.
      feats_subset=$(echo "$feats" | \
        sed -e "s:/JOB\.trans:/$n.trans:g" -e "s:JOB/:$n/split$sub_split/JOB/:g")
      $cmd JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \
        gmm-latgen-faster --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
        --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl \
        $dir/dengraph/HCLG.fst "$feats_subset" "ark:|gzip -c >$dir/lat.$n.JOB.gz" || exit 1;
      echo Merging archives for data subset $n
      # The loop below runs in a pipeline (subshell), so failures are
      # signalled through a marker file rather than a variable.
      rm -f "$dir/.error"
      for k in $(seq "$sub_split"); do
        gunzip -c "$dir/lat.$n.$k.gz" || touch "$dir/.error";
      done | gzip -c > "$dir/lat.$n.gz" || touch "$dir/.error";
      if [ -f "$dir/.error" ]; then
        echo Merging lattices for subset $n failed
        exit 1;
      fi
      rm "$dir"/lat.$n.*.gz
      touch "$dir/.done.$n"
    fi
  done
fi

echo "Done generating denominator lattices."
......@@ -35,8 +35,6 @@ data=$1
lang=$2
dir=$3
[ ! -z $config ] && . $config # Override any of the above, if --config specified.
oov_sym=`cat $lang/oov.int` || exit 1;
mkdir -p $dir/log
......
......@@ -30,7 +30,11 @@ while true; do
esac
done
[ ! -z $config ] && . $config # Override any of the options, if --config was specified.
[ ! -z "$config" ] && . $config # Override any of the options, if --config was specified.
# Check for an empty argument to the --cmd option, which can easily occur as a result
# of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" && exit 1;
true; # so this script returns code zero.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment