Various fixes in s5 scripts.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@873 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8

Various fixes in s5 scripts.
51a66d3c · Dan Povey · 92d97826 · 51a66d3c · 51a66d3c · 51a66d3c
Commit 51a66d3c authored 12 years ago by Dan Povey
--- a/egs/wsj/s5/run.sh
+++ b/egs/wsj/s5/run.sh
 #!/bin/bash

+. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
+          ## This relates to the queue.
+
+
+if false; then ##TEMP
 # This is a shell script, but it's recommended that you run the commands one by
 # one by copying and pasting into the shell.

-local/wsj_data_prep.sh /mnt/matylda2/data/WSJ?/??-{?,??}.? || exit 1;
+# local/wsj_data_prep.sh /mnt/matylda2/data/WSJ?/??-{?,??}.? || exit 1;

-#local/wsj_data_prep.sh  /export/corpora5/LDC/LDC{93S6,94S13}B/??-{?,??}.? || exit 1;
+local/wsj_data_prep.sh  /export/corpora5/LDC/LDC{93S6,94S13}B/??-{?,??}.? || exit 1;

 local/wsj_prepare_dict.sh || exit 1;

@@ -28,9 +33,6 @@ local/wsj_format_data.sh || exit 1;
 #  local/wsj_train_rnnlms.sh
 # ) &

-. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
-          ## This relates to the queue.
-
 # Now make MFCC features.
 # mfccdir should be some place with a largish disk where you
 # want to store MFCC features.
@@ -66,19 +68,13 @@ steps/align_si.sh --nj 10 --cmd "$train_cmd" \
 steps/train_deltas.sh --cmd "$train_cmd" \
    2000 10000 data/train_si84_half data/lang exp/mono0a_ali exp/tri1 || exit 1;

-# TEMP
-  steps/train_deltas.sh --cmd "$train_cmd" \
-    2000 10000 data/train_si84_half data/lang.clustsil exp/mono0a_ali exp/tri1.clustsil
-  utils/mkgraph.sh data/lang_test_tgpr exp/tri1.clustsil exp/tri1.clustsil/graph_tgpr || exit 1;
-  steps/decode_si.sh --nj 10 --cmd "$decode_cmd" \
-    exp/tri1.clustsil/graph_tgpr data/test_dev93 exp/tri1.clustsil/decode_tgpr_dev93 || exit 1;
-
-
 wait; # or the mono mkgraph.sh might be writing 
 # data/lang_test_tgpr/tmp/LG.fst which will cause this to fail.

 utils/mkgraph.sh data/lang_test_tgpr exp/tri1 exp/tri1/graph_tgpr || exit 1;

+fi  ##TEMP
+
 steps/decode_si.sh --nj 10 --cmd "$decode_cmd" \
  exp/tri1/graph_tgpr data/test_dev93 exp/tri1/decode_tgpr_dev93 || exit 1;
 steps/decode_si.sh --nj 8 --cmd "$decode_cmd" \
@@ -153,8 +149,10 @@ steps/align_si.sh  --nj 10 --cmd "$train_cmd" \


 # Train and test MMI (and boosted MMI) on tri2b system.
-steps/make_denlats_si.sh --nj 10 --cmd "$train_cmd" \
-  data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri2b_denlats_si84 || exit 1;
+steps/make_denlats.sh --nj 10 --cmd "$train_cmd" \
+  data/train_si84 data/lang exp/tri2b exp/tri2b_denlats_si84 || exit 1;
+
+# I AM HERE

 steps/train_lda_etc_mmi.sh --nj 10  --cmd "$train_cmd" \
  data/train_si84 data/lang exp/tri2b_ali_si84 \
@@ -169,7 +167,7 @@ utils/decode.sh --cmd "$decode_cmd" steps/decode_lda_mllt.sh \
   exp/tri2b/graph_tgpr data/test_eval92 exp/tri2b_mmi_b0.1/decode_tgpr_eval92 || exit 1;

 (
-# HERE-- new
+  # HERE-- new
  steps/train_lda_etc_dmmi.sh --nj 10  --cmd "$train_cmd" \
   data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri2b_denlats_si84 \
   exp/tri2b exp/tri2b_dmmi_-1.0_0.1

--- a/egs/wsj/s5/steps/align_si.sh
+++ b/egs/wsj/s5/steps/align_si.sh
@@ -21,7 +21,7 @@ retry_beam=40
 # End configuration options.

 [ -f path.sh ] && . ./path.sh # source the path.
-. parse_options.sh
+. parse_options.sh || exit 1;

 if [ $# != 4 ]; then
   echo "usage: steps/align_si.sh <data-dir> <lang-dir> <src-dir> <align-dir>"
@@ -48,10 +48,9 @@ sdata=$data/split$nj
 cp $srcdir/{tree,final.mdl,final.occs} $dir || exit 1;


-if [ -z $feat_type ]; then
-  if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
-  echo "align_si.sh: feature type is $feat_type"
-fi
+if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
+echo "align_si.sh: feature type is $feat_type"
+
 case $feat_type in
  delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
  lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"

--- a/egs/wsj/s5/steps/make_denlats.sh
+++ b/egs/wsj/s5/steps/make_denlats.sh
+#!/bin/bash
+# Copyright 2012  Daniel Povey.  Apache 2.0.
+
+# Create denominator lattices for MMI/MPE training.
+# Uses delta or LDA features (LDA if $srcdir/final.mat exists), optionally
+# followed by the transforms found in the directory given by --transform-dir.
+# Output is written to $dir/lat.*.gz
+
+# Begin configuration.
+nj=4
+cmd=run.pl
+sub_split=1
+beam=13.0
+lattice_beam=7.0
+acwt=0.1
+max_active=5000
+transform_dir=
+max_mem=20000000 # This will stop the processes getting too large 
+# (default is 50M, but this can result in the process getting up to 2G
+#  ... the units are not quite "real" units due to severe inaccuracies in the
+# way that program measures how much memory it is using).
+# End configuration.
+
+[ -f ./path.sh ] && . ./path.sh; # source the path.
+. parse_options.sh || exit 1;
+
+if [ $# != 4 ]; then
+   echo "Usage: steps/make_denlats.sh [options] <data-dir> <lang-dir> <src-dir> <exp-dir>"
+   echo "  e.g.: steps/make_denlats.sh data/train data/lang exp/tri1 exp/tri1_denlats"
+   echo "Works for (delta|lda) features, and (with --transform-dir option) such features"
+   echo " plus transforms."
+   echo ""
+   echo "Main options (for others, see top of script file)"
+   echo "  --config <config-file>                           # config containing options"
+   echo "  --nj <nj>                                        # number of parallel jobs"
+   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+   echo "  --sub-split <n-split>                            # e.g. 40; use this for "
+   echo "                           # large databases so your jobs will be smaller and"
+   echo "                           # will (individually) finish reasonably soon."
+   echo "  --transform-dir <transform-dir>   # directory to find fMLLR transforms."
+   exit 1;
+fi
+
+data=$1
+lang=$2
+srcdir=$3
+dir=$4
+
+sdata=$data/split$nj
+mkdir -p $dir/log
+# Re-split the data unless an up-to-date split directory already exists.
+[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
+echo $nj > $dir/num_jobs
+
+oov=`cat $lang/oov.int` || exit 1;
+
+# Copy the lang directory to $dir/lang whatever $lang itself is called,
+# because everything below refers to $dir/lang.
+mkdir -p $dir/lang
+cp -r $lang/* $dir/lang/ || exit 1;
+
+# Compute grammar FST which corresponds to unigram decoding graph.
+
+cat $data/text | utils/sym2int.pl --map-oov "$oov" -f 2- $lang/words.txt | \
+  awk '{for(n=2;n<=NF;n++){ printf("%s ", $n); } printf("\n"); }' | \
+  utils/make_unigram_grammar.pl | fstcompile > $dir/lang/G.fst \
+   || exit 1;
+
+# mkgraph.sh expects a whole directory "lang", so put everything in one directory...
+# it gets L_disambig.fst and G.fst (among other things) from $dir/lang, and
+# final.mdl from $srcdir; the output HCLG.fst goes in $dir/dengraph.
+
+
+if [ -s $dir/dengraph/HCLG.fst ]; then
+   echo "Graph $dir/dengraph/HCLG.fst already exists: skipping graph creation."
+else
+  utils/mkgraph.sh $dir/lang $srcdir $dir/dengraph || exit 1;
+fi
+
+# Unless feat_type is already set, pick it from the presence of final.mat.
+if [ -z "$feat_type" ]; then
+  if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
+  echo "make_denlats.sh: feature type is $feat_type"
+fi
+
+# "JOB" in the feature pipeline is a placeholder that $cmd replaces with the
+# job index.
+case $feat_type in
+  delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
+  lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
+    cp $srcdir/final.mat $dir    
+   ;;
+  *) echo "Invalid feature type $feat_type" && exit 1;
+   ;;
+esac
+
+if [ ! -z "$transform_dir" ]; then # add transforms to features...
+  echo "Using fMLLR transforms from $transform_dir"
+  # A missing transform or a job-count mismatch would make decoding fail or
+  # use the wrong transforms, so stop here instead of only printing a message.
+  if [ ! -f $transform_dir/1.trans ]; then
+    echo "Expected $transform_dir/1.trans to exist."
+    exit 1;
+  fi
+  # "! [ ... -eq ... ]" also catches a missing or non-numeric num_jobs file.
+  if ! [ "`cat $transform_dir/num_jobs`" -eq "$nj" ]; then
+    echo "Mismatch in number of jobs with $transform_dir"
+    exit 1;
+  fi
+  # Differing LDA matrices are only reported, not treated as fatal.
+  [ -f $srcdir/final.mat ] && ! cmp $transform_dir/final.mat $srcdir/final.mat && \
+     echo "LDA transforms differ between $srcdir and $transform_dir"
+  feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk $transform_dir/JOB.trans ark:- ark:- |"
+fi
+
+
+if [ $sub_split -eq 1 ]; then 
+  $cmd JOB=1:$nj $dir/log/decode_den.JOB.log \
+   gmm-latgen-faster --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
+    --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl  \
+     $dir/dengraph/HCLG.fst "$feats" "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1;
+else
+  # Process the $nj data pieces one at a time, splitting each of them further
+  # into $sub_split per-utterance pieces that are decoded in parallel and then
+  # merged back into a single $dir/lat.$n.gz.
+  for n in `seq $nj`; do
+    if [ -f $dir/.done.$n ]; then
+      echo "Not processing subset $n as already done (delete $dir/.done.$n if not)";
+    else 
+      # Note: $dir/.done.$n is only created at the very end of this branch, so
+      # a subset whose decoding failed is retried on the next run.
+      sdata2=$data/split$nj/$n/split$sub_split;
+      if [ ! -d $sdata2 ] || [ $sdata2 -ot $sdata/$n/feats.scp ]; then
+        split_data.sh --per-utt $sdata/$n $sub_split || exit 1;
+      fi
+      mkdir -p $dir/log/$n
+      mkdir -p $dir/part
+      # Point the data paths at the sub-split directories.  The transforms are
+      # stored per top-level job, so JOB.trans must become $n.trans rather
+      # than follow the sub-job index.
+      feats_subset=`echo "$feats" | sed -e "s:/JOB\.trans:/$n.trans:g" -e "s:JOB/:$n/split$sub_split/JOB/:g"`
+      $cmd JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \
+        gmm-latgen-faster --beam=$beam --lattice-beam=$lattice_beam --acoustic-scale=$acwt \
+        --max-mem=$max_mem --max-active=$max_active --word-symbol-table=$lang/words.txt $srcdir/final.mdl  \
+          $dir/dengraph/HCLG.fst "$feats_subset" "ark:|gzip -c >$dir/lat.$n.JOB.gz" || exit 1;
+      echo Merging archives for data subset $n
+      # The loop below runs on the left side of a pipe, so failures are
+      # signalled through the $dir/.error marker file.
+      rm $dir/.error 2>/dev/null;
+      for k in `seq $sub_split`; do
+        gunzip -c $dir/lat.$n.$k.gz || touch $dir/.error;
+      done | gzip -c > $dir/lat.$n.gz || touch $dir/.error;
+      [ -f $dir/.error ] && echo Merging lattices for subset $n failed && exit 1;
+      rm $dir/lat.$n.*.gz
+      touch $dir/.done.$n
+    fi
+  done
+fi
+
+
+echo "Done generating denominator lattices."
+
--- a/egs/wsj/s5/steps/train_mono.sh
+++ b/egs/wsj/s5/steps/train_mono.sh
@@ -35,8 +35,6 @@ data=$1
 lang=$2
 dir=$3

-[ ! -z $config ] && . $config # Override any of the above, if --config specified.
-
 oov_sym=`cat $lang/oov.int` || exit 1;

 mkdir -p $dir/log

--- a/egs/wsj/s5/utils/parse_options.sh
+++ b/egs/wsj/s5/utils/parse_options.sh
@@ -30,7 +30,11 @@ while true; do
 esac
 done

-[ ! -z $config ] && . $config # Override any of the options, if --config was specified.
+[ ! -z "$config" ] && . $config # Override any of the options, if --config was specified.
+
+# Check for an empty argument to the --cmd option, which can easily occur as a result
+# of scripting errors.
+[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" && exit 1;

 true; # so this script returns code zero.