Adding some scripts (train_sgmm.sh not finished yet!)

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@904 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8

Adding some scripts (train_sgmm.sh not finished yet!)
356538a1 · Dan Povey · 143fd26d · 356538a1 · 356538a1 · 356538a1
Commit 356538a1 authored 12 years ago by Dan Povey
--- a/egs/wsj/s5/steps/train_quick.sh
+++ b/egs/wsj/s5/steps/train_quick.sh
+#!/bin/bash
+# Copyright 2012  Daniel Povey.  Apache 2.0.
+
+
+# Train a model on top of existing features (no feature-space learning of any
+# kind is done).  This script initializes the p.d.f.'s of the new model from
+# the p.d.f.'s of the previous system's model (the one in <ali-dir>), judging
+# the similarities based on overlap of counts in the tree stats.
+
+# Begin configuration..
+cmd=run.pl
+scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+realign_iters="10 15"; # Only realign twice.
+num_iters=20    # Number of iterations of training
+maxiterinc=15 # Last iter to increase #Gauss on.
+batch_size=750 # batch size to use while compiling graphs... memory/speed tradeoff.
+beam=10 # alignment beam.
+retry_beam=40
+stage=-5
+# End configuration section.
+
+[ -f path.sh ] && . ./path.sh
+. parse_options.sh || exit 1;
+
+# Exactly six positional arguments are required; otherwise print the usage
+# message and stop.
+if [ $# -ne 6 ]; then
+  cat <<'EOF'
+Usage: steps/train_quick.sh <num-leaves> <num-gauss> <data> <lang> <ali-dir> <exp-dir>
+ e.g.: steps/train_quick.sh 2500 15000 data/train_si284 data/lang exp/tri3c_ali_si284 exp/tri4b
+Main options (for others, see top of script file)
+  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs.
+  --config <config-file>                           # config containing options
+  --stage <stage>                                  # stage to do partial re-run from.
+EOF
+  exit 1
+fi
+
+numleaves=$1   # maximum number of tree leaves
+totgauss=$2    # target total number of Gaussians
+data=$3        # training data directory
+lang=$4        # lang directory
+alidir=$5      # directory holding the previous model and its alignments
+dir=$6         # output (experiment) directory
+
+# Stop early if any of the required inputs is missing.
+for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl; do
+  if [ ! -f $f ]; then
+    echo "$0: no such file $f" && exit 1
+  fi
+done
+
+# Set various variables.
+oov=`cat $lang/oov.int`
+silphonelist=`cat $lang/phones/silence.csl`
+ciphonelist=`cat $lang/phones/context_indep.csl`
+numgauss=$[totgauss/2] # Start with half the total number of Gaussians.  We won't have
+  # to mix up much probably, as we're initializing with the old (already mixed-up) pdf's.  
+[ $numgauss -lt $numleaves ] && numgauss=$numleaves
+incgauss=$[($totgauss-$numgauss)/$maxiterinc] # per-iter increment for #Gauss
+nj=`cat $alidir/num_jobs` || exit 1;
+sdata=$data/split$nj
+
+mkdir -p $dir/log
+echo $nj >$dir/num_jobs
+[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
+
+## Set up features.
+# An LDA matrix in the alignment directory means spliced+LDA features;
+# otherwise delta features are used.
+if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
+echo "$0: feature type is $feat_type"
+
+# sifeats: the feature pipeline without any per-speaker transform.  "JOB" is
+# left as a literal placeholder for $cmd to substitute.
+case $feat_type in
+  delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
+  lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
+    # Keep a copy of the matrix next to the new model.
+    cp $alidir/final.mat $dir || exit 1;
+    ;;
+  *) echo "Invalid feature type $feat_type" && exit 1;
+esac
+
+# feats: what training actually consumes.  With speaker transforms in the
+# alignment directory they are appended to the pipeline; without them the
+# plain pipeline is used.  (Previously $feats was left unset in the latter
+# case, so every later "$feats" argument was an empty string.)
+if [ -f $alidir/1.trans ]; then
+  echo "$0: using transforms from $alidir"
+  ln.pl $alidir/*.trans $dir # Link them to dest dir.
+  feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/JOB.trans ark:- ark:- |"
+else
+  feats="$sifeats"
+fi
+##
+
+
+if [ $stage -le -5 ]; then
+  echo "$0: accumulating tree stats"
+  # One acc-tree-stats job per alignment split; job JOB writes $dir/JOB.treeacc.
+  $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
+    acc-tree-stats --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
+      "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc \
+    || exit 1
+  # Refuse to continue unless exactly $nj stats files are present.
+  num_treeacc=$(ls $dir/*.treeacc | wc -w)
+  if [ "$num_treeacc" -ne "$nj" ]; then
+    echo "$0: Wrong #tree-stats" && exit 1
+  fi
+  # Sum the per-job stats into one file, then drop the pieces.
+  if ! sum-tree-stats $dir/treeacc $dir/*.treeacc 2>$dir/log/sum_tree_acc.log; then
+    exit 1
+  fi
+  rm $dir/*.treeacc
+fi
+
+if [ $stage -le -4 ]; then
+  echo "$0: Getting questions for tree clustering."
+  # Questions obtained by clustering the phone sets, followed by the extra
+  # questions supplied in the lang directory.
+  cluster-phones $dir/treeacc $lang/phones/sets.int $dir/questions.int \
+    2> $dir/log/questions.log || exit 1
+  cat $lang/phones/extra_questions.int >> $dir/questions.int
+  compile-questions $lang/topo $dir/questions.int $dir/questions.qst \
+    2>$dir/log/compile_questions.log || exit 1
+
+  echo "$0: Building the tree"
+  $cmd $dir/log/build_tree.log \
+    build-tree --verbose=1 --max-leaves=$numleaves \
+      $dir/treeacc $lang/phones/roots.int \
+      $dir/questions.qst $lang/topo $dir/tree \
+    || exit 1
+fi
+
+if [ $stage -le -3 ]; then
+  echo "$0: Initializing the model"
+
+  # Giving gmm-init-model the old tree and old model as extra arguments (more
+  # than the normal number) makes it initialize the p.d.f.'s to the p.d.f.'s
+  # in the alignment model.
+  gmm-init-model --write-occs=$dir/1.occs \
+    $dir/tree $dir/treeacc $lang/topo $dir/tmp.mdl \
+    $alidir/tree $alidir/final.mdl \
+    2>$dir/log/init_model.log || exit 1
+
+  # Echo any 'no stats' lines from the log and flag them.
+  if grep 'no stats' $dir/log/init_model.log; then
+    echo "$0: This is a bad warning."
+  fi
+  rm $dir/treeacc
+fi
+
+if [ $stage -le -2 ]; then
+  echo "$0: mixing up old model."
+  # The initial model may have either more or fewer Gaussians than we want,
+  # so both --mix-down and --mix-up are set to the target number.
+  gmm-mixup --mix-down=$numgauss --mix-up=$numgauss \
+    $dir/tmp.mdl $dir/1.occs $dir/1.mdl \
+    2> $dir/log/mixup.log || exit 1
+  rm $dir/tmp.mdl
+fi
+
+# Re-express the old alignments in terms of the new tree and model.
+if [ $stage -le -1 ]; then
+  echo "$0: converting old alignments"
+  $cmd JOB=1:$nj $dir/log/convert.JOB.log \
+    convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \
+      "ark:gunzip -c $alidir/ali.JOB.gz|" \
+      "ark:|gzip -c >$dir/ali.JOB.gz" \
+    || exit 1
+fi
+
+# Build one training graph per utterance from the integerized transcripts
+# (field 1 is skipped by "-f 2-"; OOV words are mapped to $oov).
+if [ $stage -le 0 ]; then
+  echo "$0: compiling training graphs"
+  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
+    compile-train-graphs --batch-size=$batch_size $dir/tree $dir/1.mdl $lang/L.fst \
+      "ark:sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text |" \
+      "ark:|gzip -c >$dir/fsts.JOB.gz" \
+    || exit 1
+fi
+
+x=1
+while [ $x -lt $num_iters ]; do
+  echo "$0: pass $x"
+  if echo $realign_iters | grep -w $x >/dev/null && [ $stage -le $x ]; then
+    echo "$0: aligning data"
+    $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
+      gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam $dir/$x.mdl \
+      "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" "ark:|gzip -c >$dir/ali.JOB.gz" \
+      || exit 1;
+  fi
+  if [ $stage -le $x ]; then
+    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
+      gmm-acc-stats-ali  $dir/$x.mdl "$feats" \
+      "ark,s,cs:gunzip -c $dir/ali.JOB.gz|"  $dir/$x.JOB.acc || exit 1;
+    [ "`ls $dir/$x.*.acc | wc -w`" -ne "$nj" ] && echo "$0: wrong #accs" && exit 1;
+    $cmd $dir/log/update.$x.log \
+      gmm-est --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss $dir/$x.mdl \
+      "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1;
+    rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs
+  fi
+  [[ $x -le $maxiterinc ]] && numgauss=$[$numgauss+$incgauss];
+  x=$[$x+1];
+done
+
+# When speaker transforms are in use, also estimate an alignment model:
+# posteriors come from the current alignments, and stats are accumulated with
+# gmm-acc-stats-twofeats, which is given both "$feats" and "$sifeats".
+if [ -f $alidir/1.trans ]; then
+  echo "$0: estimating alignment model"
+  $cmd JOB=1:$nj $dir/log/acc_alimdl.JOB.log \
+    ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
+    gmm-acc-stats-twofeats $dir/$x.mdl "$feats" "$sifeats" \
+      ark,s,cs:- $dir/$x.JOB.acc \
+    || exit 1
+  num_accs=$(ls $dir/$x.*.acc | wc -w)
+  if [ "$num_accs" -ne "$nj" ]; then
+    echo "$0: wrong #accs" && exit 1
+  fi
+
+  $cmd $dir/log/est_alimdl.log \
+    gmm-est --write-occs=$dir/final.occs --remove-low-count-gaussians=false \
+      $dir/$x.mdl "gmm-sum-accs - $dir/$x.*.acc|" $dir/$x.alimdl \
+    || exit 1
+  rm $dir/$x.*.acc
+
+  # Replace any previous final.alimdl link.
+  rm $dir/final.alimdl 2>/dev/null
+  ln -s $x.alimdl $dir/final.alimdl
+fi
+
+# Point final.mdl at the model written by the last iteration.
+rm $dir/final.mdl 2>/dev/null
+ln -s $x.mdl $dir/final.mdl
+
+echo Done
--- a/egs/wsj/s5/steps/train_sgmm.sh
+++ b/egs/wsj/s5/steps/train_sgmm.sh
+#!/bin/bash
+
+# Copyright 2012  Daniel Povey.  Apache 2.0.
+
+# SGMM training, with speaker vectors.  This script would normally be called on
+# top of fMLLR features obtained from a conventional system, but it also works
+# on top of any type of speaker-independent features (based on
+# deltas+delta-deltas or LDA+MLLT).  For more info on SGMMs, see the paper "The
+# subspace Gaussian mixture model--A structured model for speech recognition".
+# (Computer Speech and Language, 2011).
+
+# Begin configuration section.
+nj=4           # NOTE: replaced further down by the contents of <ali-dir>/num_jobs.
+cmd=run.pl     # Program used to launch jobs.  (Was scripts/run.pl, which matches
+               # neither the sibling scripts nor the utils/ path in the usage text.)
+stage=-5       # First stage to run; raise it to resume a partial run.
+context_opts= # e.g. set it to "--context-width=5 --central-position=2"  for a
+# quinphone system.
+scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
+num_iters=25   # Total number of iterations
+num_iters_alimdl=3 # Number of iterations for estimating alignment model.
+maxiterinc=15 # Last iter to increase #substates on.
+realign_iters="5 10 15"; # Iters to realign on.
+spkvec_iters="5 8 12 17" # Iters to estimate speaker vectors on.
+add_dim_iters="6 8 10 12"; # Iters on which to increase phn dim and/or spk dim.
+rand_prune=0.1 # Randomized-pruning parameter for posteriors, to speed up training.
+phn_dim=  # You can use this to set the phonetic subspace dim. [default: feat-dim+1]
+spk_dim=  # You can use this to set the speaker subspace dim. [default: feat-dim]
+
+
+# End configuration section.
+
+# Pick up the local environment if present, then let command-line options
+# override the variables above.
+if [ -f path.sh ]; then . ./path.sh; fi
+. parse_options.sh || exit 1;
+
+
+if [ $# != 7 ]; then
+  echo "Usage: steps/train_sgmm.sh <num-leaves> <num-substates> <data> <lang> <ali-dir> <ubm-dir> <exp-dir>"
+  echo " e.g.: steps/train_sgmm.sh 3500 10000 data/train_si84 data/lang \\"
+  echo "                      exp/tri3b_ali_si84 exp/ubm4a/final.ubm exp/sgmm4a"
+  echo "main options (for others, see top of script file)"
+  echo "  --config <config-file>                           # config containing options"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --silence-weight <sil-weight>                    # weight for silence (e.g. 0.5 or 0.0)"
+  echo "  --num-iters <#iters>                             # Number of iterations of E-M"
+  exit 1;
+fi
+
+
+num_leaves=$1
+totsubstates=$2
+data=$3
+lang=$4
+alidir=$5
+ubm=$6
+dir=$7
+
+# Check some files.
+for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $ubm/final.ubm; do
+  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
+done
+
+
+
+cp $transform_dir/final.mat $dir/final.mat || exit 1;
+
+# Set some variables.
+oov=`cat $lang/oov.int`
+silphonelist=`cat $lang/phones/silence.csl`
+numsubstates=$num_leaves # Initial #-substates.
+incsubstates=$[($totsubstates-$numsubstates)/$maxiterinc] # per-iter increment for #substates
+feat_dim=`gmm-info $alidir/final.model | awk '/feature dimension/{print $NF}'` || exit 1;
+[ $feat_dim -eq $feat_dim ] || exit 1; # make sure it's numeric.
+[ -z $phn_dim ] && phn_dim=$[$feat_dim+1]
+[ -z $spk_dim ] && spk_dim=$feat_dim
+nj=`cat $alidir/num_jobs` || exit 1;
+
+mkdir -p $dir/log
+echo $nj > $dir/num_jobs
+sdata=$data/split$nj;
+[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
+
+
+for n in `get_splits.pl $nj`; do
+  # Initially don't have speaker vectors, but change this after we estimate them.
+  spkvecs_opt[$n]=
+  gselect_opt[$n]="--gselect=ark,s,cs:gunzip -c $dir/$n.gselect.gz|"
+done
+
+
+n1=`get_splits.pl $nj | awk '{print $1}'`
+[ -f $transform_dir/$n1.trans ] && echo "Using speaker transforms from $transform_dir"
+
+for n in `get_splits.pl $nj`; do
+  featspart[$n]="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$data/split$nj/$n/utt2spk ark:$transform_dir/$n.cmvn scp:$data/split$nj/$n/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $dir/final.mat ark:- ark:- |"
+  if [ -f $transform_dir/$n1.trans ]; then
+    featspart[$n]="${featspart[$n]} transform-feats --utt2spk=ark:$data/split$nj/$n/utt2spk ark:$transform_dir/$n.trans ark:- ark:- |"
+  fi
+done
+
+
+if [ ! -f $ubm ]; then
+  echo "No UBM in $ubm"
+  exit 1;
+fi
+
+
+if [ $stage -le -5 ]; then
+  # This stage assumes we won't need the context of silence, which
+  # assumes something about $lang/roots.txt, but it seems pretty safe.
+  echo "Accumulating tree stats"
+  # Background jobs report failure by creating $dir/.error; start clean.
+  rm $dir/.error 2>/dev/null
+  for n in $(get_splits.pl $nj); do
+    # Alignments are read as $alidir/ali.N.gz, the naming this script already
+    # requires of <ali-dir> (it checks for ali.1.gz) and that train_quick.sh
+    # writes; the former $alidir/N.ali.gz did not match that check.
+    $cmd $dir/log/acc_tree.$n.log \
+    acc-tree-stats  $context_opts --ci-phones=$silphonelist $alidir/final.mdl "${featspart[$n]}" \
+      "ark:gunzip -c $alidir/ali.$n.gz|" $dir/$n.treeacc || touch $dir/.error &
+  done
+  wait
+  [ -f $dir/.error ] && echo Error accumulating tree stats && exit 1;
+  # Sum the per-job stats into one file, then drop the pieces.
+  sum-tree-stats $dir/treeacc $dir/*.treeacc 2>$dir/log/sum_tree_acc.log || exit 1;
+  rm $dir/*.treeacc
+fi
+
+if [ $stage -le -4 ]; then
+  echo "Computing questions for tree clustering"
+  # preparing questions, roots file...
+  # The integer files under $lang/phones are used directly, as train_quick.sh
+  # does (and as this script already does for phones/silence.csl), instead of
+  # converting text files from the top level of the lang directory.
+  cluster-phones $context_opts $dir/treeacc $lang/phones/sets.int $dir/questions.int 2> $dir/log/questions.log || exit 1;
+  cat $lang/phones/extra_questions.int >> $dir/questions.int || exit 1;
+  compile-questions $context_opts $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;
+
+  echo "Building tree"
+  $cmd $dir/log/train_tree.log \
+    build-tree $context_opts --verbose=1 --max-leaves=$num_leaves \
+      $dir/treeacc $lang/phones/roots.int \
+      $dir/questions.qst $lang/topo $dir/tree || exit 1;
+
+  # (A leftover "featdim" computation was removed here: its result was never
+  # used -- the subspace dimensions below come from $phn_dim / $spk_dim.)
+
+  # Note: if phn_dim and/or spk_dim are higher than you can initialize with,
+  # sgmm-init will just make them as high as it can (later we'll increase)
+
+  $cmd $dir/log/init_sgmm.log \
+    sgmm-init --phn-space-dim=$phn_dim --spk-space-dim=$spk_dim $lang/topo $dir/tree $ubm \
+      $dir/0.mdl || exit 1;
+
+fi
+
+# Background jobs below report failure by creating $dir/.error; start clean.
+rm $dir/.error 2>/dev/null
+
+if [ $stage -le -3 ]; then
+  echo "Doing Gaussian selection"
+  # One background job per split; each writes $dir/N.gselect.gz.
+  for n in $(get_splits.pl $nj); do
+    $cmd $dir/log/gselect$n.log \
+      sgmm-gselect $dir/0.mdl "${featspart[$n]}" "ark,t:|gzip -c > $dir/$n.gselect.gz" \
+     || touch $dir/.error &
+  done
+  wait;
+  [ -f $dir/.error ] && echo "Error doing Gaussian selection" && exit 1;
+fi
+
+if [ $stage -le -2 ]; then
+  echo "Compiling training graphs"
+  for n in $(get_splits.pl $nj); do
+    # Transcripts are integerized with "-f 2-" (skip the utterance id), the
+    # same sym2int.pl invocation train_quick.sh uses.
+    $cmd $dir/log/compile_graphs$n.log \
+      compile-train-graphs $dir/tree $dir/0.mdl  $lang/L.fst  \
+       "ark:sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $data/split$nj/$n/text |" \
+       "ark:|gzip -c >$dir/$n.fsts.gz" || touch $dir/.error &
+  done
+  wait;
+  [ -f $dir/.error ] && echo "Error compiling training graphs" && exit 1;
+fi
+
+
+if [ $stage -le -1 ]; then
+  echo "Converting alignments"  # don't bother parallelizing; very fast.
+  for n in $(get_splits.pl $nj); do
+    # Input is $alidir/ali.N.gz (the naming checked at the top of the script);
+    # a failed conversion is now fatal instead of being silently ignored.
+    convert-ali $alidir/final.mdl $dir/0.mdl $dir/tree "ark:gunzip -c $alidir/ali.$n.gz|" \
+       "ark:|gzip -c >$dir/$n.ali.gz" 2>$dir/log/convert$n.log || exit 1;
+  done
+fi
+
+x=0
+while [ $x -lt $num_iters ]; do
+   echo "Pass $x ... "
+   if echo $realign_iters | grep -w $x >/dev/null; then
+      if [ $stage -le $x ]; then
+        echo "Aligning data"
+        for n in `get_splits.pl $nj`; do
+          $cmd $dir/log/align.$x.$n.log  \
+            sgmm-align-compiled ${spkvecs_opt[$n]} $scale_opts "${gselect_opt[$n]}" \
+               --utt2spk=ark:$data/split$nj/$n/utt2spk --beam=8 --retry-beam=40 \
+               $dir/$x.mdl "ark:gunzip -c $dir/$n.fsts.gz|" "${featspart[$n]}" \
+               "ark:|gzip -c >$dir/$n.ali.gz" || touch $dir/.error &
+        done
+        wait;
+        [ -f $dir/.error ] && echo "Error realigning data on iter $x" && exit 1;
+      fi
+   fi
+   if [ $spk_dim -gt 0 ] && echo $spkvec_iters | grep -w $x >/dev/null; then
+     for n in `get_splits.pl $nj`; do
+       if [ $stage -le $x ]; then
+         $cmd $dir/log/spkvecs.$x.$n.log \
+           ali-to-post "ark:gunzip -c $dir/$n.ali.gz|" ark:- \| \
+             weight-silence-post 0.01 $silphonelist $dir/$x.mdl ark:- ark:- \| \
+             sgmm-est-spkvecs --spk2utt=ark:$data/split$nj/$n/spk2utt \
+               ${spkvecs_opt[$n]} "${gselect_opt[$n]}" $dir/$x.mdl \
+            "${featspart[$n]}" ark,s,cs:- ark:$dir/tmp$n.vecs  \
+           && mv $dir/tmp$n.vecs $dir/$n.vecs || touch $dir/.error &
+       fi
+       spkvecs_opt[$n]="--spk-vecs=ark:$dir/$n.vecs"
+     done
+     wait;
+     [ -f $dir/.error ] && echo "Error computing speaker vectors on iter $x" && exit 1;     
+   fi  
+   if [ $x -eq 0 ]; then
+     flags=vwcSt # On first iter, don't update M or N.
+   elif [ $spk_dim -gt 0 -a $[$x%2] -eq 1 -a $x -ge `echo $spkvec_iters | awk '{print $1}'` ]; then 
+     # Update N if we have spk-space and x is odd, and we're at least at 1st spkvec iter.
+     flags=vNwcSt
+   else # Else update M but not N.
+     flags=vMwcSt
+   fi
+
+   if [ $stage -le $x ]; then
+     for n in `get_splits.pl $nj`; do
+       $cmd $dir/log/acc.$x.$n.log \
+         sgmm-acc-stats ${spkvecs_opt[$n]} --utt2spk=ark:$data/split$nj/$n/utt2spk \
+           --update-flags=$flags "${gselect_opt[$n]}" --rand-prune=$randprune \
+           $dir/$x.mdl "${featspart[$n]}" "ark,s,cs:ali-to-post 'ark:gunzip -c $dir/$n.ali.gz|' ark:-|" \
+           $dir/$x.$n.acc || touch $dir/.error &
+     done
+     wait;
+     [ -f $dir/.error ] && echo "Error accumulating stats on iter $x" && exit 1;     
+   fi
+
+   add_dim_opts=
+   if echo $add_dim_iters | grep -w $x >/dev/null; then
+     add_dim_opts="--increase-phn-dim=$phn_dim --increase-spk-dim=$spk_dim"
+   fi
+
+   if [ $stage -le $x ]; then
+     $cmd $dir/log/update.$x.log \
+       sgmm-est --update-flags=$flags --split-substates=$numsubstates $add_dim_opts \
+         --write-occs=$dir/$[$x+1].occs $dir/$x.mdl "sgmm-sum-accs - $dir/$x.*.acc|" \
+       $dir/$[$x+1].mdl || exit 1;
+
+     rm $dir/$x.mdl $dir/$x.*.acc
+     rm $dir/$x.occs 
+   fi
+   if [ $x -lt $maxiterinc ]; then
+     numsubstates=$[$numsubstates+$incsubstates]
+   fi
+   x=$[$x+1];
+done
+
+( cd $dir; rm final.mdl final.occs 2>/dev/null; 
+  ln -s $x.mdl final.mdl; 
+  ln -s $x.occs final.occs )
+
+if [ $spk_dim -gt 0 ]; then
+  # If we have speaker vectors, we need an alignment model.
+  # The point of this last phase of accumulation is to get Gaussian-level
+  # alignments with the speaker vectors but accumulate stats without
+  # any speaker vectors; we re-estimate M, w, c and S to get a model
+  # that's compatible with not having speaker vectors.
+
+  # We do this for a few iters, in this recipe.
+  cur_alimdl=$dir/$x.mdl
+  y=0;
+  while [ $y -lt $num_iters_alimdl ]; do
+    echo "Pass $y of building alignment model"
+    if [ $y -eq 0 ]; then
+      flags=MwcS # First time don't update v...
+    else
+      flags=vMwcS # don't update transitions-- will probably share graph with normal model.
+    fi
+    if [ $stage -le $[$y+100] ]; then
+      for n in `get_splits.pl $nj`; do
+        $cmd $dir/log/acc_ali.$y.$n.log \
+          ali-to-post "ark:gunzip -c $dir/$n.ali.gz|" ark:- \| \
+            sgmm-post-to-gpost ${spkvecs_opt[$n]} "${gselect_opt[$n]}" \
+              --utt2spk=ark:$data/split$nj/$n/utt2spk $dir/$x.mdl "${featspart[$n]}" ark,s,cs:- ark:- \| \
+            sgmm-acc-stats-gpost --update-flags=$flags  $cur_alimdl "${featspart[$n]}" \
+              ark,s,cs:- $dir/$y.$n.aliacc || touch $dir/.error &
+      done
+      wait;
+      [ -f $dir/.error ] && echo "Error accumulating stats for alignment model on iter $y" && exit 1;
+      $cmd $dir/log/update_ali.$y.log \
+         sgmm-est --update-flags=$flags --remove-speaker-space=true $cur_alimdl \
+         "sgmm-sum-accs - $dir/$y.*.aliacc|" $dir/$[$y+1].alimdl || exit 1;
+      rm $dir/$y.*.aliacc || exit 1;
+      [ $y -gt 0 ]  && rm $dir/$y.alimdl
+    fi
+    cur_alimdl=$dir/$[$y+1].alimdl
+    y=$[$y+1]
+  done
+  (cd $dir; rm final.alimdl 2>/dev/null; ln -s $y.alimdl final.alimdl )
+fi
+
+
+# Print out summary of the warning messages.
+for x in $dir/log/*.log; do 
+  n=`grep WARNING $x | wc -l`; 
+  if [ $n -ne 0 ]; then echo $n warnings in $x; fi; 
+done
+
+echo Done
--- a/egs/wsj/s5/steps/train_ubm.sh
+++ b/egs/wsj/s5/steps/train_ubm.sh
+#!/bin/bash
+# Copyright 2012  Daniel Povey.  Apache 2.0.
+
+# This trains a UBM (i.e. a mixture of Gaussians), by clustering
+# the Gaussians from a trained HMM/GMM system and then doing a few
+# iterations of UBM training.
+# We mostly use this for SGMM systems.
+
+# Begin configuration section.
+stage=-2                      # first stage to run
+cmd=run.pl                    # program used to launch jobs
+nj=4                          # number of jobs
+num_iters=3                   # number of iterations of UBM training
+intermediate_num_gauss=2000   # passed to init-ubm as --intermediate-num-gauss
+num_gselect1=50               # first stage of Gaussian-selection
+num_gselect2=25               # second stage.
+silence_weight=               # e.g. 0.0, to weight down silence in training
+# End configuration section.
+
+# Pick up the local environment if present, then let command-line options
+# override the variables above.
+if [ -f path.sh ]; then
+  . ./path.sh
+fi
+. parse_options.sh || exit 1
+
+
+# Five positional arguments are required; otherwise print usage and stop.
+if [ $# -ne 5 ]; then
+  echo "Usage: steps/train_ubm.sh <num-gauss> <data> <lang> <ali-dir> <exp>"
+  echo " e.g.: steps/train_ubm.sh 400 data/train_si84 data/lang exp/tri2b_ali_si84 exp/ubm3c"
+  echo "main options (for others, see top of script file)"
+  echo "  --config <config-file>                           # config containing options"
+  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+  echo "  --silence-weight <sil-weight>                    # weight for silence (e.g. 0.5 or 0.0)"
+  echo "  --num-iters <#iters>                             # Number of iterations of E-M"
+  exit 1
+fi
+
+num_gauss=$1   # number of Gaussians in the final UBM
+data=$2        # training data directory
+lang=$3        # lang directory
+alidir=$4      # directory with the trained model and its alignments
+dir=$5         # output (experiment) directory
+
+# Stop early if a required input is missing.
+for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl; do
+  if [ ! -f $f ]; then
+    echo "No such file $f" && exit 1
+  fi
+done
+
+# The intermediate number of Gaussians is raised to twice the target when the
+# configured value is smaller than that.
+min_intermediate=$(($num_gauss*2))
+if [ $min_intermediate -gt $intermediate_num_gauss ]; then
+  echo "intermediate_num_gauss was too small $intermediate_num_gauss"
+  intermediate_num_gauss=$min_intermediate
+  echo "setting it to $intermediate_num_gauss"
+fi
+
+
+# Set various variables.
+silphonelist=`cat $lang/phones/silence.csl` || exit 1;
+nj=`cat $alidir/num_jobs` || exit 1;
+
+mkdir -p $dir/log
+echo $nj > $dir/num_jobs
+sdata=$data/split$nj;
+[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
+
+## Set up features.
+# An LDA matrix in the alignment directory means spliced+LDA features;
+# otherwise delta features are used.
+if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
+echo "$0: feature type is $feat_type"
+
+# sifeats: the feature pipeline without any per-speaker transform.  "JOB" is
+# left as a literal placeholder for $cmd to substitute.
+case $feat_type in
+  delta) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
+  lda) sifeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
+    # Keep a copy of the matrix next to the UBM.
+    cp $alidir/final.mat $dir || exit 1;
+    ;;
+  *) echo "$0: invalid feature type $feat_type" && exit 1;
+esac
+
+# feats: what the stages below consume.  With speaker transforms in the
+# alignment directory they are appended to the pipeline; without them the
+# plain pipeline is used.  (Previously $feats was left unset in the latter
+# case, so every later "$feats" argument was an empty string.)
+if [ -f $alidir/1.trans ]; then
+  echo "$0: using transforms from $alidir"
+  feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/JOB.trans ark:- ark:- |"
+else
+  feats="$sifeats"
+fi
+##
+
+if [ ! -z "$silence_weight" ]; then
+  weights_opt="--weights='gunzip -c $alidir/ali.JOB.gz | ali-to-post ark:- ark:- | weight-silence-post $silence_weight $silphonelist $alidir/final.mdl ark:- ark:- | post-to-weights ark:- ark:- |'"
+else
+  weights_opt=
+fi
+
+if [ $stage -le -2 ]; then
+  echo "$0: clustering model $alidir/final.mdl to get initial UBM"
+  $cmd $dir/log/cluster.log \
+    init-ubm --intermediate-num-gauss=$intermediate_num_gauss --ubm-num-gauss=$num_gauss \
+    --verbose=2 --fullcov-ubm=true $alidir/final.mdl $alidir/final.occs \
+    $dir/0.ubm   || exit 1;
+fi
+
+# Do initial phase of Gaussian selection and save it to disk -- later on we'll
+# do more Gaussian selection to further prune, as the model changes.
+
+
+if [ $stage -le -1 ]; then
+  echo "$0: doing Gaussian selection"
+  $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
+    gmm-gselect --n=$num_gselect1 "fgmm-global-to-gmm $dir/0.ubm - |" "$feats" \
+    "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
+fi
+
+
+x=0
+while [ $x -lt $num_iters ]; do
+  echo "Pass $x"
+  $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
+    gmm-gselect --n=$num_gselect2 "--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" \
+    "fgmm-global-to-gmm $dir/$x.ubm - |" "$feats" ark:- \| \
+    fgmm-global-acc-stats $weights_opt --gselect=ark,s,cs:- $dir/$x.ubm "$feats" \
+    $dir/$x.JOB.acc || exit 1;
+  lowcount_opt="--remove-low-count-gaussians=false"
+  [ $[$x+1] -eq $num_iters ] && lowcount_opt=   # Only remove low-count Gaussians 
+  # on last iter-- we can't do it earlier, or the Gaussian-selection info would
+  # be mismatched.
+  $cmd $dir/log/update.$x.log \
+    fgmm-global-est $lowcount_opt --verbose=2 $dir/$x.ubm "fgmm-global-sum-accs - $dir/$x.*.acc |" \
+      $dir/$[$x+1].ubm || exit 1;
+  rm $dir/$x.*.acc $dir/$x.ubm
+  x=$[$x+1]
+done
+
+rm $dir/gselect.*.gz
+rm $dir/final.ubm 2>/dev/null
+mv $dir/$x.ubm $dir/final.ubm || exit 1;