Skip to content
Snippets Groups Projects
Commit 92d97826 authored by Dan Povey's avatar Dan Povey
Browse files

A lot of changes to s5 WSJ recipe. Probably does not work correctly right now.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@872 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 6a20d9a5
No related branches found
No related tags found
No related merge requests found
Showing
with 780 additions and 308 deletions
......@@ -8,7 +8,7 @@ cmd=run.pl
[ $1 == "--cmd" ] && cmd=$2 && shift 2;
[ $# -ne 3 ] && \
echo "Usage: utils/score.sh <data-dir> <lang-dir|graph-dir> <decode-dir>" && exit 1;
echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>" && exit 1;
data=$1
lang_or_graph=$2
......@@ -16,21 +16,11 @@ dir=$3
symtab=$lang_or_graph/words.txt
for f in $symtab $dir/lat.1.gz; do
for f in $symtab $dir/lat.1.gz $data/text; do
[ ! -f $f ] && echo "score.sh: no such file $f" && exit 1;
done
if [ ! -f $symtab ]; then
echo No such word symbol table file $symtab
exit 1;
fi
mkdir -p $dir/scoring/log
# The first phase, independent of how we're going to score, is to get
# transcription files (one-bests) in .tra format, from the lattices.
# If we'll be scoring with sclite, then we also need the alignment (.ali)
# files.
cat $data/text | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > $dir/scoring/test_filt.txt
......@@ -38,7 +28,6 @@ $cmd LMWT=9:20 $dir/scoring/log/best_path.LMWT.log \
lattice-best-path --lm-scale=LMWT --word-symbol-table=$symtab \
"ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/scoring/LMWT.tra || exit 1;
# Note: the double level of quoting for the sed command
$cmd LMWT=9:20 $dir/scoring/log/score.LMWT.log \
cat $dir/scoring/LMWT.tra \| \
......
#!/bin/bash
# Script for Minimum Bayes Risk decoding.
# Runs lattice-mbr-decode over a range of inverse acoustic weights (9..20),
# then scores each output against the filtered reference transcript with
# compute-wer.  Usage:
#   local/score_mbr.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>

[ -f ./path.sh ] && . ./path.sh;

cmd=run.pl
[ "$1" == "--cmd" ] && cmd=$2 && shift 2;

[ $# -ne 3 ] && \
  echo "Usage: local/score_mbr.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>" && exit 1;

data=$1
lang_or_graph=$2
dir=$3

symtab=$lang_or_graph/words.txt

# Check that everything we depend on exists before doing any work.
for f in $symtab $dir/lat.1.gz $data/text; do
  [ ! -f $f ] && echo "score_mbr.sh: no such file $f" && exit 1;
done

mkdir -p $dir/scoring/log

# Reference transcript with noise markers stripped; used by compute-wer below.
cat $data/text | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > $dir/scoring/test_filt.txt

# We submit the jobs separately, not as an array, because it's hard
# to get the inverse of the LM scales.
rm -f $dir/.error   # -f: the error-marker file normally does not exist yet.
for inv_acwt in `seq 9 20`; do
  acwt=`perl -e "print (1.0/$inv_acwt);"`
  $cmd $dir/scoring/rescore_mbr.${inv_acwt}.log \
    lattice-mbr-decode --acoustic-scale=$acwt --word-symbol-table=$symtab \
    "ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/scoring/${inv_acwt}.tra \
    || touch $dir/.error &
done
wait;
# If any background decoding job failed, stop rather than scoring
# incomplete output.
[ -f $dir/.error ] && echo "score_mbr.sh: error getting MBR output." && exit 1;

$cmd LMWT=9:20 $dir/scoring/log/score.LMWT.log \
  cat $dir/scoring/LMWT.tra \| \
  utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \
  compute-wer --text --mode=present \
   ark:$dir/scoring/test_filt.txt ark,p:- ">" $dir/wer_LMWT || exit 1;
......@@ -135,6 +135,7 @@ done
#in case we want to limit lm's on most frequent words, copy lm training word frequency list
cp links/13-32.1/wsj1/doc/lng_modl/vocab/wfl_64.lst $lmdir
chmod u+w $lmdir/*.lst # had weird permissions on source.
# The 20K vocab, open-vocabulary language model (i.e. the one with UNK), without
# verbalized pronunciations. This is the most common test setup, I understand.
......
......@@ -36,12 +36,24 @@ svn co https://cmusphinx.svn.sourceforge.net/svnroot/cmusphinx/trunk/cmudict \
# Make phones symbol-table (adding in silence and verbal and non-verbal noises at this point).
# We are adding suffixes _B, _E, _S for beginning, ending, and singleton phones.
(echo SIL; echo SPN; echo NSN) > $dir/silence_phones.list
cat $dir/cmudict/cmudict.0.7a.symbols | perl -ane 's:\r::; print;' \
>$dir/nonsilence_phones.list
# silence phones, one per line.
(echo SIL; echo SPN; echo NSN) > $dir/silence_phones.txt
# nonsilence phones; on each line is a list of phones that correspond
# really to the same base phone.
cat $dir/cmudict/cmudict.0.7a.symbols | perl -ane 's:\r::; print;' | \
perl -e 'while(<>){
chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_";
$phones_of{$1} .= "$_ "; }
foreach $list (values %phones_of) {print $list . "\n"; } ' \
> $dir/nonsilence_phones.txt || exit 1;
# A few extra questions that will be added to those obtained by automatically clustering
# the "real" phones. These ask about stress; there's also one for silence.
cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dir/extra_questions.txt || exit 1;
cat $dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) {
$p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \
>> $dir/extra_questions.txt || exit 1;
grep -v ';;;' $dir/cmudict/cmudict.0.7a | \
perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \
......
This diff is collapsed.
......@@ -2,48 +2,44 @@
# Copyright 2012 Daniel Povey
# Apache 2.0
# Computes training alignments using a model with delta features.
# Computes training alignments using a model with delta or
# LDA+MLLT features.
# If you supply the --use-graphs option, it will use the training
# graphs from the source directory (where the model is). In this
# case the number of jobs must match with the source directory.
# Begin configuration section.
nj=4
cmd=run.pl
oldgraphs=false
config=
for x in `seq 4`; do
[ "$1" == --use-graphs ] && oldgraphs=true && shift;
[ "$1" == "--num-jobs" ] && nj=$2 && shift 2;
[ "$1" == "--cmd" ] && cmd=$2 && shift 2;
[ "$1" == "--config" ] && config=$2 && shift 2;
done
use_graphs=false
# Begin configuration.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10
retry_beam=40
# End configuration options.
[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh
if [ $# != 4 ]; then
echo "usage: steps/align_deltas.sh <data-dir> <lang-dir> <src-dir> <align-dir>"
echo "e.g.: steps/align_deltas.sh data/train data/lang exp/tri1 exp/tri1_ali"
echo "usage: steps/align_si.sh <data-dir> <lang-dir> <src-dir> <align-dir>"
echo "e.g.: steps/align_si.sh data/train data/lang exp/tri1 exp/tri1_ali"
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --use-graphs true # use graphs in src-dir"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
exit 1;
fi
[ -f path.sh ] && . ./path.sh
data=$1
lang=$2
srcdir=$3
dir=$4
# Begin configuration.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10
retry_beam=40
#End configuration.
[ ! -z $config ] && . $config
oov_sym=`cat $lang/oov.txt`
mkdir -p $dir/log
echo $nj > $dir/num_jobs
sdata=$data/split$nj
......@@ -51,17 +47,28 @@ sdata=$data/split$nj
cp $srcdir/{tree,final.mdl,final.occs} $dir || exit 1;
feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |"
echo "align_deltas.sh: aligning data in $data using model from $srcdir, putting alignments in $dir"
if [ -z $feat_type ]; then
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "align_si.sh: feature type is $feat_type"
fi
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
cp $srcdir/final.mat $dir
;;
*) echo "Invalid feature type $feat_type" && exit 1;
esac
echo "align_si.sh: aligning data in $data using model from $srcdir, putting alignments in $dir"
if $oldgraphs; then
if $use_graphs; then
[ $nj != "`cat $srcdir/num_jobs`" ] && echo "Mismatch in num-jobs" && exit 1;
[ ! -f $srcdir/1.fsts.gz ] && echo "no such file $srcdir/1.fsts.gz" && exit 1;
$cmd JOB=1:$nj $dir/log/align.JOB.log \
gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam $dir/final.mdl \
"ark:gunzip -c $srcdir/$n.fsts.gz|" "$feats" "ark:|gzip -c >$dir/$n.ali.gz" || exit 1;
"ark:gunzip -c $srcdir/JOB.fsts.gz|" "$feats" "ark:|gzip -c >$dir/JOB.ali.gz" || exit 1;
else
tra="ark:utils/sym2int.pl --map-oov \"$oov_sym\" -f 2- $lang/words.txt $sdata/JOB/text|";
# We could just use gmm-align in the next line, but it's less efficient as it compiles the
......
......@@ -9,7 +9,6 @@
# We do this in just one job; it's fast.
# This script takes no options.
if [ $# != 3 ]; then
echo "usage: compute_cmvn_stats.sh [options] <data-dir> <log-dir> <path-to-cmvn-dir>";
exit 1;
......
......@@ -3,23 +3,31 @@
# Copyright 2012 Daniel Povey
# Apache 2.0
[ -f ./path.sh ] && . ./path.sh; # source the path.
# Begin configuration section.
nj=4
cmd=run.pl
config=
maxactive=7000
beam=13.0
latbeam=6.0
acwt=0.083333 # note: only really affects pruning (scoring is on lattices).
# End configuration section.
for x in `seq 3`; do
[ $1 == "--num-jobs" ] && nj=$2 && shift 2;
[ $1 == "--cmd" ] && cmd=$2 && shift 2;
[ $1 == "--config" ] && config=$2 && shift 2;
done
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
echo "Usage: steps/decode_deltas.sh <graph-dir> <data-dir> <decode-dir>"
echo "Usage: steps/decode_si.sh [options] <graph-dir> <data-dir> <decode-dir>"
echo "... where <decode-dir> is assumed to be a sub-directory of the directory"
echo " where the model is."
echo "e.g.: steps/decode_deltas.sh exp/mono/graph_tgpr data/test_dev93 exp/mono/decode_dev93_tgpr"
echo "e.g.: steps/decode_si.sh exp/mono/graph_tgpr data/test_dev93 exp/mono/decode_dev93_tgpr"
echo ""
echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out"
echo "what type of features you used (assuming it's one of these two)"
echo ""
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
exit 1;
fi
......@@ -34,20 +42,18 @@ mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
# Begin configuration.
maxactive=7000
beam=13.0
latbeam=6.0
acwt=0.083333
# End configuration.
[ ! -z $config ] && . $config # Override any of the above, if --config specified.
for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $srcdir/final.mdl $graphdir/HCLG.fst; do
[ ! -f $f ] && echo "decode_deltas.sh: no such file $f" && exit 1;
[ ! -f $f ] && echo "decode_si.sh: no such file $f" && exit 1;
done
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "decode_si.sh: feature type is $feat_type";
feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |"
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
*) echo "Invalid feature type $feat_type" && exit 1;
esac
$cmd JOB=1:$nj $dir/log/decode.JOB.log \
gmm-latgen-faster --max-active=$maxactive --beam=$beam --lattice-beam=$latbeam \
......
#!/bin/bash
# Copyright 2012 Daniel Povey
# Apache 2.0

# Decode with a "big" LM: the HCLG graph was built with a small LM; during
# lattice generation we subtract the old-LM scores and add new-LM scores on
# the fly (gmm-latgen-biglm-faster).  Works on CMN + (delta+delta-delta |
# LDA+MLLT) features; the feature type is worked out from the model dir.

# Begin configuration.
nj=4
cmd=run.pl
maxactive=7000
beam=13.0
latbeam=6.0
acwt=0.083333   # note: only really affects pruning (scoring is on lattices).
# End configuration.

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

if [ $# != 5 ]; then
   echo "Usage: steps/decode_si_biglm.sh [options] <graph-dir> <old-LM-fst> <new-LM-fst> <data-dir> <decode-dir>"
   echo "... where <decode-dir> is assumed to be a sub-directory of the directory"
   echo " where the model is."
   echo "e.g.: steps/decode_si_biglm.sh exp/tri2b/graph_tgpr data/lang_test_tgpr/G.fst data/lang_test_tg/G.fst data/test_dev93 exp/tri2b/decode_dev93_tgpr_tg"
   echo ""
   echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out"
   echo "what type of features you used (assuming it's one of these two)"
   echo ""
   echo "main options (for others, see top of script file)"
   echo " --config <config-file> # config containing options"
   echo " --nj <nj> # number of parallel jobs"
   echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   exit 1;
fi

graphdir=$1
oldlm_fst=$2
newlm_fst=$3
data=$4
dir=$5
srcdir=`dirname $dir`; # The model directory is one level up from decoding directory.
sdata=$data/split$nj;

mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs

for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $srcdir/final.mdl $graphdir/HCLG.fst $oldlm_fst $newlm_fst; do
  [ ! -f $f ] && echo "decode_si_biglm.sh: no such file $f" && exit 1;
done

# The presence of final.mat tells us the model was trained on LDA+MLLT features.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "decode_si_biglm.sh: feature type is $feat_type"

case $feat_type in
  delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
  lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
  *) echo "Invalid feature type $feat_type" && exit 1;
esac

# Sanity check: both LMs should have been built with the same words.txt as
# the decoding graph.  (Bug fix: the second check previously compared the
# *old* LM's words.txt again instead of the new LM's.)
[ -f `dirname $oldlm_fst`/words.txt ] && ! cmp `dirname $oldlm_fst`/words.txt $graphdir/words.txt && \
  echo "Warning: old LM words.txt does not match with that in $graphdir .. probably will not work.";
[ -f `dirname $newlm_fst`/words.txt ] && ! cmp `dirname $newlm_fst`/words.txt $graphdir/words.txt && \
  echo "Warning: new LM words.txt does not match with that in $graphdir .. probably will not work.";

# Project the LM FSTs onto their output (word) labels and arc-sort on input
# labels, as required by gmm-latgen-biglm-faster.
oldlm_cmd="fstproject --project_output=true $oldlm_fst | fstarcsort --sort_type=ilabel |"
newlm_cmd="fstproject --project_output=true $newlm_fst | fstarcsort --sort_type=ilabel |"

$cmd JOB=1:$nj $dir/log/decode.JOB.log \
  gmm-latgen-biglm-faster --max-active=$maxactive --beam=$beam --lattice-beam=$latbeam \
  --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
  $srcdir/final.mdl $graphdir/HCLG.fst "$oldlm_cmd" "$newlm_cmd" "$feats" \
  "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;

[ ! -x local/score.sh ] && \
  echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
local/score.sh --cmd "$cmd" $data $graphdir $dir

exit 0;
#!/bin/bash
mode=4
cmd=scripts/run.pl
cmd=run.pl
for x in `seq 2`; do
[ "$1" == "--cmd" ] && cmd=$2 && shift 2;
......@@ -10,7 +10,8 @@ done
if [ $# != 5 ]; then
echo "Do language model rescoring of lattices (remove old LM, add new LM)"
echo "Usage: scripts/lmrescore.sh <old-lang-dir> <new-lang-dir> <data-dir> <input-decode-dir> <output-decode-dir>"
echo "Usage: steps/lmrescore.sh [options] <old-lang-dir> <new-lang-dir> <data-dir> <input-decode-dir> <output-decode-dir>"
echo "options: [--cmd (run.pl|queue.pl [queue opts])] [--mode (1|2|3|4)]"
exit 1;
fi
......
......@@ -6,36 +6,26 @@
# see ../run.sh for example
nj=4
cmd=utils/run.pl
config=conf/mfcc.conf
cmd=run.pl
mfcc_config=conf/mfcc.conf
for x in 1 2; do
if [ $1 == "--num-jobs" ]; then
nj=$2
shift 2
fi
if [ $1 == "--cmd" ]; then
cmd=$2
shift 2
fi
if [ $1 == "--config" ]; then
config=$2
shift 2
fi
done
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
echo "usage: make_mfcc.sh [options] <data-dir> <log-dir> <path-to-mfccdir>";
echo "options: [--config <config-file>] [--num-jobs <num-jobs>] [--cmd utils/run.pl|utils/queue.pl]"
echo "options: "
echo " --mfcc-config <config-file> # config passed to compute-mfcc-feats "
echo " --nj <nj> # number of parallel jobs"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
exit 1;
fi
if [ -f path.sh ]; then . ./path.sh; fi
data=$1
logdir=$2
mfccdir=$3
# make $mfccdir an absolute pathname.
mfccdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $mfccdir ${PWD}`
......@@ -47,7 +37,7 @@ mkdir -p $logdir || exit 1;
scp=$data/wav.scp
required="$scp $config"
required="$scp $mfcc_config"
for f in $required; do
if [ ! -f $f ]; then
......@@ -73,7 +63,7 @@ if [ -f $data/segments ]; then
$cmd JOB=1:$nj $logdir/make_mfcc.JOB.log \
extract-segments scp:$scp $logdir/segments.JOB ark:- \| \
compute-mfcc-feats --verbose=2 --config=$config ark:- \
compute-mfcc-feats --verbose=2 --config=$mfcc_config ark:- \
ark,scp:$mfccdir/raw_mfcc_$name.JOB.ark,$mfccdir/raw_mfcc_$name.JOB.scp \
|| exit 1;
......@@ -87,7 +77,7 @@ else
utils/split_scp.pl $scp $split_scps || exit 1;
$cmd JOB=1:$nj $logdir/make_mfcc.JOB.log \
compute-mfcc-feats --verbose=2 --config=$config scp:$logdir/wav.JOB.scp \
compute-mfcc-feats --verbose=2 --config=$mfcc_config scp:$logdir/wav.JOB.scp \
ark,scp:$mfccdir/raw_mfcc_$name.JOB.ark,$mfccdir/raw_mfcc_$name.JOB.scp \
|| exit 1;
......
......@@ -3,25 +3,30 @@
# Copyright 2012 Daniel Povey
# Apache 2.0
# Begin configuration.
stage=-4 # This allows restarting after partway, when something when wrong.
nj=4
cmd=run.pl
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
realign_iters="10 20 30";
numiters=35 # Number of iterations of training
maxiterinc=25 # Last iter to increase #Gauss on.
beam=10
retry_beam=40
# End configuration.
for x in `seq 4`; do
[ "$1" == "--num-jobs" ] && nj=$2 && shift 2;
[ "$1" == "--cmd" ] && cmd=$2 && shift 2;
[ "$1" == "--config" ] && config=$2 && shift 2;
[ "$1" == "--stage" ] && stage=$2 && shift 2;
done
[ -f path.sh ] && . ./path.sh;
. parse_options.sh || exit 1;
if [ $# != 6 ]; then
echo "Usage: steps/train_deltas.sh <num-leaves> <tot-gauss> <data-dir> <lang-dir> <alignment-dir> <exp-dir>"
echo "e.g.: steps/train_deltas.sh 2000 10000 data/train_si84_half data/lang exp/mono_ali exp/tri1"
echo "main options (for others, see top of script file)"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --config <config-file> # config containing options"
echo " --stage <stage> # stage to do partial re-run from."
exit 1;
fi
[ -f path.sh ] && . ./path.sh;
numleaves=$1
totgauss=$2
data=$3
......@@ -29,28 +34,20 @@ lang=$4
alidir=$5
dir=$6
# Begin configuration.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
realign_iters="10 20 30";
numiters=35 # Number of iterations of training
maxiterinc=25 # Last iter to increase #Gauss on.
beam=10
retry_beam=40
# End configuration.
[ ! -z $config ] && . $config # Override any of the above, if --config specified.
[ ! -f $alidir/final.mdl ] && echo "Error: no such file $alidir/final.mdl" && exit 1;
for f in $alidir/final.mdl $alidir/1.ali.gz $data/feats.scp $lang/phones.txt; do
[ ! -f $f ] && echo "train_deltas.sh: no such file $f" && exit 1;
done
numgauss=$numleaves
incgauss=$[($totgauss-$numgauss)/$maxiterinc] # per-iter increment for #Gauss
oov=`cat $lang/oov.int` || exit 1;
ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;
nj=`cat $alidir/num_jobs` || exit 1;
mkdir -p $dir/log
echo $nj > $dir/num_jobs
sdata=$data/split$nj;
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
[ "$nj" -ne "`cat $alidir/num_jobs`" ] && echo "Number of jobs does not match $alidir" && exit 1;
feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |"
......@@ -68,7 +65,7 @@ fi
if [ $stage -le -2 ]; then
echo "Getting questions for tree-building, via clustering"
# preparing questions, roots file...
cluster-phones $dir/treeacc $lang/phones/sets_cluster.int $dir/questions.int 2> $dir/log/questions.log || exit 1;
cluster-phones $dir/treeacc $lang/phones/sets.int $dir/questions.int 2> $dir/log/questions.log || exit 1;
cat $lang/phones/extra_questions.int >> $dir/questions.int
compile-questions $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;
......@@ -80,9 +77,9 @@ if [ $stage -le -2 ]; then
gmm-init-model --write-occs=$dir/1.occs \
$dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1;
grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning.";
# could mix up if we wanted:
# gmm-mixup --mix-up=$numgauss $dir/1.mdl $dir/1.occs $dir/1.mdl 2>$dir/log/mixup.log || exit 1;
gmm-mixup --mix-up=$numgauss $dir/1.mdl $dir/1.occs $dir/1.mdl 2>$dir/log/mixup.log || exit 1;
rm $dir/treeacc
fi
......@@ -122,9 +119,7 @@ while [ $x -lt $numiters ]; do
rm $dir/$x.mdl $dir/$x.*.acc
rm $dir/$x.occs
fi
if [ $x -le $maxiterinc ]; then
numgauss=$[$numgauss+$incgauss];
fi
[ $x -le $maxiterinc ] && numgauss=$[$numgauss+$incgauss];
x=$[$x+1];
done
......@@ -134,7 +129,7 @@ ln -s $x.occs $dir/final.occs
# Summarize warning messages...
for x in $dir/log/*.log; do
[ `grep WARNING $x | wc -l` -ne 0 ] && echo $n warnings in $x;
n=`grep WARNING $x | wc -l`; [ $n -ne 0 ] && echo $n warnings in $x;
done
echo Done training system with delta features.
echo Done training system with delta+delta-delta features in $dir
#!/bin/bash
# Copyright 2012 Daniel Povey
# Apache 2.0.
#
# LDA+MLLT triphone training.  Trains a context-dependent GMM system on
# spliced, LDA-transformed features, re-estimating a global MLLT (STC)
# transform on the iterations listed in $mllt_iters.  Negative stages
# (-4..0) allow restarting partway:
#   -4: accumulate/estimate LDA;  -3: tree stats;  -2: build tree;
#   -1: convert alignments to the new tree;  0: compile training graphs.
# Begin configuration.
cmd=run.pl
config=
stage=-4
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
realign_iters="10 20 30";
mllt_iters="2 4 6 12";
numiters=35 # Number of iterations of training
maxiterinc=25 # Last iter to increase #Gauss on.
dim=40
beam=10
retry_beam=40
# End configuration.
[ -f path.sh ] && . ./path.sh
. parse_options.sh || exit 1;
if [ $# != 6 ]; then
echo "Usage: steps/train_lda_mllt.sh [options] <#leaves> <#gauss> <data> <lang> <alignments> <dir>"
echo " e.g.: steps/train_lda_mllt.sh 2500 15000 data/train_si84 data/lang exp/tri1_ali_si84 exp/tri2b"
echo "main options (for others, see top of script file)"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --config <config-file> # config containing options"
echo " --stage <stage> # stage to do partial re-run from."
exit 1;
fi
numleaves=$1
totgauss=$2
data=$3
lang=$4
alidir=$5
dir=$6
# Check that all required inputs exist before doing any work.
for f in $alidir/final.mdl $alidir/1.ali.gz $data/feats.scp $lang/phones.txt; do
[ ! -f $f ] && echo "train_lda_mllt.sh: no such file $f" && exit 1;
done
numgauss=$numleaves
incgauss=$[($totgauss-$numgauss)/$maxiterinc] # per-iter increment for #Gauss
randprune=4.0 # This is approximately the ratio by which we will speed up the
# LDA and MLLT calculations via randomized pruning.
oov=`cat $lang/oov.int` || exit 1;
# The number of parallel jobs is inherited from the alignment directory.
nj=`cat $alidir/num_jobs` || exit 1;
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;
mkdir -p $dir/log
echo $nj >$dir/num_jobs
sdata=$data/split$nj;
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
# Spliced (context-window) CMVN features; the LDA/MLLT transform is applied on top.
splicedfeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats ark:- ark:- |"
# Note: $feats gets overwritten later in the script.
feats="$splicedfeats transform-feats $dir/0.mat ark:- ark:- |"
# Stage -4: accumulate LDA statistics (silence frames down-weighted to 0)
# and estimate the initial LDA matrix 0.mat.
if [ $stage -le -4 ]; then
echo "Accumulating LDA statistics."
$cmd JOB=1:$nj $dir/log/lda_acc.JOB.log \
ali-to-post "ark:gunzip -c $alidir/JOB.ali.gz|" ark:- \| \
weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark:- \| \
acc-lda --rand-prune=$randprune $alidir/final.mdl "$splicedfeats" ark,s,cs:- \
$dir/lda.JOB.acc || exit 1;
est-lda --dim=$dim $dir/0.mat $dir/lda.*.acc 2>$dir/log/lda_est.log || exit 1;
rm $dir/lda.*.acc
fi
# Index of the iteration whose .mat holds the current composed LDA(+MLLT)
# transform; updated each time MLLT is re-estimated below.
cur_lda_iter=0
# Stage -3: accumulate phonetic-context tree statistics on LDA features.
if [ $stage -le -3 ]; then
echo "Accumulating tree stats"
$cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
acc-tree-stats --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
"ark:gunzip -c $alidir/JOB.ali.gz|" $dir/JOB.treeacc || exit 1;
$cmd $dir/log/sum_tree_acc.log \
sum-tree-stats $dir/treeacc $dir/*.treeacc || exit 1;
rm $dir/*.treeacc
fi
# Stage -2: cluster phones into questions, build the decision tree, and
# initialize the model from the tree statistics.
if [ $stage -le -2 ]; then
echo "Computing questions for tree clustering"
# preparing questions, roots file...
cluster-phones $dir/treeacc $lang/phones/sets.int $dir/questions.int 2> $dir/log/questions.log || exit 1;
cat $lang/phones/extra_questions.int >> $dir/questions.int
compile-questions $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;
echo "Building the tree"
$cmd $dir/log/build_tree.log \
build-tree --verbose=1 --max-leaves=$numleaves \
$dir/treeacc $lang/phones/roots.int \
$dir/questions.qst $lang/topo $dir/tree || exit 1;
gmm-init-model --write-occs=$dir/1.occs \
$dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1;
grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning.";
# could mix up if we wanted:
# gmm-mixup --mix-up=$numgauss $dir/1.mdl $dir/1.occs $dir/1.mdl 2>$dir/log/mixup.log || exit 1;
rm $dir/treeacc
fi
# Stage -1: map the source-directory alignments onto the newly built tree.
if [ $stage -le -1 ]; then
# Convert the alignments.
echo "Converting alignments from $alidir to use current tree"
$cmd JOB=1:$nj $dir/log/convert.JOB.log \
convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \
"ark:gunzip -c $alidir/JOB.ali.gz|" "ark:|gzip -c >$dir/JOB.ali.gz" || exit 1;
fi
# Stage 0: compile per-utterance training graphs from the transcripts.
if [ $stage -le 0 ]; then
echo "Compiling graphs of transcripts"
$cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
compile-train-graphs $dir/tree $dir/1.mdl $lang/L.fst \
"ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $data/split$nj/JOB/text |" \
"ark:|gzip -c >$dir/JOB.fsts.gz" || exit 1;
fi
# Main EM training loop: realign on $realign_iters, re-estimate MLLT on
# $mllt_iters, and re-estimate the GMMs on every iteration.
x=1
while [ $x -lt $numiters ]; do
echo Training pass $x
if echo $realign_iters | grep -w $x >/dev/null && [ $stage -le $x ]; then
echo Aligning data
$cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam $dir/$x.mdl \
"ark:gunzip -c $dir/JOB.fsts.gz|" "$feats" \
"ark:|gzip -c >$dir/JOB.ali.gz" || exit 1;
fi
if echo $mllt_iters | grep -w $x >/dev/null; then
if [ $stage -le $x ]; then
echo "Estimating MLLT"
$cmd JOB=1:$nj $dir/log/macc.$x.JOB.log \
ali-to-post "ark:gunzip -c $dir/JOB.ali.gz|" ark:- \| \
weight-silence-post 0.0 $silphonelist $dir/$x.mdl ark:- ark:- \| \
gmm-acc-mllt --rand-prune=$randprune $dir/$x.mdl "$feats" ark:- $dir/$x.JOB.macc \
|| exit 1;
est-mllt $dir/$x.mat.new $dir/$x.*.macc 2> $dir/log/mupdate.$x.log || exit 1;
gmm-transform-means $dir/$x.mat.new $dir/$x.mdl $dir/$x.mdl \
2> $dir/log/transform_means.$x.log || exit 1;
# Compose the new MLLT with the previous composed transform, producing
# $x.mat, which becomes the current feature transform below.
compose-transforms --print-args=false $dir/$x.mat.new $dir/$cur_lda_iter.mat $dir/$x.mat || exit 1;
rm $dir/$x.*.macc
fi
feats="$splicedfeats transform-feats $dir/$x.mat ark:- ark:- |"
cur_lda_iter=$x
fi
if [ $stage -le $x ]; then
$cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
gmm-acc-stats-ali $dir/$x.mdl "$feats" \
"ark,s,cs:gunzip -c $dir/JOB.ali.gz|" $dir/$x.JOB.acc || exit 1;
$cmd $dir/log/update.$x.log \
gmm-est --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss $dir/$x.mdl \
"gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1;
rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs
fi
[ $x -le $maxiterinc ] && numgauss=$[$numgauss+$incgauss];
x=$[$x+1];
done
# Publish the final model, occupancies, and feature transform as symlinks.
rm $dir/final.{mdl,mat,occs} 2>/dev/null
ln -s $x.mdl $dir/final.mdl
ln -s $x.occs $dir/final.occs
ln -s $cur_lda_iter.mat $dir/final.mat
# Summarize warning messages...
for x in $dir/log/*.log; do
n=`grep WARNING $x | wc -l`; [ $n -ne 0 ] && echo $n warnings in $x;
done
echo Done training system with LDA+MLLT features in $dir
......@@ -7,20 +7,27 @@
# Flat start and monophone training, with delta-delta features.
# This script applies cepstral mean normalization (per speaker).
# Begin configuration section.
nj=4
cmd=run.pl
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
numiters=40 # Number of iterations of training
maxiterinc=30 # Last iter to increase #Gauss on.
totgauss=1000 # Target #Gaussians.
realign_iters="1 2 3 4 5 6 7 8 9 10 12 14 16 18 20 23 26 29 32 35 38";
config= # name of config file.
# End configuration section.
for x in `seq 3`; do
[ "$1" == "--num-jobs" ] && nj=$2 && shift 2;
[ "$1" == "--cmd" ] && cmd=$2 && shift 2;
[ "$1" == "--config" ] && config=$2 && shift 2;
done
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
echo "Usage: steps/train_mono.sh [options] <data-dir> <lang-dir> <exp-dir>"
echo " e.g.: steps/train_mono.sh data/train.1k data/lang exp/mono"
echo "options: [--cmd (run.pl|queue.pl [opts])] [--num-jobs <nj>] [--config <cfg-file>]"
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
exit 1;
fi
......@@ -28,15 +35,6 @@ data=$1
lang=$2
dir=$3
if [ -f path.sh ]; then . ./path.sh; fi
# Begin configuration.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
numiters=40 # Number of iterations of training
maxiterinc=30 # Last iter to increase #Gauss on.
totgauss=1000 # Target #Gaussians.
realign_iters="1 2 3 4 5 6 7 8 9 10 12 14 16 18 20 23 26 29 32 35 38";
# End configuration.
[ ! -z $config ] && . $config # Override any of the above, if --config specified.
oov_sym=`cat $lang/oov.int` || exit 1;
......@@ -52,8 +50,8 @@ example_feats="`echo '$feats' | sed s/JOB/1/g`";
echo "Initializing monophone system."
if [ -f $lang/phones/sets_mono.int ]; then
shared_phones_opt="--shared-phones=$lang/phones/sets_mono.int"
if [ -f $lang/phones/sets.int ]; then
shared_phones_opt="--shared-phones=$lang/phones/sets.int"
fi
# Note: JOB=1 just uses the 1st part of the features-- we only need a subset anyway.
......
#!/usr/bin/perl -w
# Copyright 2012 Daniel Povey
# Apache 2.0.

# This program is a bit like ./sym2int.pl in that it applies a map
# to things in a file, but it's a bit more general in that it doesn't
# assume the things being mapped to are single tokens: they could
# be sequences of tokens.
#
# It takes exactly one argument, the map file (which may be "-" for the
# standard input).  Each line of the map file has one or more fields and
# is interpreted as a map from the first field (a string) to the list of
# remaining fields.  E.g. if the map file contains the lines
#   x P
#   y Q R
# then the input line "x y" (read from stdin) is output as "P Q R".
#
# Note: if a token in the input does not appear as the first field of
# any map line, this program dies; it does not pass the token through
# or silently drop the line.

if (@ARGV != 1) {
  print STDERR "Usage: apply_map.pl map <input >output\n" .
    "e.g.: echo A B | apply_map.pl a.txt\n" .
    "where a.txt is:\n" .
    "A a1 a2\n" .
    "B b\n" .
    "will produce:\n" .
    "a1 a2 b\n";
  exit 1;  # bug fix: previously fell through after printing usage.
}

($map) = @ARGV;
open(M, "<$map") || die "Opening map file $map";
while (<M>) {
  @A = split(" ", $_);
  @A >= 1 || die "apply_map.pl: empty line.";
  $i = shift @A;        # key: the token to be replaced.
  $o = join(" ", @A);   # value: the (possibly empty) replacement sequence.
  $map{$i} = $o;
}

while (<STDIN>) {
  @A = split(" ", $_);
  for ($x = 0; $x < @A; $x++) {
    $a = $A[$x];
    # Die on unknown tokens (message previously named compose_maps.pl).
    if (!defined $map{$a}) { die "apply_map.pl: undefined key $a\n"; }
    $A[$x] = $map{$a};
  }
  print join(" ", @A) . "\n";
}
#!/bin/bash
# Copyright Daniel Povey, 2012.  Apache 2.0.
# Parse command-line options-- to be sourced by another script
# (as in ". parse_options.sh").  Option format is:
#   --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The caller must have assigned a default to each option variable;
# an option whose variable is undefined is reported as invalid.

# The following assignment allows the --config variable to be specified
# in all cases, even if the caller did not declare it.
[ -z "$config" ] && config=

while true; do
  case "$1" in
    # If the first command-line argument begins with "--" (e.g. --foo-bar),
    # work out the variable name as $name, which will equal "foo_bar".
    --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
      # Next we test whether the variable in question is undefined-- if so it's
      # an invalid option and we die.  Note: $0 evaluates to the name of the
      # enclosing script.
      # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
      # is undefined.  We then have to wrap this test inside "eval" because
      # foo_bar is itself inside a variable ($name).
      eval '[ -z "${'$name'+xxx}" ] && echo "$0: invalid option $1" && exit 1;'
      # Set the variable to the right value-- the escaped quotes make it work
      # if the option had spaces, like --cmd "queue.pl -sync y".
      eval $name=\"$2\"; shift 2;;
    *) break;
  esac
done

# Override any of the options, if --config was specified.
# Bug fix: quote $config so a filename containing spaces (or an empty
# value) is handled correctly by the test and the "." (source) command.
[ -n "$config" ] && . "$config"

true; # so this script returns code zero.
......@@ -6,7 +6,19 @@
# given a source directory containing a dictionary lexicon.txt in a form like:
# word phone1 phone2 ... phonen
# per line (alternate prons would be separate lines).
# and also files silence_phones.list and nonsilence_phones.list
# and also files silence_phones.txt and nonsilence_phones.txt, and extra_questions.txt
# Here, silence_phones.txt and nonsilence_phones.txt are lists of silence and
# non-silence phones respectively (where silence includes various kinds of noise,
# laugh, cough, filled pauses etc., and nonsilence phones includes the "real" phones.)
# on each line of those files is a list of phones, and the phones on each line are
# assumed to correspond to the same "base phone", i.e. they will be different stress
# or tone variations of the same basic phone.
# extra_questions.txt might be empty; typically will consist of lists of phones, all
# members of each list with the same stress or tone or something; and also a list for
# the silence phones. This will augment the automatically generated questions (note:
# the automatically generated ones will treat all the stress/tone versions of a phone
# the same, so will not "get to ask" about stress or tone).
# This script adds word-position-dependent phones and constructs a host of other
# derived files, that go in data/lang/.
......@@ -17,8 +29,8 @@ if [ $# -ne 3 ]; then
fi
tmpdir=$2
srcdir=$1
tmpdir=$2
dir=$3
mkdir -p $dir $tmpdir $dir/phones
......@@ -31,28 +43,53 @@ perl -ane '@A=split(" ",$_); $w = shift @A; @A>0||die;
for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \
<$srcdir/lexicon.txt >$tmpdir/lexicon.txt || exit 1;
for f in $srcdir/{,non}silence_phones.list; do
for f in $srcdir/{,non}silence_phones.txt $srcdir/extra_questions.txt; do
[ ! -f $f ] && echo "No such file $f" && exit 1;
done
echo "<eps>" | \
cat - <(for x in `cat $srcdir/silence_phones.list`; do for y in "" "_B" "_E" "_I" "_S"; do echo "$x$y"; done; done) | \
cat - <(for x in `cat $srcdir/nonsilence_phones.list`; do for y in "_B" "_E" "_I" "_S"; do echo "$x$y"; done; done) | \
awk '{ n=NR-1; print $1, n; }' > $tmpdir/phones_nodisambig.txt
# Now create lists of silence phones and nonsilence phones; and word-begin and word-end
# information.
rm $tmpdir/{,non}silence_phones.list 2>/dev/null
for x in `grep -v -w '<eps>' $tmpdir/phones_nodisambig.txt | awk '{print $1}'`; do
basephone=`echo $x | sed s/_[BEIS]$//`;
if grep -w $basephone <$srcdir/silence_phones.list >/dev/null; then # was silence
echo $x >>$tmpdir/silence_phones.list
else
echo $x >>$tmpdir/nonsilence_phones.list
fi
# create $tmpdir/phone_map.txt
# this has the format (on each line)
# <original phone> <version 1 of original phone> <version 2 of original phone> ...
# where the different versions depend on word position. For instance, we'd have
# AA AA_B AA_E AA_I AA_S
# and in the case of silence
# SIL SIL SIL_B SIL_E SIL_I SIL_S
# [because SIL on its own is one of the variants; this is for when it doesn't
# occur inside a word but as an option in the lexicon.]
# This phone map expands the phone lists into all the word-position-dependent
# versions of the phone lists.
cat <(for x in `cat $srcdir/silence_phones.txt`; do for y in "" "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \
<(for x in `cat $srcdir/nonsilence_phones.txt`; do for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \
> $tmpdir/phone_map.txt
mkdir -p $dir/phones # various sets of phones...
# Sets of phones for use in clustering, and making monophone systems.
cat $srcdir/{,non}silence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt > $dir/phones/sets.txt
cat $dir/phones/sets.txt | awk '{print "shared", "split", $0;}' > $dir/phones/roots.txt
cat $srcdir/silence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt | \
awk '{for(n=1;n<=NF;n++) print $n;}' > $dir/phones/silence.txt
cat $srcdir/nonsilence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt | \
awk '{for(n=1;n<=NF;n++) print $n;}' > $dir/phones/nonsilence.txt
cp $dir/phones/silence.txt $dir/phones/context_indep.txt
cat $srcdir/extra_questions.txt | utils/apply_map.pl $tmpdir/phone_map.txt \
>$dir/phones/extra_questions.txt
# Want extra questions about the word-start/word-end stuff. Make it separate for
# silence and non-silence.. probably doesn't really matter, as silence will rarely
# be inside a word.
for suffix in _B _E _I _S; do
(for x in `cat $srcdir/nonsilence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt
done
for suffix in "" _B _E _I _S; do
(for x in `cat $srcdir/silence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt
done
# (0), this is more data-preparation than data-formatting;
# add disambig symbols to the lexicon in $tmpdir/lexicon.txt
# and produce $tmpdir/lexicon_disambig.txt
......@@ -67,36 +104,20 @@ echo $ndisambig > $tmpdir/lex_ndisambig
# <NOISE> NSN_S
# !EXCLAMATION-POINT EH2_B K_I S_I K_I L_I AH0_I M_I EY1_I SH_I AH0_I N_I P_I OY2_I N_I T_E
( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) >$dir/phones/disambig.txt
# Create phones file with disambiguation symbols.
utils/add_disambig.pl --include-zero $tmpdir/phones_nodisambig.txt \
`cat $tmpdir/lex_ndisambig` > $dir/phones.txt
# Create 3 subsets of the phones: silence, nonsilence, and disambig.
cp $tmpdir/silence_phones.list $dir/phones/silence.txt
cp $dir/phones/silence.txt $dir/phones/context_indep.txt # context-independent phones.
# In general the silence phones and the context-independent phones will be the
# same set (this is specified in the roots.txt file created below).
cp $tmpdir/nonsilence_phones.list $dir/phones/nonsilence.txt
grep -E '^#[0-9]+' $dir/phones.txt | awk '{print $1}' > $dir/phones/disambig.txt
# Create these lists of phones in colon-separated integer list form too,
# for purposes of being given to programs as command-line options.
for f in silence nonsilence disambig context_indep; do
utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt | \
awk '{printf(":%d", $1);} END{printf "\n"}' | sed s/:// > $dir/phones/$f.csl || exit 1;
done
# Create phone symbol table.
echo "<eps>" | cat - $dir/phones/{silence,nonsilence,disambig}.txt | \
awk '{n=NR-1; print $1, n;}' > $dir/phones.txt
# Create a file that describes the word-boundary information for
# each phone. 5 categories.
mkdir -p $dir/phones
grep -v -w '<eps>' $tmpdir/phones_nodisambig.txt | awk '{print $1;}' | \
cat $dir/phones/{silence,nonsilence}.txt | \
awk '/_I$/{print $1, "internal"; next;} /_B$/{print $1, "begin"; next; }
/_S$/{print $1, "singleton"; next;} /_E$/{print $1, "end"; next; }
{print $1, "nonword";} ' > $dir/phones/word_boundary.txt
/_S$/{print $1, "singleton"; next;} /_E$/{print $1, "end"; next; }
{print $1, "nonword";} ' > $dir/phones/word_boundary.txt
# Create word symbol table.
cat $tmpdir/lexicon.txt | awk '{print $1}' | sort | uniq | \
awk 'BEGIN{print "<eps> 0";} {printf("%s %d\n", $1, NR);} END{printf("#0 %d\n", NR+1);} ' \
> $dir/words.txt || exit 1;
......@@ -121,59 +142,17 @@ echo "<SPOKEN_NOISE>" > $dir/oov.txt || exit 1;
cat $dir/oov.txt | utils/sym2int.pl $dir/words.txt >$dir/oov.int # integer version of oov
# symbol, used in some scripts.
# (2)
# Create phonesets_*.txt and extra_questions.txt ...
# phonesets_mono.txt is sets of phones that are shared when building the monophone system
# and when asking questions based on an automatic clustering of phones, for the
# triphone system. extra_questions.txt is some pre-defined extra questions about
# position and stress that split apart the categories we created in phonesets.txt.
# in extra_questions.txt there is also a question about silence phones, since we
# don't include them in our automatically generated clustering of phones.
mkdir -p $dir/phones
cat $dir/phones/silence.txt | awk '{printf("%s ", $1);} END{printf "\n";}' \
> $dir/phones/sets_mono.txt || exit 1;
cat $dir/phones/nonsilence.txt | \
perl -e 'while(<>){ m:([A-Za-z]+)(\d*)(_.)?: || die "Bad line $_";
$phone=$1; $stress=$2; $position=$3;
if($phone eq $curphone){ print " $phone$stress$position"; }
else { if(defined $curphone){ print "\n"; } $curphone=$phone; print "$phone$stress$position"; }} print "\n"; ' \
>> $dir/phones/sets_mono.txt || exit 1;
grep -v -w `head -1 $dir/phones/silence.txt` $dir/phones/sets_mono.txt \
> $dir/phones/sets_cluster.txt || exit 1;
cat $dir/phones/silence.txt | awk '{printf("%s ", $1);} END{printf "\n";}' \
> $dir/phones/extra_questions.txt
cat $dir/phones/nonsilence.txt | perl -e 'while(<>){ m:([A-Za-z]+)(\d*)(_.)?: || die "Bad line $_";
$phone=$1; $stress=$2; $pos=$3;
$full_phone ="$1$2$3";
$pos2list{$pos} = $pos2list{$pos} . $full_phone . " ";
$stress2list{$stress} = $stress2list{$stress} . $full_phone . " ";
}
foreach $k (keys %pos2list) { print "$pos2list{$k}\n"; }
foreach $k (keys %stress2list) { print "$stress2list{$k}\n"; } ' \
>> $dir/phones/extra_questions.txt || exit 1;
( # Creating the "roots file" for building the context-dependent systems...
# we share the roots across all the versions of each real phone. We also
# share the states of the 3 forms of silence. "not-shared" here means the
# states are distinct p.d.f.'s... normally we would automatically split on
# the HMM-state but we're not making silences context dependent.
cat $dir/phones/silence.txt | \
awk 'BEGIN {printf("not-shared not-split ");} {printf("%s ",$1);} END{printf "\n";}';
cat $dir/phones/nonsilence.txt | \
perl -e 'while(<>){ m:([A-Za-z]+)(\d*)(_.)?: || die "Bad line $_";
$phone=$1; $stress=$2; $position=$3;
if($phone eq $curphone){ print " $phone$stress$position"; }
else { if(defined $curphone){ print "\n"; } $curphone=$phone;
print "shared split $phone$stress$position"; }} print "\n"; '
) > $dir/phones/roots.txt || exit 1;
for x in sets_mono sets_cluster extra_questions disambig; do
# Create these lists of phones in colon-separated integer list form too,
# for purposes of being given to programs as command-line options.
for f in silence nonsilence disambig context_indep; do
utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt >$dir/phones/$f.int
utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt | \
awk '{printf(":%d", $1);} END{printf "\n"}' | sed s/:// > $dir/phones/$f.csl || exit 1;
done
for x in sets extra_questions; do
utils/sym2int.pl $dir/phones.txt <$dir/phones/$x.txt > $dir/phones/$x.int || exit 1;
done
......@@ -184,7 +163,6 @@ utils/sym2int.pl -f 1 $dir/phones.txt <$dir/phones/word_boundary.txt \
> $dir/phones/word_boundary.int || exit 1;
silphonelist=`cat $dir/phones/silence.csl | sed 's/:/ /g'`
nonsilphonelist=`cat $dir/phones/nonsilence.csl | sed 's/:/ /g'`
cat conf/topo.proto | sed "s:NONSILENCEPHONES:$nonsilphonelist:" | \
......
......@@ -171,7 +171,6 @@ if ($utt2spk_file ne "") { # We have the --utt2spk option...
for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
$scpfn = $OUTPUTS[$scpidx];
open(F, ">$scpfn") || die "Could not open scp file $scpfn for writing.";
binmode(F, ":utf8");
$count = 0;
if(@{$scparray[$scpidx]} == 0) {
print STDERR "Warning: split_scp.pl producing empty .scp file $scpfn (too many splits and too few speakers?)\n";
......
......@@ -96,5 +96,3 @@ while (<>) {
}
exit(0);
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment