Skip to content
Snippets Groups Projects
Commit 92d97826 authored by Dan Povey's avatar Dan Povey
Browse files

A lot of changes to s5 WSJ recipe. Probably does not work correctly right now.

git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@872 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 6a20d9a5
No related branches found
No related tags found
No related merge requests found
Showing
with 780 additions and 308 deletions
......@@ -8,7 +8,7 @@ cmd=run.pl
[ $1 == "--cmd" ] && cmd=$2 && shift 2;
[ $# -ne 3 ] && \
echo "Usage: utils/score.sh <data-dir> <lang-dir|graph-dir> <decode-dir>" && exit 1;
echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>" && exit 1;
data=$1
lang_or_graph=$2
......@@ -16,21 +16,11 @@ dir=$3
symtab=$lang_or_graph/words.txt
for f in $symtab $dir/lat.1.gz; do
for f in $symtab $dir/lat.1.gz $data/text; do
[ ! -f $f ] && echo "score.sh: no such file $f" && exit 1;
done
if [ ! -f $symtab ]; then
echo No such word symbol table file $symtab
exit 1;
fi
mkdir -p $dir/scoring/log
# The first phase, independent of how we're going to score, is to get
# transcription files (one-bests) in .tra format, from the lattices.
# If we'll be scoring with sclite, then we also need the alignment (.ali)
# files.
cat $data/text | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > $dir/scoring/test_filt.txt
......@@ -38,7 +28,6 @@ $cmd LMWT=9:20 $dir/scoring/log/best_path.LMWT.log \
lattice-best-path --lm-scale=LMWT --word-symbol-table=$symtab \
"ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/scoring/LMWT.tra || exit 1;
# Note: the double level of quoting for the sed command
$cmd LMWT=9:20 $dir/scoring/log/score.LMWT.log \
cat $dir/scoring/LMWT.tra \| \
......
#!/bin/bash
# Script for Minimum Bayes Risk decoding.
# Runs lattice-mbr-decode over a range of inverse acoustic weights (9..20),
# then scores each output against the filtered reference transcript with
# compute-wer.  Usage:
#   local/score_mbr.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>

[ -f ./path.sh ] && . ./path.sh;

cmd=run.pl
[ "$1" == "--cmd" ] && cmd=$2 && shift 2;

[ $# -ne 3 ] && \
  echo "Usage: local/score_mbr.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>" && exit 1;

data=$1
lang_or_graph=$2
dir=$3

symtab=$lang_or_graph/words.txt

# Check that everything we depend on exists before doing any work.
for f in $symtab $dir/lat.1.gz $data/text; do
  [ ! -f $f ] && echo "score_mbr.sh: no such file $f" && exit 1;
done

mkdir -p $dir/scoring/log

# Reference transcript with noise markers stripped; used by compute-wer below.
cat $data/text | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > $dir/scoring/test_filt.txt

# We submit the jobs separately, not as an array, because it's hard
# to get the inverse of the LM scales.
rm -f $dir/.error   # -f: the error-marker file normally does not exist yet.
for inv_acwt in `seq 9 20`; do
  acwt=`perl -e "print (1.0/$inv_acwt);"`
  $cmd $dir/scoring/rescore_mbr.${inv_acwt}.log \
    lattice-mbr-decode --acoustic-scale=$acwt --word-symbol-table=$symtab \
    "ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/scoring/${inv_acwt}.tra \
    || touch $dir/.error &
done
wait;
# If any background decoding job failed, stop rather than scoring
# incomplete output.
[ -f $dir/.error ] && echo "score_mbr.sh: error getting MBR output." && exit 1;

$cmd LMWT=9:20 $dir/scoring/log/score.LMWT.log \
  cat $dir/scoring/LMWT.tra \| \
  utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \
  compute-wer --text --mode=present \
   ark:$dir/scoring/test_filt.txt ark,p:- ">" $dir/wer_LMWT || exit 1;
......@@ -135,6 +135,7 @@ done
#in case we want to limit lm's on most frequent words, copy lm training word frequency list
cp links/13-32.1/wsj1/doc/lng_modl/vocab/wfl_64.lst $lmdir
chmod u+w $lmdir/*.lst # had weird permissions on source.
# The 20K vocab, open-vocabulary language model (i.e. the one with UNK), without
# verbalized pronunciations. This is the most common test setup, I understand.
......
......@@ -36,12 +36,24 @@ svn co https://cmusphinx.svn.sourceforge.net/svnroot/cmusphinx/trunk/cmudict \
# Make phones symbol-table (adding in silence and verbal and non-verbal noises at this point).
# We are adding suffixes _B, _E, _S for beginning, ending, and singleton phones.
(echo SIL; echo SPN; echo NSN) > $dir/silence_phones.list
cat $dir/cmudict/cmudict.0.7a.symbols | perl -ane 's:\r::; print;' \
>$dir/nonsilence_phones.list
# silence phones, one per line.
(echo SIL; echo SPN; echo NSN) > $dir/silence_phones.txt
# nonsilence phones; on each line is a list of phones that correspond
# really to the same base phone.
cat $dir/cmudict/cmudict.0.7a.symbols | perl -ane 's:\r::; print;' | \
perl -e 'while(<>){
chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_";
$phones_of{$1} .= "$_ "; }
foreach $list (values %phones_of) {print $list . "\n"; } ' \
> $dir/nonsilence_phones.txt || exit 1;
# A few extra questions that will be added to those obtained by automatically clustering
# the "real" phones. These ask about stress; there's also one for silence.
cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dir/extra_questions.txt || exit 1;
cat $dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) {
$p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \
>> $dir/extra_questions.txt || exit 1;
grep -v ';;;' $dir/cmudict/cmudict.0.7a | \
perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \
......
This diff is collapsed.
......@@ -2,48 +2,44 @@
# Copyright 2012 Daniel Povey
# Apache 2.0
# Computes training alignments using a model with delta features.
# Computes training alignments using a model with delta or
# LDA+MLLT features.
# If you supply the --use-graphs option, it will use the training
# graphs from the source directory (where the model is). In this
# case the number of jobs must match with the source directory.
# Begin configuration section.
nj=4
cmd=run.pl
oldgraphs=false
config=
for x in `seq 4`; do
[ "$1" == --use-graphs ] && oldgraphs=true && shift;
[ "$1" == "--num-jobs" ] && nj=$2 && shift 2;
[ "$1" == "--cmd" ] && cmd=$2 && shift 2;
[ "$1" == "--config" ] && config=$2 && shift 2;
done
use_graphs=false
# Begin configuration.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10
retry_beam=40
# End configuration options.
[ -f path.sh ] && . ./path.sh # source the path.
. parse_options.sh
if [ $# != 4 ]; then
echo "usage: steps/align_deltas.sh <data-dir> <lang-dir> <src-dir> <align-dir>"
echo "e.g.: steps/align_deltas.sh data/train data/lang exp/tri1 exp/tri1_ali"
echo "usage: steps/align_si.sh <data-dir> <lang-dir> <src-dir> <align-dir>"
echo "e.g.: steps/align_si.sh data/train data/lang exp/tri1 exp/tri1_ali"
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --use-graphs true # use graphs in src-dir"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
exit 1;
fi
[ -f path.sh ] && . ./path.sh
data=$1
lang=$2
srcdir=$3
dir=$4
# Begin configuration.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10
retry_beam=40
#End configuration.
[ ! -z $config ] && . $config
oov_sym=`cat $lang/oov.txt`
mkdir -p $dir/log
echo $nj > $dir/num_jobs
sdata=$data/split$nj
......@@ -51,17 +47,28 @@ sdata=$data/split$nj
cp $srcdir/{tree,final.mdl,final.occs} $dir || exit 1;
feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |"
echo "align_deltas.sh: aligning data in $data using model from $srcdir, putting alignments in $dir"
if [ -z $feat_type ]; then
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "align_si.sh: feature type is $feat_type"
fi
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |"
cp $srcdir/final.mat $dir
;;
*) echo "Invalid feature type $feat_type" && exit 1;
esac
echo "align_si.sh: aligning data in $data using model from $srcdir, putting alignments in $dir"
if $oldgraphs; then
if $use_graphs; then
[ $nj != "`cat $srcdir/num_jobs`" ] && echo "Mismatch in num-jobs" && exit 1;
[ ! -f $srcdir/1.fsts.gz ] && echo "no such file $srcdir/1.fsts.gz" && exit 1;
$cmd JOB=1:$nj $dir/log/align.JOB.log \
gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam $dir/final.mdl \
"ark:gunzip -c $srcdir/$n.fsts.gz|" "$feats" "ark:|gzip -c >$dir/$n.ali.gz" || exit 1;
"ark:gunzip -c $srcdir/JOB.fsts.gz|" "$feats" "ark:|gzip -c >$dir/JOB.ali.gz" || exit 1;
else
tra="ark:utils/sym2int.pl --map-oov \"$oov_sym\" -f 2- $lang/words.txt $sdata/JOB/text|";
# We could just use gmm-align in the next line, but it's less efficient as it compiles the
......
......@@ -9,7 +9,6 @@
# We do this in just one job; it's fast.
# This script takes no options.
if [ $# != 3 ]; then
echo "usage: compute_cmvn_stats.sh [options] <data-dir> <log-dir> <path-to-cmvn-dir>";
exit 1;
......
......@@ -3,23 +3,31 @@
# Copyright 2012 Daniel Povey
# Apache 2.0
[ -f ./path.sh ] && . ./path.sh; # source the path.
# Begin configuration section.
nj=4
cmd=run.pl
config=
maxactive=7000
beam=13.0
latbeam=6.0
acwt=0.083333 # note: only really affects pruning (scoring is on lattices).
# End configuration section.
for x in `seq 3`; do
[ $1 == "--num-jobs" ] && nj=$2 && shift 2;
[ $1 == "--cmd" ] && cmd=$2 && shift 2;
[ $1 == "--config" ] && config=$2 && shift 2;
done
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
echo "Usage: steps/decode_deltas.sh <graph-dir> <data-dir> <decode-dir>"
echo "Usage: steps/decode_si.sh [options] <graph-dir> <data-dir> <decode-dir>"
echo "... where <decode-dir> is assumed to be a sub-directory of the directory"
echo " where the model is."
echo "e.g.: steps/decode_deltas.sh exp/mono/graph_tgpr data/test_dev93 exp/mono/decode_dev93_tgpr"
echo "e.g.: steps/decode_si.sh exp/mono/graph_tgpr data/test_dev93 exp/mono/decode_dev93_tgpr"
echo ""
echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out"
echo "what type of features you used (assuming it's one of these two)"
echo ""
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
exit 1;
fi
......@@ -34,20 +42,18 @@ mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs
# Begin configuration.
maxactive=7000
beam=13.0
latbeam=6.0
acwt=0.083333
# End configuration.
[ ! -z $config ] && . $config # Override any of the above, if --config specified.
for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $srcdir/final.mdl $graphdir/HCLG.fst; do
[ ! -f $f ] && echo "decode_deltas.sh: no such file $f" && exit 1;
[ ! -f $f ] && echo "decode_si.sh: no such file $f" && exit 1;
done
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "decode_si.sh: feature type is $feat_type";
feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |"
case $feat_type in
delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
*) echo "Invalid feature type $feat_type" && exit 1;
esac
$cmd JOB=1:$nj $dir/log/decode.JOB.log \
gmm-latgen-faster --max-active=$maxactive --beam=$beam --lattice-beam=$latbeam \
......
#!/bin/bash
# Copyright 2012 Daniel Povey
# Apache 2.0

# Decode with a "big" LM: the HCLG graph was built with a small LM; during
# lattice generation we subtract the old-LM scores and add new-LM scores on
# the fly (gmm-latgen-biglm-faster).  Works on CMN + (delta+delta-delta |
# LDA+MLLT) features; the feature type is worked out from the model dir.

# Begin configuration.
nj=4
cmd=run.pl
maxactive=7000
beam=13.0
latbeam=6.0
acwt=0.083333   # note: only really affects pruning (scoring is on lattices).
# End configuration.

[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

if [ $# != 5 ]; then
   echo "Usage: steps/decode_si_biglm.sh [options] <graph-dir> <old-LM-fst> <new-LM-fst> <data-dir> <decode-dir>"
   echo "... where <decode-dir> is assumed to be a sub-directory of the directory"
   echo " where the model is."
   echo "e.g.: steps/decode_si_biglm.sh exp/tri2b/graph_tgpr data/lang_test_tgpr/G.fst data/lang_test_tg/G.fst data/test_dev93 exp/tri2b/decode_dev93_tgpr_tg"
   echo ""
   echo "This script works on CMN + (delta+delta-delta | LDA+MLLT) features; it works out"
   echo "what type of features you used (assuming it's one of these two)"
   echo ""
   echo "main options (for others, see top of script file)"
   echo " --config <config-file> # config containing options"
   echo " --nj <nj> # number of parallel jobs"
   echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   exit 1;
fi

graphdir=$1
oldlm_fst=$2
newlm_fst=$3
data=$4
dir=$5
srcdir=`dirname $dir`; # The model directory is one level up from decoding directory.
sdata=$data/split$nj;

mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
echo $nj > $dir/num_jobs

for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $srcdir/final.mdl $graphdir/HCLG.fst $oldlm_fst $newlm_fst; do
  [ ! -f $f ] && echo "decode_si_biglm.sh: no such file $f" && exit 1;
done

# The presence of final.mat tells us the model was trained on LDA+MLLT features.
if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "decode_si_biglm.sh: feature type is $feat_type"

case $feat_type in
  delta) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |";;
  lda) feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";;
  *) echo "Invalid feature type $feat_type" && exit 1;
esac

# Sanity check: both LMs should have been built with the same words.txt as
# the decoding graph.  (Bug fix: the second check previously compared the
# *old* LM's words.txt again instead of the new LM's.)
[ -f `dirname $oldlm_fst`/words.txt ] && ! cmp `dirname $oldlm_fst`/words.txt $graphdir/words.txt && \
  echo "Warning: old LM words.txt does not match with that in $graphdir .. probably will not work.";
[ -f `dirname $newlm_fst`/words.txt ] && ! cmp `dirname $newlm_fst`/words.txt $graphdir/words.txt && \
  echo "Warning: new LM words.txt does not match with that in $graphdir .. probably will not work.";

# Project the LM FSTs onto their output (word) labels and arc-sort on input
# labels, as required by gmm-latgen-biglm-faster.
oldlm_cmd="fstproject --project_output=true $oldlm_fst | fstarcsort --sort_type=ilabel |"
newlm_cmd="fstproject --project_output=true $newlm_fst | fstarcsort --sort_type=ilabel |"

$cmd JOB=1:$nj $dir/log/decode.JOB.log \
  gmm-latgen-biglm-faster --max-active=$maxactive --beam=$beam --lattice-beam=$latbeam \
  --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
  $srcdir/final.mdl $graphdir/HCLG.fst "$oldlm_cmd" "$newlm_cmd" "$feats" \
  "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;

[ ! -x local/score.sh ] && \
  echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
local/score.sh --cmd "$cmd" $data $graphdir $dir

exit 0;
#!/bin/bash
mode=4
cmd=scripts/run.pl
cmd=run.pl
for x in `seq 2`; do
[ "$1" == "--cmd" ] && cmd=$2 && shift 2;
......@@ -10,7 +10,8 @@ done
if [ $# != 5 ]; then
echo "Do language model rescoring of lattices (remove old LM, add new LM)"
echo "Usage: scripts/lmrescore.sh <old-lang-dir> <new-lang-dir> <data-dir> <input-decode-dir> <output-decode-dir>"
echo "Usage: steps/lmrescore.sh [options] <old-lang-dir> <new-lang-dir> <data-dir> <input-decode-dir> <output-decode-dir>"
echo "options: [--cmd (run.pl|queue.pl [queue opts])] [--mode (1|2|3|4)]"
exit 1;
fi
......
......@@ -6,36 +6,26 @@
# see ../run.sh for example
nj=4
cmd=utils/run.pl
config=conf/mfcc.conf
cmd=run.pl
mfcc_config=conf/mfcc.conf
for x in 1 2; do
if [ $1 == "--num-jobs" ]; then
nj=$2
shift 2
fi
if [ $1 == "--cmd" ]; then
cmd=$2
shift 2
fi
if [ $1 == "--config" ]; then
config=$2
shift 2
fi
done
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
echo "usage: make_mfcc.sh [options] <data-dir> <log-dir> <path-to-mfccdir>";
echo "options: [--config <config-file>] [--num-jobs <num-jobs>] [--cmd utils/run.pl|utils/queue.pl]"
echo "options: "
echo " --mfcc-config <config-file> # config passed to compute-mfcc-feats "
echo " --nj <nj> # number of parallel jobs"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
exit 1;
fi
if [ -f path.sh ]; then . ./path.sh; fi
data=$1
logdir=$2
mfccdir=$3
# make $mfccdir an absolute pathname.
mfccdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $mfccdir ${PWD}`
......@@ -47,7 +37,7 @@ mkdir -p $logdir || exit 1;
scp=$data/wav.scp
required="$scp $config"
required="$scp $mfcc_config"
for f in $required; do
if [ ! -f $f ]; then
......@@ -73,7 +63,7 @@ if [ -f $data/segments ]; then
$cmd JOB=1:$nj $logdir/make_mfcc.JOB.log \
extract-segments scp:$scp $logdir/segments.JOB ark:- \| \
compute-mfcc-feats --verbose=2 --config=$config ark:- \
compute-mfcc-feats --verbose=2 --config=$mfcc_config ark:- \
ark,scp:$mfccdir/raw_mfcc_$name.JOB.ark,$mfccdir/raw_mfcc_$name.JOB.scp \
|| exit 1;
......@@ -87,7 +77,7 @@ else
utils/split_scp.pl $scp $split_scps || exit 1;
$cmd JOB=1:$nj $logdir/make_mfcc.JOB.log \
compute-mfcc-feats --verbose=2 --config=$config scp:$logdir/wav.JOB.scp \
compute-mfcc-feats --verbose=2 --config=$mfcc_config scp:$logdir/wav.JOB.scp \
ark,scp:$mfccdir/raw_mfcc_$name.JOB.ark,$mfccdir/raw_mfcc_$name.JOB.scp \
|| exit 1;
......
......@@ -3,25 +3,30 @@
# Copyright 2012 Daniel Povey
# Apache 2.0
# Begin configuration.
stage=-4 # This allows restarting after partway, when something when wrong.
nj=4
cmd=run.pl
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
realign_iters="10 20 30";
numiters=35 # Number of iterations of training
maxiterinc=25 # Last iter to increase #Gauss on.
beam=10
retry_beam=40
# End configuration.
for x in `seq 4`; do
[ "$1" == "--num-jobs" ] && nj=$2 && shift 2;
[ "$1" == "--cmd" ] && cmd=$2 && shift 2;
[ "$1" == "--config" ] && config=$2 && shift 2;
[ "$1" == "--stage" ] && stage=$2 && shift 2;
done
[ -f path.sh ] && . ./path.sh;
. parse_options.sh || exit 1;
if [ $# != 6 ]; then
echo "Usage: steps/train_deltas.sh <num-leaves> <tot-gauss> <data-dir> <lang-dir> <alignment-dir> <exp-dir>"
echo "e.g.: steps/train_deltas.sh 2000 10000 data/train_si84_half data/lang exp/mono_ali exp/tri1"
echo "main options (for others, see top of script file)"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --config <config-file> # config containing options"
echo " --stage <stage> # stage to do partial re-run from."
exit 1;
fi
[ -f path.sh ] && . ./path.sh;
numleaves=$1
totgauss=$2
data=$3
......@@ -29,28 +34,20 @@ lang=$4
alidir=$5
dir=$6
# Begin configuration.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
realign_iters="10 20 30";
numiters=35 # Number of iterations of training
maxiterinc=25 # Last iter to increase #Gauss on.
beam=10
retry_beam=40
# End configuration.
[ ! -z $config ] && . $config # Override any of the above, if --config specified.
[ ! -f $alidir/final.mdl ] && echo "Error: no such file $alidir/final.mdl" && exit 1;
for f in $alidir/final.mdl $alidir/1.ali.gz $data/feats.scp $lang/phones.txt; do
[ ! -f $f ] && echo "train_deltas.sh: no such file $f" && exit 1;
done
numgauss=$numleaves
incgauss=$[($totgauss-$numgauss)/$maxiterinc] # per-iter increment for #Gauss
oov=`cat $lang/oov.int` || exit 1;
ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;
nj=`cat $alidir/num_jobs` || exit 1;
mkdir -p $dir/log
echo $nj > $dir/num_jobs
sdata=$data/split$nj;
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
[ "$nj" -ne "`cat $alidir/num_jobs`" ] && echo "Number of jobs does not match $alidir" && exit 1;
feats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas ark:- ark:- |"
......@@ -68,7 +65,7 @@ fi
if [ $stage -le -2 ]; then
echo "Getting questions for tree-building, via clustering"
# preparing questions, roots file...
cluster-phones $dir/treeacc $lang/phones/sets_cluster.int $dir/questions.int 2> $dir/log/questions.log || exit 1;
cluster-phones $dir/treeacc $lang/phones/sets.int $dir/questions.int 2> $dir/log/questions.log || exit 1;
cat $lang/phones/extra_questions.int >> $dir/questions.int
compile-questions $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;
......@@ -80,9 +77,9 @@ if [ $stage -le -2 ]; then
gmm-init-model --write-occs=$dir/1.occs \
$dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1;
grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning.";
# could mix up if we wanted:
# gmm-mixup --mix-up=$numgauss $dir/1.mdl $dir/1.occs $dir/1.mdl 2>$dir/log/mixup.log || exit 1;
gmm-mixup --mix-up=$numgauss $dir/1.mdl $dir/1.occs $dir/1.mdl 2>$dir/log/mixup.log || exit 1;
rm $dir/treeacc
fi
......@@ -122,9 +119,7 @@ while [ $x -lt $numiters ]; do
rm $dir/$x.mdl $dir/$x.*.acc
rm $dir/$x.occs
fi
if [ $x -le $maxiterinc ]; then
numgauss=$[$numgauss+$incgauss];
fi
[ $x -le $maxiterinc ] && numgauss=$[$numgauss+$incgauss];
x=$[$x+1];
done
......@@ -134,7 +129,7 @@ ln -s $x.occs $dir/final.occs
# Summarize warning messages...
for x in $dir/log/*.log; do
[ `grep WARNING $x | wc -l` -ne 0 ] && echo $n warnings in $x;
n=`grep WARNING $x | wc -l`; [ $n -ne 0 ] && echo $n warnings in $x;
done
echo Done training system with delta features.
echo Done training system with delta+delta-delta features in $dir
#!/bin/bash
# Copyright 2012 Daniel Povey
# Apache 2.0.
#
# LDA+MLLT triphone training.  Trains a context-dependent GMM system on
# spliced, LDA-transformed features, re-estimating a global MLLT (STC)
# transform on the iterations listed in $mllt_iters.  Negative stages
# (-4..0) allow restarting partway:
#   -4: accumulate/estimate LDA;  -3: tree stats;  -2: build tree;
#   -1: convert alignments to the new tree;  0: compile training graphs.
# Begin configuration.
cmd=run.pl
config=
stage=-4
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
realign_iters="10 20 30";
mllt_iters="2 4 6 12";
numiters=35 # Number of iterations of training
maxiterinc=25 # Last iter to increase #Gauss on.
dim=40
beam=10
retry_beam=40
# End configuration.
[ -f path.sh ] && . ./path.sh
. parse_options.sh || exit 1;
if [ $# != 6 ]; then
echo "Usage: steps/train_lda_mllt.sh [options] <#leaves> <#gauss> <data> <lang> <alignments> <dir>"
echo " e.g.: steps/train_lda_mllt.sh 2500 15000 data/train_si84 data/lang exp/tri1_ali_si84 exp/tri2b"
echo "main options (for others, see top of script file)"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --config <config-file> # config containing options"
echo " --stage <stage> # stage to do partial re-run from."
exit 1;
fi
numleaves=$1
totgauss=$2
data=$3
lang=$4
alidir=$5
dir=$6
# Check that all required inputs exist before doing any work.
for f in $alidir/final.mdl $alidir/1.ali.gz $data/feats.scp $lang/phones.txt; do
[ ! -f $f ] && echo "train_lda_mllt.sh: no such file $f" && exit 1;
done
numgauss=$numleaves
incgauss=$[($totgauss-$numgauss)/$maxiterinc] # per-iter increment for #Gauss
randprune=4.0 # This is approximately the ratio by which we will speed up the
# LDA and MLLT calculations via randomized pruning.
oov=`cat $lang/oov.int` || exit 1;
# The number of parallel jobs is inherited from the alignment directory.
nj=`cat $alidir/num_jobs` || exit 1;
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;
mkdir -p $dir/log
echo $nj >$dir/num_jobs
sdata=$data/split$nj;
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
# Spliced (context-window) CMVN features; the LDA/MLLT transform is applied on top.
splicedfeats="ark,s,cs:apply-cmvn --norm-vars=false --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats ark:- ark:- |"
# Note: $feats gets overwritten later in the script.
feats="$splicedfeats transform-feats $dir/0.mat ark:- ark:- |"
# Stage -4: accumulate LDA statistics (silence frames down-weighted to 0)
# and estimate the initial LDA matrix 0.mat.
if [ $stage -le -4 ]; then
echo "Accumulating LDA statistics."
$cmd JOB=1:$nj $dir/log/lda_acc.JOB.log \
ali-to-post "ark:gunzip -c $alidir/JOB.ali.gz|" ark:- \| \
weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark:- \| \
acc-lda --rand-prune=$randprune $alidir/final.mdl "$splicedfeats" ark,s,cs:- \
$dir/lda.JOB.acc || exit 1;
est-lda --dim=$dim $dir/0.mat $dir/lda.*.acc 2>$dir/log/lda_est.log || exit 1;
rm $dir/lda.*.acc
fi
# Index of the iteration whose .mat holds the current composed LDA(+MLLT)
# transform; updated each time MLLT is re-estimated below.
cur_lda_iter=0
# Stage -3: accumulate phonetic-context tree statistics on LDA features.
if [ $stage -le -3 ]; then
echo "Accumulating tree stats"
$cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
acc-tree-stats --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
"ark:gunzip -c $alidir/JOB.ali.gz|" $dir/JOB.treeacc || exit 1;
$cmd $dir/log/sum_tree_acc.log \
sum-tree-stats $dir/treeacc $dir/*.treeacc || exit 1;
rm $dir/*.treeacc
fi
# Stage -2: cluster phones into questions, build the decision tree, and
# initialize the model from the tree statistics.
if [ $stage -le -2 ]; then
echo "Computing questions for tree clustering"
# preparing questions, roots file...
cluster-phones $dir/treeacc $lang/phones/sets.int $dir/questions.int 2> $dir/log/questions.log || exit 1;
cat $lang/phones/extra_questions.int >> $dir/questions.int
compile-questions $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;
echo "Building the tree"
$cmd $dir/log/build_tree.log \
build-tree --verbose=1 --max-leaves=$numleaves \
$dir/treeacc $lang/phones/roots.int \
$dir/questions.qst $lang/topo $dir/tree || exit 1;
gmm-init-model --write-occs=$dir/1.occs \
$dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1;
grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning.";
# could mix up if we wanted:
# gmm-mixup --mix-up=$numgauss $dir/1.mdl $dir/1.occs $dir/1.mdl 2>$dir/log/mixup.log || exit 1;
rm $dir/treeacc
fi
# Stage -1: map the source-directory alignments onto the newly built tree.
if [ $stage -le -1 ]; then
# Convert the alignments.
echo "Converting alignments from $alidir to use current tree"
$cmd JOB=1:$nj $dir/log/convert.JOB.log \
convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \
"ark:gunzip -c $alidir/JOB.ali.gz|" "ark:|gzip -c >$dir/JOB.ali.gz" || exit 1;
fi
# Stage 0: compile per-utterance training graphs from the transcripts.
if [ $stage -le 0 ]; then
echo "Compiling graphs of transcripts"
$cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
compile-train-graphs $dir/tree $dir/1.mdl $lang/L.fst \
"ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $data/split$nj/JOB/text |" \
"ark:|gzip -c >$dir/JOB.fsts.gz" || exit 1;
fi
# Main EM training loop: realign on $realign_iters, re-estimate MLLT on
# $mllt_iters, and re-estimate the GMMs on every iteration.
x=1
while [ $x -lt $numiters ]; do
echo Training pass $x
if echo $realign_iters | grep -w $x >/dev/null && [ $stage -le $x ]; then
echo Aligning data
$cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam $dir/$x.mdl \
"ark:gunzip -c $dir/JOB.fsts.gz|" "$feats" \
"ark:|gzip -c >$dir/JOB.ali.gz" || exit 1;
fi
if echo $mllt_iters | grep -w $x >/dev/null; then
if [ $stage -le $x ]; then
echo "Estimating MLLT"
$cmd JOB=1:$nj $dir/log/macc.$x.JOB.log \
ali-to-post "ark:gunzip -c $dir/JOB.ali.gz|" ark:- \| \
weight-silence-post 0.0 $silphonelist $dir/$x.mdl ark:- ark:- \| \
gmm-acc-mllt --rand-prune=$randprune $dir/$x.mdl "$feats" ark:- $dir/$x.JOB.macc \
|| exit 1;
est-mllt $dir/$x.mat.new $dir/$x.*.macc 2> $dir/log/mupdate.$x.log || exit 1;
gmm-transform-means $dir/$x.mat.new $dir/$x.mdl $dir/$x.mdl \
2> $dir/log/transform_means.$x.log || exit 1;
# Compose the new MLLT with the previous composed transform, producing
# $x.mat, which becomes the current feature transform below.
compose-transforms --print-args=false $dir/$x.mat.new $dir/$cur_lda_iter.mat $dir/$x.mat || exit 1;
rm $dir/$x.*.macc
fi
feats="$splicedfeats transform-feats $dir/$x.mat ark:- ark:- |"
cur_lda_iter=$x
fi
if [ $stage -le $x ]; then
$cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
gmm-acc-stats-ali $dir/$x.mdl "$feats" \
"ark,s,cs:gunzip -c $dir/JOB.ali.gz|" $dir/$x.JOB.acc || exit 1;
$cmd $dir/log/update.$x.log \
gmm-est --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss $dir/$x.mdl \
"gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1;
rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs
fi
[ $x -le $maxiterinc ] && numgauss=$[$numgauss+$incgauss];
x=$[$x+1];
done
# Publish the final model, occupancies, and feature transform as symlinks.
rm $dir/final.{mdl,mat,occs} 2>/dev/null
ln -s $x.mdl $dir/final.mdl
ln -s $x.occs $dir/final.occs
ln -s $cur_lda_iter.mat $dir/final.mat
# Summarize warning messages...
for x in $dir/log/*.log; do
n=`grep WARNING $x | wc -l`; [ $n -ne 0 ] && echo $n warnings in $x;
done
echo Done training system with LDA+MLLT features in $dir
......@@ -7,20 +7,27 @@
# Flat start and monophone training, with delta-delta features.
# This script applies cepstral mean normalization (per speaker).
# Begin configuration section.
nj=4
cmd=run.pl
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
numiters=40 # Number of iterations of training
maxiterinc=30 # Last iter to increase #Gauss on.
totgauss=1000 # Target #Gaussians.
realign_iters="1 2 3 4 5 6 7 8 9 10 12 14 16 18 20 23 26 29 32 35 38";
config= # name of config file.
# End configuration section.
for x in `seq 3`; do
[ "$1" == "--num-jobs" ] && nj=$2 && shift 2;
[ "$1" == "--cmd" ] && cmd=$2 && shift 2;
[ "$1" == "--config" ] && config=$2 && shift 2;
done
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
if [ $# != 3 ]; then
echo "Usage: steps/train_mono.sh [options] <data-dir> <lang-dir> <exp-dir>"
echo " e.g.: steps/train_mono.sh data/train.1k data/lang exp/mono"
echo "options: [--cmd (run.pl|queue.pl [opts])] [--num-jobs <nj>] [--config <cfg-file>]"
echo "main options (for others, see top of script file)"
echo " --config <config-file> # config containing options"
echo " --nj <nj> # number of parallel jobs"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
exit 1;
fi
......@@ -28,15 +35,6 @@ data=$1
lang=$2
dir=$3
if [ -f path.sh ]; then . ./path.sh; fi
# Begin configuration.
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
numiters=40 # Number of iterations of training
maxiterinc=30 # Last iter to increase #Gauss on.
totgauss=1000 # Target #Gaussians.
realign_iters="1 2 3 4 5 6 7 8 9 10 12 14 16 18 20 23 26 29 32 35 38";
# End configuration.
[ ! -z $config ] && . $config # Override any of the above, if --config specified.
oov_sym=`cat $lang/oov.int` || exit 1;
......@@ -52,8 +50,8 @@ example_feats="`echo '$feats' | sed s/JOB/1/g`";
echo "Initializing monophone system."
if [ -f $lang/phones/sets_mono.int ]; then
shared_phones_opt="--shared-phones=$lang/phones/sets_mono.int"
if [ -f $lang/phones/sets.int ]; then
shared_phones_opt="--shared-phones=$lang/phones/sets.int"
fi
# Note: JOB=1 just uses the 1st part of the features-- we only need a subset anyway.
......
#!/usr/bin/perl -w
# Copyright 2012 Daniel Povey
# Apache 2.0.

# This program is a bit like ./sym2int.pl in that it applies a map
# to things in a file, but it's a bit more general in that it doesn't
# assume the things being mapped to are single tokens: they could
# be sequences of tokens.
#
# It takes exactly one argument, the map file (which may be "-" for the
# standard input).  Each line of the map file has one or more fields and
# is interpreted as a map from the first field (a string) to the list of
# remaining fields.  E.g. if the map file contains the lines
#   x P
#   y Q R
# then the input line "x y" (read from stdin) is output as "P Q R".
#
# Note: if a token in the input does not appear as the first field of
# any map line, this program dies; it does not pass the token through
# or silently drop the line.

if (@ARGV != 1) {
  print STDERR "Usage: apply_map.pl map <input >output\n" .
    "e.g.: echo A B | apply_map.pl a.txt\n" .
    "where a.txt is:\n" .
    "A a1 a2\n" .
    "B b\n" .
    "will produce:\n" .
    "a1 a2 b\n";
  exit 1;  # bug fix: previously fell through after printing usage.
}

($map) = @ARGV;
open(M, "<$map") || die "Opening map file $map";
while (<M>) {
  @A = split(" ", $_);
  @A >= 1 || die "apply_map.pl: empty line.";
  $i = shift @A;        # key: the token to be replaced.
  $o = join(" ", @A);   # value: the (possibly empty) replacement sequence.
  $map{$i} = $o;
}

while (<STDIN>) {
  @A = split(" ", $_);
  for ($x = 0; $x < @A; $x++) {
    $a = $A[$x];
    # Die on unknown tokens (message previously named compose_maps.pl).
    if (!defined $map{$a}) { die "apply_map.pl: undefined key $a\n"; }
    $A[$x] = $map{$a};
  }
  print join(" ", @A) . "\n";
}
#!/bin/bash
# Copyright Daniel Povey, 2012.  Apache 2.0.
# Parse command-line options-- to be sourced by another script
# (as in ". parse_options.sh").  Option format is:
#   --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The caller must have assigned a default to each option variable;
# an option whose variable is undefined is reported as invalid.

# The following assignment allows the --config variable to be specified
# in all cases, even if the caller did not declare it.
[ -z "$config" ] && config=

while true; do
  case "$1" in
    # If the first command-line argument begins with "--" (e.g. --foo-bar),
    # work out the variable name as $name, which will equal "foo_bar".
    --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
      # Next we test whether the variable in question is undefined-- if so it's
      # an invalid option and we die.  Note: $0 evaluates to the name of the
      # enclosing script.
      # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
      # is undefined.  We then have to wrap this test inside "eval" because
      # foo_bar is itself inside a variable ($name).
      eval '[ -z "${'$name'+xxx}" ] && echo "$0: invalid option $1" && exit 1;'
      # Set the variable to the right value-- the escaped quotes make it work
      # if the option had spaces, like --cmd "queue.pl -sync y".
      eval $name=\"$2\"; shift 2;;
    *) break;
  esac
done

# Override any of the options, if --config was specified.
# Bug fix: quote $config so a filename containing spaces (or an empty
# value) is handled correctly by the test and the "." (source) command.
[ -n "$config" ] && . "$config"

true; # so this script returns code zero.
......@@ -6,7 +6,19 @@
# given a source directory containing a dictionary lexicon.txt in a form like:
# word phone1 phone2 ... phonen
# per line (alternate prons would be separate lines).
# and also files silence_phones.list and nonsilence_phones.list
# and also files silence_phones.txt and nonsilence_phones.txt, and extra_questions.txt
# Here, silence_phones.txt and nonsilence_phones.txt are lists of silence and
# non-silence phones respectively (where silence includes various kinds of noise,
# laugh, cough, filled pauses etc., and nonsilence phones includes the "real" phones.)
# on each line of those files is a list of phones, and the phones on each line are
# assumed to correspond to the same "base phone", i.e. they will be different stress
# or tone variations of the same basic phone.
# extra_questions.txt might be empty; typically will consist of lists of phones, all
# members of each list with the same stress or tone or something; and also a list for
# the silence phones. This will augment the automatically generated questions (note:
# the automatically generated ones will treat all the stress/tone versions of a phone
# the same, so will not "get to ask" about stress or tone).
# This script adds word-position-dependent phones and constructs a host of other
# derived files, that go in data/lang/.
......@@ -17,8 +29,8 @@ if [ $# -ne 3 ]; then
fi
tmpdir=$2
srcdir=$1
tmpdir=$2
dir=$3
mkdir -p $dir $tmpdir $dir/phones
......@@ -31,28 +43,53 @@ perl -ane '@A=split(" ",$_); $w = shift @A; @A>0||die;
for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \
<$srcdir/lexicon.txt >$tmpdir/lexicon.txt || exit 1;
for f in $srcdir/{,non}silence_phones.list; do
for f in $srcdir/{,non}silence_phones.txt $srcdir/extra_questions.txt; do
[ ! -f $f ] && echo "No such file $f" && exit 1;
done
echo "<eps>" | \
cat - <(for x in `cat $srcdir/silence_phones.list`; do for y in "" "_B" "_E" "_I" "_S"; do echo "$x$y"; done; done) | \
cat - <(for x in `cat $srcdir/nonsilence_phones.list`; do for y in "_B" "_E" "_I" "_S"; do echo "$x$y"; done; done) | \
awk '{ n=NR-1; print $1, n; }' > $tmpdir/phones_nodisambig.txt
# Now create lists of silence phones and nonsilence phones; and word-begin and word-end
# information.
rm $tmpdir/{,non}silence_phones.list 2>/dev/null
for x in `grep -v -w '<eps>' $tmpdir/phones_nodisambig.txt | awk '{print $1}'`; do
basephone=`echo $x | sed s/_[BEIS]$//`;
if grep -w $basephone <$srcdir/silence_phones.list >/dev/null; then # was silence
echo $x >>$tmpdir/silence_phones.list
else
echo $x >>$tmpdir/nonsilence_phones.list
fi
# create $tmpdir/phone_map.txt
# this has the format (on each line)
# <original phone> <version 1 of original phone> <version 2 of original phone> ...
# where the different versions depend on word position. For instance, we'd have
# AA AA_B AA_E AA_I AA_S
# and in the case of silence
# SIL SIL SIL_B SIL_E SIL_I SIL_S
# [because SIL on its own is one of the variants; this is for when it doesn't
# occur inside a word but as an option in the lexicon.]
# This phone map expands the phone lists into all the word-position-dependent
# versions of the phone lists.
cat <(for x in `cat $srcdir/silence_phones.txt`; do for y in "" "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \
<(for x in `cat $srcdir/nonsilence_phones.txt`; do for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \
> $tmpdir/phone_map.txt
mkdir -p $dir/phones # various sets of phones...
# Sets of phones for use in clustering, and making monophone systems.
cat $srcdir/{,non}silence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt > $dir/phones/sets.txt
cat $dir/phones/sets.txt | awk '{print "shared", "split", $0;}' > $dir/phones/roots.txt
cat $srcdir/silence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt | \
awk '{for(n=1;n<=NF;n++) print $n;}' > $dir/phones/silence.txt
cat $srcdir/nonsilence_phones.txt | utils/apply_map.pl $tmpdir/phone_map.txt | \
awk '{for(n=1;n<=NF;n++) print $n;}' > $dir/phones/nonsilence.txt
cp $dir/phones/silence.txt $dir/phones/context_indep.txt
cat $srcdir/extra_questions.txt | utils/apply_map.pl $tmpdir/phone_map.txt \
>$dir/phones/extra_questions.txt
# Want extra questions about the word-start/word-end stuff. Make it separate for
# silence and non-silence.. probably doesn't really matter, as silence will rarely
# be inside a word.
for suffix in _B _E _I _S; do
(for x in `cat $srcdir/nonsilence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt
done
for suffix in "" _B _E _I _S; do
(for x in `cat $srcdir/silence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt
done
# (0), this is more data-preparation than data-formatting;
# add disambig symbols to the lexicon in $tmpdir/lexicon.txt
# and produce $tmpdir/lexicon_disambig.txt
......@@ -67,36 +104,20 @@ echo $ndisambig > $tmpdir/lex_ndisambig
# <NOISE> NSN_S
# !EXCLAMATION-POINT EH2_B K_I S_I K_I L_I AH0_I M_I EY1_I SH_I AH0_I N_I P_I OY2_I N_I T_E
( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) >$dir/phones/disambig.txt
# Create phones file with disambiguation symbols.
utils/add_disambig.pl --include-zero $tmpdir/phones_nodisambig.txt \
`cat $tmpdir/lex_ndisambig` > $dir/phones.txt
# Create 3 subsets of the phones: silence, nonsilence, and disambig.
cp $tmpdir/silence_phones.list $dir/phones/silence.txt
cp $dir/phones/silence.txt $dir/phones/context_indep.txt # context-independent phones.
# In general the silence phones and the context-independent phones will be the
# same set (this is specified in the roots.txt file created below).
cp $tmpdir/nonsilence_phones.list $dir/phones/nonsilence.txt
grep -E '^#[0-9]+' $dir/phones.txt | awk '{print $1}' > $dir/phones/disambig.txt
# Create these lists of phones in colon-separated integer list form too,
# for purposes of being given to programs as command-line options.
for f in silence nonsilence disambig context_indep; do
utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt | \
awk '{printf(":%d", $1);} END{printf "\n"}' | sed s/:// > $dir/phones/$f.csl || exit 1;
done
# Create phone symbol table.
echo "<eps>" | cat - $dir/phones/{silence,nonsilence,disambig}.txt | \
awk '{n=NR-1; print $1, n;}' > $dir/phones.txt
# Create a file that describes the word-boundary information for
# each phone. 5 categories.
mkdir -p $dir/phones
grep -v -w '<eps>' $tmpdir/phones_nodisambig.txt | awk '{print $1;}' | \
cat $dir/phones/{silence,nonsilence}.txt | \
awk '/_I$/{print $1, "internal"; next;} /_B$/{print $1, "begin"; next; }
/_S$/{print $1, "singleton"; next;} /_E$/{print $1, "end"; next; }
{print $1, "nonword";} ' > $dir/phones/word_boundary.txt
/_S$/{print $1, "singleton"; next;} /_E$/{print $1, "end"; next; }
{print $1, "nonword";} ' > $dir/phones/word_boundary.txt
# Create word symbol table.
cat $tmpdir/lexicon.txt | awk '{print $1}' | sort | uniq | \
awk 'BEGIN{print "<eps> 0";} {printf("%s %d\n", $1, NR);} END{printf("#0 %d\n", NR+1);} ' \
> $dir/words.txt || exit 1;
......@@ -121,59 +142,17 @@ echo "<SPOKEN_NOISE>" > $dir/oov.txt || exit 1;
cat $dir/oov.txt | utils/sym2int.pl $dir/words.txt >$dir/oov.int # integer version of oov
# symbol, used in some scripts.
# (2)
# Create phonesets_*.txt and extra_questions.txt ...
# phonesets_mono.txt is sets of phones that are shared when building the monophone system
# and when asking questions based on an automatic clustering of phones, for the
# triphone system. extra_questions.txt is some pre-defined extra questions about
# position and stress that split apart the categories we created in phonesets.txt.
# in extra_questions.txt there is also a question about silence phones, since we
# don't include them in our automatically generated clustering of phones.
mkdir -p $dir/phones
cat $dir/phones/silence.txt | awk '{printf("%s ", $1);} END{printf "\n";}' \
> $dir/phones/sets_mono.txt || exit 1;
cat $dir/phones/nonsilence.txt | \
perl -e 'while(<>){ m:([A-Za-z]+)(\d*)(_.)?: || die "Bad line $_";
$phone=$1; $stress=$2; $position=$3;
if($phone eq $curphone){ print " $phone$stress$position"; }
else { if(defined $curphone){ print "\n"; } $curphone=$phone; print "$phone$stress$position"; }} print "\n"; ' \
>> $dir/phones/sets_mono.txt || exit 1;
grep -v -w `head -1 $dir/phones/silence.txt` $dir/phones/sets_mono.txt \
> $dir/phones/sets_cluster.txt || exit 1;
cat $dir/phones/silence.txt | awk '{printf("%s ", $1);} END{printf "\n";}' \
> $dir/phones/extra_questions.txt
cat $dir/phones/nonsilence.txt | perl -e 'while(<>){ m:([A-Za-z]+)(\d*)(_.)?: || die "Bad line $_";
$phone=$1; $stress=$2; $pos=$3;
$full_phone ="$1$2$3";
$pos2list{$pos} = $pos2list{$pos} . $full_phone . " ";
$stress2list{$stress} = $stress2list{$stress} . $full_phone . " ";
}
foreach $k (keys %pos2list) { print "$pos2list{$k}\n"; }
foreach $k (keys %stress2list) { print "$stress2list{$k}\n"; } ' \
>> $dir/phones/extra_questions.txt || exit 1;
( # Creating the "roots file" for building the context-dependent systems...
# we share the roots across all the versions of each real phone. We also
# share the states of the 3 forms of silence. "not-shared" here means the
# states are distinct p.d.f.'s... normally we would automatically split on
# the HMM-state but we're not making silences context dependent.
cat $dir/phones/silence.txt | \
awk 'BEGIN {printf("not-shared not-split ");} {printf("%s ",$1);} END{printf "\n";}';
cat $dir/phones/nonsilence.txt | \
perl -e 'while(<>){ m:([A-Za-z]+)(\d*)(_.)?: || die "Bad line $_";
$phone=$1; $stress=$2; $position=$3;
if($phone eq $curphone){ print " $phone$stress$position"; }
else { if(defined $curphone){ print "\n"; } $curphone=$phone;
print "shared split $phone$stress$position"; }} print "\n"; '
) > $dir/phones/roots.txt || exit 1;
for x in sets_mono sets_cluster extra_questions disambig; do
# Create these lists of phones in colon-separated integer list form too,
# for purposes of being given to programs as command-line options.
for f in silence nonsilence disambig context_indep; do
utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt >$dir/phones/$f.int
utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt | \
awk '{printf(":%d", $1);} END{printf "\n"}' | sed s/:// > $dir/phones/$f.csl || exit 1;
done
for x in sets extra_questions; do
utils/sym2int.pl $dir/phones.txt <$dir/phones/$x.txt > $dir/phones/$x.int || exit 1;
done
......@@ -184,7 +163,6 @@ utils/sym2int.pl -f 1 $dir/phones.txt <$dir/phones/word_boundary.txt \
> $dir/phones/word_boundary.int || exit 1;
silphonelist=`cat $dir/phones/silence.csl | sed 's/:/ /g'`
nonsilphonelist=`cat $dir/phones/nonsilence.csl | sed 's/:/ /g'`
cat conf/topo.proto | sed "s:NONSILENCEPHONES:$nonsilphonelist:" | \
......
......@@ -171,7 +171,6 @@ if ($utt2spk_file ne "") { # We have the --utt2spk option...
for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
$scpfn = $OUTPUTS[$scpidx];
open(F, ">$scpfn") || die "Could not open scp file $scpfn for writing.";
binmode(F, ":utf8");
$count = 0;
if(@{$scparray[$scpidx]} == 0) {
print STDERR "Warning: split_scp.pl producing empty .scp file $scpfn (too many splits and too few speakers?)\n";
......
......@@ -96,5 +96,3 @@ while (<>) {
}
exit(0);
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment