Skip to content
Snippets Groups Projects
Commit 5dfa20aa authored by Daniel Povey
Browse files

Removing little-used feature: time-reversed, and forward-backward, decoding.

parent f0fab215
No related branches found
No related tags found
No related merge requests found
Showing changes with 18 additions and 261 deletions
......@@ -5,7 +5,6 @@
cmd=run.pl
min_lmwt=5
max_lmwt=17
reverse=false
#end configuration section.
[ -f ./path.sh ] && . ./path.sh
......@@ -16,7 +15,6 @@ if [ $# -ne 3 ]; then
echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes."
echo " --min_lmwt <int> # minumum LM-weight for lattice rescoring "
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
echo " --reverse (true/false) # score with time reversed features "
exit 1;
fi
......@@ -26,7 +24,7 @@ dir=$3
model=$dir/../final.mdl # assume model one level up from decoding dir.
hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl
hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl
[ ! -f $hubscr ] && echo "Cannot find scoring program at $hubscr" && exit 1;
hubdir=`dirname $hubscr`
......@@ -39,14 +37,14 @@ name=`basename $data`; # e.g. eval2000
mkdir -p $dir/scoring/log
function filter_text {
perl -e 'foreach $w (@ARGV) { $bad{$w} = 1; }
perl -e 'foreach $w (@ARGV) { $bad{$w} = 1; }
while(<STDIN>) { @A = split(" ", $_); $id = shift @A; print "$id ";
foreach $a (@A) { if (!defined $bad{$a}) { print "$a "; }} print "\n"; }' \
'[noise]' '[laughter]' '[vocalized-noise]' '<unk>' '%hesitation'
}
#function filter_text {
# perl -e 'foreach $w (@ARGV) { $bad{$w} = 1; }
# perl -e 'foreach $w (@ARGV) { $bad{$w} = 1; }
# while(<STDIN>) { @A = split(" ", $_); $id = shift @A; print "$id ";
# foreach $a (@A) { if (!defined $bad{$a}) { print "$a "; }} print "\n"; }' \
# '[NOISE]' '[LAUGHTER]' '[VOCALIZED-NOISE]' '<UNK>' '%HESITATION'
......@@ -59,11 +57,6 @@ $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \
for lmwt in `seq $min_lmwt $max_lmwt`; do
utils/int2sym.pl -f 2- $lang/words.txt <$dir/scoring/$lmwt.tra | \
filter_text > $dir/scoring/$lmwt.txt || exit 1;
if $reverse; then
mv $dir/scoring/$lmwt.txt $dir/scoring/$lmwt.txt.orig
awk '{ printf("%s ",$1); for(i=NF; i>1; i--){ printf("%s ",$i); } printf("\n"); }' \
<$dir/scoring/$lmwt.txt.orig >$dir/scoring/$lmwt.txt
fi
done
filter_text <$data/text >$dir/scoring/text.filt
......
#prepare reverse lexicon and language model for backwards decoding
utils/prepare_lang.sh --reverse true data/local/dict "<SPOKEN_NOISE>" data/local/lang_tmp.reverse data/lang.reverse || exit 1;
utils/reverse_lm.sh data/local/nist_lm/lm_bg_5k.arpa.gz data/lang.reverse data/lang_test_bg_5k.reverse || exit 1;
utils/reverse_lm_test.sh data/lang_test_bg_5k data/lang_test_bg_5k.reverse || exit 1;
# normal forward decoding
utils/mkgraph.sh data/lang_test_bg_5k exp/tri2a exp/tri2a/graph_bg5k
steps/decode_fwdbwd.sh --beam 10.0 --latbeam 4.0 --nj 8 --cmd "$decode_cmd" \
exp/tri2a/graph_bg5k data/test_eval92 exp/tri2a/decode_eval92_bg5k_10 || exit 1;
# backward decoding
utils/mkgraph.sh --reverse data/lang_test_bg_5k.reverse exp/tri2a exp/tri2a/graph_bg5k_r
steps/decode_fwdbwd.sh --beam 10.0 --latbeam 4.0 --reverse true --nj 8 --cmd "$decode_cmd" \
exp/tri2a/graph_bg5k_r data/test_eval92 exp/tri2a/decode_eval92_bg5k_reverse10 || exit 1;
# pingpong decoding
steps/decode_fwdbwd.sh --beam 10.0 --max-beam 20.0 --reverse true --nj 8 --cmd "$decode_cmd" \
--first_pass exp/tri2a/decode_eval92_bg5k_10 exp/tri2a/graph_bg5k_r data/test_eval92 exp/tri2a/decode_eval92_bg5k_pingpong10 || exit 1;
steps/decode_fwdbwd.sh --beam 10.0 --max-beam 20.0 --nj 8 --cmd "$decode_cmd" \
--first_pass exp/tri2a/decode_eval92_bg5k_reverse10 exp/tri2a/graph_bg5k data/test_eval92 exp/tri2a/decode_eval92_bg5k_pongping10 || exit 1;
# same for bigger language models (on machine with 8GB RAM, you can run the whole decoding in 3-4 min without SGE)
utils/prepare_lang.sh --reverse true data/local/dict_larger "<SPOKEN_NOISE>" data/local/lang_larger.reverse data/lang_bd.reverse || exit;
utils/reverse_lm.sh --lexicon data/local/dict_larger/lexicon.txt data/local/local_lm/3gram-mincount/lm_pr6.0.gz data/lang_bd.reverse data/lang_test_bd_tgpr.reverse || exit 1;
utils/reverse_lm_test.sh data/lang_test_bd_tgpr data/lang_test_bd_tgpr.reverse || exit 1;
utils/mkgraph.sh data/lang_test_bd_tgpr exp/tri2a exp/tri2a/graph_bd_tgpr
steps/decode_fwdbwd.sh --beam 10.0 --latbeam 4.0 --nj 4 --cmd run.pl \
exp/tri2a/graph_bd_tgpr data/test_eval92 exp/tri2a/decode_eval92_bdtgpr4_10 || exit 1;
utils/mkgraph.sh --reverse data/lang_test_bd_tgpr.reverse exp/tri2a exp/tri2a/graph_bd_tgpr_r
steps/decode_fwdbwd.sh --beam 10.0 --latbeam 4.0 --reverse true --nj 4 --cmd run.pl \
exp/tri2a/graph_bd_tgpr_r data/test_eval92 exp/tri2a/decode_eval92_bdtgpr4_reverse10 || exit 1;
steps/decode_fwdbwd.sh --beam 10.0 --max-beam 20.0 --reverse true --nj 4 --cmd run.pl \
--first_pass exp/tri2a/decode_eval92_bdtgpr4_10 exp/tri2a/graph_bd_tgpr_r data/test_eval92 \
exp/tri2a/decode_eval92_bdtgpr4_pingpong10 || exit 1;
steps/decode_fwdbwd.sh --beam 10.0 --max-beam 20.0 --nj 4 --cmd run.pl \
--first_pass exp/tri2a/decode_eval92_bdtgpr4_reverse10 exp/tri2a/graph_bd_tgpr data/test_eval92 \
exp/tri2a/decode_eval92_bdtgpr4_pongping10 || exit 1;
......@@ -8,7 +8,6 @@
cmd=run.pl
stage=0
decode_mbr=true
reverse=false
word_ins_penalty=0.0
min_lmwt=7
max_lmwt=17
......@@ -25,7 +24,6 @@ if [ $# -ne 3 ]; then
echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)."
echo " --min_lmwt <int> # minumum LM-weight for lattice rescoring "
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
echo " --reverse (true/false) # score with time reversed features "
exit 1;
fi
......@@ -49,14 +47,6 @@ $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \
lattice-best-path --word-symbol-table=$symtab \
ark:- ark,t:$dir/scoring/LMWT.tra || exit 1;
if $reverse; then
for lmwt in `seq $min_lmwt $max_lmwt`; do
mv $dir/scoring/$lmwt.tra $dir/scoring/$lmwt.tra.orig
awk '{ printf("%s ",$1); for(i=NF; i>1; i--){ printf("%s ",$i); } printf("\n"); }' \
<$dir/scoring/$lmwt.tra.orig >$dir/scoring/$lmwt.tra
done
fi
# Note: the double level of quoting for the sed command
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
cat $dir/scoring/LMWT.tra \| \
......
......@@ -8,7 +8,6 @@
cmd=run.pl
stage=0
decode_mbr=true
reverse=false
min_lmwt=7
max_lmwt=17
#end configuration section.
......@@ -24,7 +23,6 @@ if [ $# -ne 3 ]; then
echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)."
echo " --min_lmwt <int> # minumum LM-weight for lattice rescoring "
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
echo " --reverse (true/false) # score with time reversed features "
exit 1;
fi
......@@ -46,14 +44,6 @@ $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \
lattice-best-path --lm-scale=LMWT --word-symbol-table=$symtab \
"ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/scoring/LMWT.tra || exit 1;
if $reverse; then
for lmwt in `seq $min_lmwt $max_lmwt`; do
mv $dir/scoring/$lmwt.tra $dir/scoring/$lmwt.tra.orig
awk '{ printf("%s ",$1); for(i=NF; i>1; i--){ printf("%s ",$i); } printf("\n"); }' \
<$dir/scoring/$lmwt.tra.orig >$dir/scoring/$lmwt.tra
done
fi
# Note: the double level of quoting for the sed command
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
cat $dir/scoring/LMWT.tra \| \
......
......@@ -8,7 +8,6 @@
cmd=run.pl
stage=0
decode_mbr=true
reverse=false
min_lmwt=7
max_lmwt=17
#end configuration section.
......@@ -24,7 +23,6 @@ if [ $# -ne 3 ]; then
echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)."
echo " --min_lmwt <int> # minumum LM-weight for lattice rescoring "
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
echo " --reverse (true/false) # score with time reversed features "
exit 1;
fi
......@@ -46,14 +44,6 @@ $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \
lattice-best-path --lm-scale=LMWT --word-symbol-table=$symtab \
"ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/scoring/LMWT.tra || exit 1;
if $reverse; then
for lmwt in `seq $min_lmwt $max_lmwt`; do
mv $dir/scoring/$lmwt.tra $dir/scoring/$lmwt.tra.orig
awk '{ printf("%s ",$1); for(i=NF; i>1; i--){ printf("%s ",$i); } printf("\n"); }' \
<$dir/scoring/$lmwt.tra.orig >$dir/scoring/$lmwt.tra
done
fi
# Note: the double level of quoting for the sed command
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
cat $dir/scoring/LMWT.tra \| \
......
......@@ -8,7 +8,6 @@
cmd=run.pl
stage=0
decode_mbr=true
reverse=false
min_lmwt=7
max_lmwt=17
#end configuration section.
......@@ -24,7 +23,6 @@ if [ $# -ne 3 ]; then
echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)."
echo " --min_lmwt <int> # minumum LM-weight for lattice rescoring "
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
echo " --reverse (true/false) # score with time reversed features "
exit 1;
fi
......@@ -46,14 +44,6 @@ $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \
lattice-best-path --lm-scale=LMWT --word-symbol-table=$symtab \
"ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/scoring/LMWT.tra || exit 1;
if $reverse; then
for lmwt in `seq $min_lmwt $max_lmwt`; do
mv $dir/scoring/$lmwt.tra $dir/scoring/$lmwt.tra.orig
awk '{ printf("%s ",$1); for(i=NF; i>1; i--){ printf("%s ",$i); } printf("\n"); }' \
<$dir/scoring/$lmwt.tra.orig >$dir/scoring/$lmwt.tra
done
fi
# Note: the double level of quoting for the sed command
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
cat $dir/scoring/LMWT.tra \| \
......
......@@ -8,7 +8,6 @@
cmd=run.pl
stage=0
decode_mbr=true
reverse=false
word_ins_penalty=0.0
min_lmwt=5
max_lmwt=17
......@@ -25,7 +24,6 @@ if [ $# -ne 3 ]; then
echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)."
echo " --min_lmwt <int> # minumum LM-weight for lattice rescoring "
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
echo " --reverse (true/false) # score with time reversed features "
exit 1;
fi
......@@ -49,14 +47,6 @@ $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \
lattice-best-path --word-symbol-table=$symtab \
ark:- ark,t:$dir/scoring/LMWT.tra || exit 1;
if $reverse; then
for lmwt in `seq $min_lmwt $max_lmwt`; do
mv $dir/scoring/$lmwt.tra $dir/scoring/$lmwt.tra.orig
awk '{ printf("%s ",$1); for(i=NF; i>1; i--){ printf("%s ",$i); } printf("\n"); }' \
<$dir/scoring/$lmwt.tra.orig >$dir/scoring/$lmwt.tra
done
fi
# Note: the double level of quoting for the sed command
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
cat $dir/scoring/LMWT.tra \| \
......
......@@ -8,7 +8,6 @@
cmd=run.pl
stage=0
decode_mbr=true
reverse=false
word_ins_penalty=0.0
min_lmwt=5
max_lmwt=17
......@@ -25,7 +24,6 @@ if [ $# -ne 3 ]; then
echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)."
echo " --min_lmwt <int> # minumum LM-weight for lattice rescoring "
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
echo " --reverse (true/false) # score with time reversed features "
exit 1;
fi
......@@ -49,14 +47,6 @@ $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \
lattice-best-path --word-symbol-table=$symtab \
ark:- ark,t:$dir/scoring/LMWT.tra || exit 1;
if $reverse; then
for lmwt in `seq $min_lmwt $max_lmwt`; do
mv $dir/scoring/$lmwt.tra $dir/scoring/$lmwt.tra.orig
awk '{ printf("%s ",$1); for(i=NF; i>1; i--){ printf("%s ",$i); } printf("\n"); }' \
<$dir/scoring/$lmwt.tra.orig >$dir/scoring/$lmwt.tra
done
fi
# Note: the double level of quoting for the sed command
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
cat $dir/scoring/LMWT.tra \| \
......
......@@ -8,7 +8,6 @@
cmd=run.pl
stage=0
decode_mbr=true
reverse=false
word_ins_penalty=0.0
min_lmwt=5
max_lmwt=17
......@@ -25,7 +24,6 @@ if [ $# -ne 3 ]; then
echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)."
echo " --min_lmwt <int> # minumum LM-weight for lattice rescoring "
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
echo " --reverse (true/false) # score with time reversed features "
exit 1;
fi
......@@ -49,14 +47,6 @@ $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \
lattice-best-path --word-symbol-table=$symtab \
ark:- ark,t:$dir/scoring/LMWT.tra || exit 1;
if $reverse; then
for lmwt in `seq $min_lmwt $max_lmwt`; do
mv $dir/scoring/$lmwt.tra $dir/scoring/$lmwt.tra.orig
awk '{ printf("%s ",$1); for(i=NF; i>1; i--){ printf("%s ",$i); } printf("\n"); }' \
<$dir/scoring/$lmwt.tra.orig >$dir/scoring/$lmwt.tra
done
fi
# Note: the double level of quoting for the sed command
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
cat $dir/scoring/LMWT.tra \| \
......
......@@ -8,7 +8,6 @@
cmd=run.pl
min_lmwt=5
max_lmwt=17
reverse=false
#end configuration section.
[ -f ./path.sh ] && . ./path.sh
......@@ -20,7 +19,6 @@ if [ $# -ne 3 ]; then
echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes."
echo " --min_lmwt <int> # minumum LM-weight for lattice rescoring "
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
echo " --reverse (true/false) # score with time reversed features "
exit 1;
fi
......@@ -30,7 +28,7 @@ dir=$3
model=$dir/../final.mdl # assume model one level up from decoding dir.
hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl
hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl
#hubscr=$KALDI_ROOT/tools/sctk-2.4.0/bin/hubscr.pl
[ ! -f $hubscr ] && echo "Cannot find scoring program at $hubscr" && exit 1;
hubdir=`dirname $hubscr`
......@@ -43,19 +41,19 @@ name=`basename $data`; # e.g. eval1
mkdir -p $dir/scoring/log
mkdir -p $dir/label
mkdir -p $dir/label/log
mkdir -p $dir/label/log
mkdir -p $dir/label/wer
function filter_text_mor {
perl -e 'foreach $w (@ARGV) { $bad{$w} = 1; }
perl -e 'foreach $w (@ARGV) { $bad{$w} = 1; }
while(<STDIN>) { @A = split(" ", $_); $id = shift @A; print "$id ";
foreach $a (@A) { if (!defined $bad{$a}) { print "$a "; }} print "\n"; }' \
'<UNK>'
}
function filter_text {
perl -e 'foreach $w (@ARGV) { $bad{$w} = 1; }
while(<STDIN>) { @A = split(" ", $_); $id = shift @A; print "$id ";
perl -e 'foreach $w (@ARGV) { $bad{$w} = 1; }
while(<STDIN>) { @A = split(" ", $_); $id = shift @A; print "$id ";
foreach $a (@A) { if (!defined $bad{$a}){ @W=split(/\+/,$a); $word=$W[0]; { print "$word "; }}} print "\n"; }' \
'<UNK>'
}
......@@ -68,14 +66,9 @@ $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \
for lmwt in `seq $min_lmwt $max_lmwt`; do
utils/int2sym.pl -f 2- $lang/words.txt <$dir/scoring/$lmwt.tra | \
filter_text > $dir/scoring/$lmwt.txt || exit 1;
utils/int2sym.pl -f 2- $lang/words.txt <$dir/scoring/$lmwt.tra | \
filter_text_mor > $dir/label/${lmwt}-trans.text || exit 1;
if $reverse; then
mv $dir/scoring/$lmwt.txt $dir/scoring/$lmwt.txt.orig
awk '{ printf("%s ",$1); for(i=NF; i>1; i--){ printf("%s ",$i); } printf("\n"); }' \
<$dir/scoring/$lmwt.txt.orig >$dir/scoring/$lmwt.txt
fi
done
filter_text <$data/text >$dir/scoring/text.filt
......
......@@ -5,7 +5,6 @@
cmd=run.pl
min_lmwt=5
max_lmwt=17
reverse=false
#end configuration section.
[ -f ./path.sh ] && . ./path.sh
......@@ -17,7 +16,6 @@ if [ $# -ne 3 ]; then
echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes."
echo " --min_lmwt <int> # minumum LM-weight for lattice rescoring "
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
echo " --reverse (true/false) # score with time reversed features "
exit 1;
fi
......@@ -37,7 +35,7 @@ mkdir -p $dir/scoring/log
function filter_text {
perl -e 'foreach $w (@ARGV) { $bad{$w} = 1; }
perl -e 'foreach $w (@ARGV) { $bad{$w} = 1; }
while(<STDIN>) { @A = split(" ", $_); $id = shift @A; print "$id ";
foreach $a (@A) { if (!defined $bad{$a}) { print "$a "; }} print "\n"; }' \
'[NOISE]' '[LAUGHTER]' '[VOCALIZED-NOISE]' '<UNK>' '%HESITATION'
......@@ -50,11 +48,6 @@ $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \
for lmwt in `seq $min_lmwt $max_lmwt`; do
utils/int2sym.pl -f 2- $lang/words.txt <$dir/scoring/$lmwt.tra | \
filter_text > $dir/scoring/$lmwt.txt || exit 1;
if $reverse; then
mv $dir/scoring/$lmwt.txt $dir/scoring/$lmwt.txt.orig
awk '{ printf("%s ",$1); for(i=NF; i>1; i--){ printf("%s ",$i); } printf("\n"); }' \
<$dir/scoring/$lmwt.txt.orig >$dir/scoring/$lmwt.txt
fi
done
filter_text <$data/text >$dir/scoring/text.filt
......
......@@ -12,7 +12,6 @@ cmd=run.pl
stage=0
min_lmwt=5
max_lmwt=17
reverse=false
word_ins_penalty=0.0,0.5,1.0
#end configuration section.
......@@ -26,7 +25,6 @@ if [ $# -ne 3 ]; then
echo " --stage (0|1|2) # start scoring script from part-way through."
echo " --min_lmwt <int> # minumum LM-weight for lattice rescoring "
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
echo " --reverse (true/false) # score with time reversed features "
exit 1;
fi
......
......@@ -5,7 +5,6 @@
cmd=run.pl
min_lmwt=5
max_lmwt=17
reverse=false
word_ins_penalty=0.0,0.5,1.0
#end configuration section.
......@@ -18,7 +17,6 @@ if [ $# -ne 3 ]; then
echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes."
echo " --min_lmwt <int> # minumum LM-weight for lattice rescoring "
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
echo " --reverse (true/false) # score with time reversed features "
exit 1;
fi
......@@ -28,7 +26,7 @@ dir=$3
model=$dir/../final.mdl # assume model one level up from decoding dir.
hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl
hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl
[ ! -f $hubscr ] && echo "Cannot find scoring program at $hubscr" && exit 1;
hubdir=`dirname $hubscr`
......@@ -42,7 +40,7 @@ mkdir -p $dir/scoring/log
function filter_text {
perl -e 'foreach $w (@ARGV) { $bad{$w} = 1; }
perl -e 'foreach $w (@ARGV) { $bad{$w} = 1; }
while(<STDIN>) { @A = split(" ", $_); $id = shift @A; print "$id ";
foreach $a (@A) { if (!defined $bad{$a}) { print "$a "; }} print "\n"; }' \
'[NOISE]' '[LAUGHTER]' '[VOCALIZED-NOISE]' '<UNK>' '%HESITATION'
......@@ -58,11 +56,6 @@ for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
for lmwt in `seq $min_lmwt $max_lmwt`; do
utils/int2sym.pl -f 2- $lang/words.txt <$dir/scoring/$lmwt.${wip}.tra | \
filter_text > $dir/scoring/$lmwt.${wip}.txt || exit 1;
if $reverse; then
mv $dir/scoring/$lmwt.${wip}.txt $dir/scoring/$lmwt.${wip}.txt.orig
awk '{ printf("%s ",$1); for(i=NF; i>1; i--){ printf("%s ",$i); } printf("\n"); }' \
<$dir/scoring/$lmwt.${wip}.txt.orig >$dir/scoring/$lmwt.${wip}.txt
fi
done
done
......
......@@ -6,7 +6,6 @@ cmd=run.pl
stage=0
min_lmwt=5
max_lmwt=17
reverse=false
word_ins_penalty=0.0,0.5,1.0
#end configuration section.
......@@ -20,7 +19,6 @@ if [ $# -ne 3 ]; then
echo " --stage (0|1|2) # start scoring script from part-way through."
echo " --min_lmwt <int> # minumum LM-weight for lattice rescoring "
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
echo " --reverse (true/false) # score with time reversed features "
exit 1;
fi
......@@ -30,7 +28,7 @@ dir=$3
model=$dir/../final.mdl # assume model one level up from decoding dir.
hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl
hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl
[ ! -f $hubscr ] && echo "Cannot find scoring program at $hubscr" && exit 1;
hubdir=`dirname $hubscr`
......@@ -39,13 +37,6 @@ for f in $data/stm $data/glm $lang/words.txt $lang/phones/word_boundary.int \
[ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1;
done
align_word=
reorder_opt=
if $reverse; then
align_word="lattice-reverse ark:- ark:- |"
reorder_opt="--reorder=false"
fi
if [ -f $dir/../frame_shift ]; then
frame_shift_opt="--frame-shift=$(cat $dir/../frame_shift)"
echo "$0: $dir/../frame_shift exists, using $frame_shift_opt"
......@@ -66,7 +57,7 @@ if [ $stage -le 0 ]; then
lattice-scale --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
lattice-1best ark:- ark:- \| \
$align_word lattice-align-words $reorder_opts $lang/phones/word_boundary.int $model ark:- ark:- \| \
lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \
nbest-to-ctm $frame_shift_opt ark:- - \| \
utils/int2sym.pl -f 5 $lang/words.txt \| \
utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \
......@@ -95,7 +86,7 @@ fi
# Score the set...
if [ $stage -le 2 ]; then
for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.${wip}.log \
cp $data/stm $dir/score_LMWT_${wip}/ '&&' \
$hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm $dir/score_LMWT_${wip}/${name}.ctm || exit 1;
......@@ -106,7 +97,7 @@ fi
case "$name" in eval2000* )
# Score only the, swbd part...
if [ $stage -le 3 ]; then
for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.swbd.LMWT.${wip}.log \
grep -v '^en_' $data/stm '>' $dir/score_LMWT_${wip}/stm.swbd '&&' \
grep -v '^en_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.swbd '&&' \
......@@ -115,7 +106,7 @@ case "$name" in eval2000* )
fi
# Score only the, callhome part...
if [ $stage -le 3 ]; then
for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.callhm.LMWT.${wip}.log \
grep -v '^sw_' $data/stm '>' $dir/score_LMWT_${wip}/stm.callhm '&&' \
grep -v '^sw_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.callhm '&&' \
......@@ -125,7 +116,7 @@ case "$name" in eval2000* )
;;
rt03* )
# Score only the swbd part...
if [ $stage -le 3 ]; then
for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
......
......@@ -8,7 +8,6 @@
cmd=run.pl
stage=0
decode_mbr=true
reverse=false
word_ins_penalty=0.0
min_lmwt=7
max_lmwt=17
......@@ -25,7 +24,6 @@ if [ $# -ne 3 ]; then
echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)."
echo " --min_lmwt <int> # minumum LM-weight for lattice rescoring "
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
echo " --reverse (true/false) # score with time reversed features "
exit 1;
fi
......@@ -49,14 +47,6 @@ $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \
lattice-best-path --word-symbol-table=$symtab \
ark:- ark,t:$dir/scoring/LMWT.tra || exit 1;
if $reverse; then
for lmwt in `seq $min_lmwt $max_lmwt`; do
mv $dir/scoring/$lmwt.tra $dir/scoring/$lmwt.tra.orig
awk '{ printf("%s ",$1); for(i=NF; i>1; i--){ printf("%s ",$i); } printf("\n"); }' \
<$dir/scoring/$lmwt.tra.orig >$dir/scoring/$lmwt.tra
done
fi
# Note: the double level of quoting for the sed command
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
cat $dir/scoring/LMWT.tra \| \
......
......@@ -9,7 +9,6 @@
cmd=run.pl
stage=0
decode_mbr=true
reverse=false
word_ins_penalty=0.0,0.5,1.0
min_lmwt=7
max_lmwt=17
......@@ -27,7 +26,6 @@ if [ $# -ne 3 ]; then
echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)."
echo " --min_lmwt <int> # minumum LM-weight for lattice rescoring "
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
echo " --reverse (true/false) # score with time reversed features "
exit 1;
fi
......@@ -53,16 +51,6 @@ for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
ark:- ark,t:$dir/scoring/LMWT.$wip.tra || exit 1;
done
if $reverse; then
for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
for lmwt in `seq $min_lmwt $max_lmwt`; do
mv $dir/scoring/$lmwt.$wip.tra $dir/scoring/$lmwt.$wip.tra.orig
awk '{ printf("%s ",$1); for(i=NF; i>1; i--){ printf("%s ",$i); } printf("\n"); }' \
<$dir/scoring/$lmwt.$wip.tra.orig >$dir/scoring/$lmwt.$wip.tra
done
done
fi
# Note: the double level of quoting for the sed command
for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.$wip.log \
......
......@@ -8,7 +8,6 @@
cmd=run.pl
stage=0
decode_mbr=true
reverse=false
word_ins_penalty=0.0
min_lmwt=7
max_lmwt=17
......@@ -25,7 +24,6 @@ if [ $# -ne 3 ]; then
echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)."
echo " --min_lmwt <int> # minumum LM-weight for lattice rescoring "
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
echo " --reverse (true/false) # score with time reversed features "
exit 1;
fi
......@@ -49,14 +47,6 @@ $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \
lattice-best-path --word-symbol-table=$symtab \
ark:- ark,t:$dir/scoring/LMWT.tra || exit 1;
if $reverse; then
for lmwt in `seq $min_lmwt $max_lmwt`; do
mv $dir/scoring/$lmwt.tra $dir/scoring/$lmwt.tra.orig
awk '{ printf("%s ",$1); for(i=NF; i>1; i--){ printf("%s ",$i); } printf("\n"); }' \
<$dir/scoring/$lmwt.tra.orig >$dir/scoring/$lmwt.tra
done
fi
# Note: the double level of quoting for the sed command
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
cat $dir/scoring/LMWT.tra \| \
......
#prepare reverse lexicon and language model for backwards decoding
utils/prepare_lang.sh --reverse true data/local/dict "<SPOKEN_NOISE>" data/local/lang_tmp.reverse data/lang.reverse || exit 1;
utils/reverse_lm.sh data/local/nist_lm/lm_bg_5k.arpa.gz data/lang.reverse data/lang_test_bg_5k.reverse || exit 1;
utils/reverse_lm_test.sh data/lang_test_bg_5k data/lang_test_bg_5k.reverse || exit 1;
# normal forward decoding
utils/mkgraph.sh data/lang_test_bg_5k exp/tri2a exp/tri2a/graph_bg5k
steps/decode_fwdbwd.sh --beam 10.0 --latbeam 4.0 --nj 8 --cmd "$decode_cmd" \
exp/tri2a/graph_bg5k data/test_eval92 exp/tri2a/decode_eval92_bg5k_10 || exit 1;
# backward decoding
utils/mkgraph.sh --reverse data/lang_test_bg_5k.reverse exp/tri2a exp/tri2a/graph_bg5k_r
steps/decode_fwdbwd.sh --beam 10.0 --latbeam 4.0 --reverse true --nj 8 --cmd "$decode_cmd" \
exp/tri2a/graph_bg5k_r data/test_eval92 exp/tri2a/decode_eval92_bg5k_reverse10 || exit 1;
# pingpong decoding
steps/decode_fwdbwd.sh --beam 10.0 --max-beam 20.0 --reverse true --nj 8 --cmd "$decode_cmd" \
--first_pass exp/tri2a/decode_eval92_bg5k_10 exp/tri2a/graph_bg5k_r data/test_eval92 exp/tri2a/decode_eval92_bg5k_pingpong10 || exit 1;
steps/decode_fwdbwd.sh --beam 10.0 --max-beam 20.0 --nj 8 --cmd "$decode_cmd" \
--first_pass exp/tri2a/decode_eval92_bg5k_reverse10 exp/tri2a/graph_bg5k data/test_eval92 exp/tri2a/decode_eval92_bg5k_pongping10 || exit 1;
# same for bigger language models (on machine with 8GB RAM, you can run the whole decoding in 3-4 min without SGE)
utils/prepare_lang.sh --reverse true data/local/dict_larger "<SPOKEN_NOISE>" data/local/lang_larger.reverse data/lang_bd.reverse || exit;
utils/reverse_lm.sh --lexicon data/local/dict_larger/lexicon.txt data/local/local_lm/3gram-mincount/lm_pr6.0.gz data/lang_bd.reverse data/lang_test_bd_tgpr.reverse || exit 1;
utils/reverse_lm_test.sh data/lang_test_bd_tgpr data/lang_test_bd_tgpr.reverse || exit 1;
utils/mkgraph.sh data/lang_test_bd_tgpr exp/tri2a exp/tri2a/graph_bd_tgpr
steps/decode_fwdbwd.sh --beam 10.0 --latbeam 4.0 --nj 4 --cmd run.pl \
exp/tri2a/graph_bd_tgpr data/test_eval92 exp/tri2a/decode_eval92_bdtgpr4_10 || exit 1;
utils/mkgraph.sh --reverse data/lang_test_bd_tgpr.reverse exp/tri2a exp/tri2a/graph_bd_tgpr_r
steps/decode_fwdbwd.sh --beam 10.0 --latbeam 4.0 --reverse true --nj 4 --cmd run.pl \
exp/tri2a/graph_bd_tgpr_r data/test_eval92 exp/tri2a/decode_eval92_bdtgpr4_reverse10 || exit 1;
steps/decode_fwdbwd.sh --beam 10.0 --max-beam 20.0 --reverse true --nj 4 --cmd run.pl \
--first_pass exp/tri2a/decode_eval92_bdtgpr4_10 exp/tri2a/graph_bd_tgpr_r data/test_eval92 \
exp/tri2a/decode_eval92_bdtgpr4_pingpong10 || exit 1;
steps/decode_fwdbwd.sh --beam 10.0 --max-beam 20.0 --nj 4 --cmd run.pl \
--first_pass exp/tri2a/decode_eval92_bdtgpr4_reverse10 exp/tri2a/graph_bd_tgpr data/test_eval92 \
exp/tri2a/decode_eval92_bdtgpr4_pongping10 || exit 1;
......@@ -8,7 +8,6 @@
cmd=run.pl
stage=0
decode_mbr=true
reverse=false
word_ins_penalty=0.0
min_lmwt=7
max_lmwt=17
......@@ -25,7 +24,6 @@ if [ $# -ne 3 ]; then
echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)."
echo " --min_lmwt <int> # minumum LM-weight for lattice rescoring "
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
echo " --reverse (true/false) # score with time reversed features "
exit 1;
fi
......@@ -49,14 +47,6 @@ $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \
lattice-best-path --word-symbol-table=$symtab \
ark:- ark,t:$dir/scoring/LMWT.tra || exit 1;
if $reverse; then
for lmwt in `seq $min_lmwt $max_lmwt`; do
mv $dir/scoring/$lmwt.tra $dir/scoring/$lmwt.tra.orig
awk '{ printf("%s ",$1); for(i=NF; i>1; i--){ printf("%s ",$i); } printf("\n"); }' \
<$dir/scoring/$lmwt.tra.orig >$dir/scoring/$lmwt.tra
done
fi
# Note: the double level of quoting for the sed command
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
cat $dir/scoring/LMWT.tra \| \
......
......@@ -8,7 +8,6 @@
cmd=run.pl
stage=0
decode_mbr=true
reverse=false
word_ins_penalty=0.0
min_lmwt=7
max_lmwt=17
......@@ -25,7 +24,6 @@ if [ $# -ne 3 ]; then
echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)."
echo " --min_lmwt <int> # minumum LM-weight for lattice rescoring "
echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring "
echo " --reverse (true/false) # score with time reversed features "
exit 1;
fi
......@@ -49,14 +47,6 @@ $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \
lattice-best-path --word-symbol-table=$symtab \
ark:- ark,t:$dir/scoring/LMWT.tra || exit 1;
if $reverse; then
for lmwt in `seq $min_lmwt $max_lmwt`; do
mv $dir/scoring/$lmwt.tra $dir/scoring/$lmwt.tra.orig
awk '{ printf("%s ",$1); for(i=NF; i>1; i--){ printf("%s ",$i); } printf("\n"); }' \
<$dir/scoring/$lmwt.tra.orig >$dir/scoring/$lmwt.tra
done
fi
# Note: the double level of quoting for the sed command
$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
cat $dir/scoring/LMWT.tra \| \
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment