Adding some CCTC training code

bbac0793 · Daniel Povey · Dan Povey · a0804fd8 · bbac0793 · bbac0793
Commit bbac0793 authored 9 years ago by Daniel Povey Committed by Dan Povey 9 years ago
--- a/src/ctc/cctc-training.cc
+++ b/src/ctc/cctc-training.cc
+// ctc/cctc-training.cc
+
+// Copyright      2015   Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "ctc/cctc-training.h"
+
+namespace kaldi {
+namespace ctc {
+
+// Forward computation for CCTC training (declared in ctc/cctc-training.h).
+// At present this only sanity-checks its inputs; the forward pass itself has
+// not been written yet, so *forward_data is left untouched.
+//   supervision   supervision object; its num_frames must match nnet_output.
+//   nnet_output   neural-net output, one row per frame.
+//   forward_data  output location for the quantities Backward() will need.
+void CctcTraining::Forward(const CtcSupervision &supervision,
+                           const CuMatrixBase<BaseFloat> &nnet_output,
+                           ForwardData *forward_data) {
+  // The supervision must describe exactly as many frames as the network
+  // produced rows of output.
+  KALDI_ASSERT(supervision.num_frames == nnet_output.NumRows());
+  // TODO: a second condition on `supervision` was started here but never
+  // finished (the statement was left unterminated); restore it once the
+  // intended check is known, then implement the forward computation.
+}
+
+
+}  // namespace ctc
+}  // namespace kaldi
--- a/src/ctc/cctc-training.h
+++ b/src/ctc/cctc-training.h
+// ctc/cctc-training.h
+
+// Copyright       2015  Johns Hopkins University (Author: Daniel Povey)
+
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+#ifndef KALDI_CTC_CCTC_TRAINING_H_
+#define KALDI_CTC_CCTC_TRAINING_H_
+
+#include <vector>
+#include <map>
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "fstext/fstext-lib.h"
+#include "tree/context-dep.h"
+#include "lat/kaldi-lattice.h"
+#include "matrix/kaldi-matrix.h"
+#include "ctc/language-model.h"
+
+namespace kaldi {
+namespace ctc {
+
+// CTC means Connectionist Temporal Classification, see the paper by Graves et
+// al.  CCTC means context-dependent CTC, it's an extension of the original model,
+// in which the next phone is dependent on the phone history (actually, a truncation
+// thereof) in addition to the acoustic history.
+
+
+// Configuration values for CCTC training; they can be exposed on the command
+// line through Register().
+struct CctcTrainingOptions {
+  // Weight on the squared-log-denominator term of the objective (see the help
+  // text in Register()).  Defaults to 0.0001.
+  BaseFloat normalizing_weight;
+
+  CctcTrainingOptions()
+      : normalizing_weight(0.0001) { }
+
+  // Registers every option of this struct with the supplied options object.
+  void Register(OptionsItf *opts) {
+    opts->Register(
+        "normalizing-weight", &normalizing_weight,
+        "Weight on a term in the objective function "
+        "that's a squared log of the denominator in the "
+        "CCTC likelihood; it exists to keep the network "
+        "outputs in a reasonable range so we can exp() "
+        "them without overflow.");
+  }
+};
+
+// This class is not responsible for the entire process of CCTC model training;
+// it is only responsible for the forward-backward from the neural net output,
+// and the derivative computation.
+class CctcTraining {
+ public:
+  // Quantities that Forward() writes and Backward() later reads.
+  struct ForwardData {
+    // NOTE(review): presumably the element-wise exp() of the neural-net
+    // output -- confirm once Forward() is implemented.
+    CuMatrix<BaseFloat> exp_nnet_output;
+    // NOTE(review): presumably the denominators of the CCTC probabilities
+    // -- confirm once Forward() is implemented.
+    CuMatrix<BaseFloat> normalizers;
+
+    // The log-alpha value (forward score) for each state in the lattice.
+    Vector<double> alpha;
+    // NOTE(review): presumably the total likelihood from the forward pass.
+    // It has no default initializer, so it is indeterminate until assigned.
+    BaseFloat tot_like;
+  };
+
+  // Both arguments are held by reference (not copied), so they must outlive
+  // this object.
+  CctcTraining(const CctcTrainingOptions &opts,
+               const CctcTransitionModel &trans_model):
+      opts_(opts), trans_model_(trans_model) { }
+
+  /**
+     This function does the forward computation, up to
+     computing the objective.
+       @param [in] supervision   The supervision for this example.
+       @param [in] nnet_output   The output of the neural net.
+       @param [out] forward_data Where the results needed by Backward()
+                                 are written.
+   */
+  void Forward(const CtcSupervision &supervision,
+               const CuMatrixBase<BaseFloat> &nnet_output,
+               ForwardData *forward_data);
+
+  /**
+     Backward computation; takes the same supervision and neural-net output
+     that were given to Forward(), plus the ForwardData it filled in.
+     NOTE(review): presumably writes the derivative of the objective w.r.t.
+     the neural-net output to 'nnet_deriv' -- confirm against the
+     implementation.
+   */
+  void Backward(const CtcSupervision &supervision,
+                const CuMatrixBase<BaseFloat> &nnet_output,
+                const ForwardData &forward_data,
+                CuMatrixBase<BaseFloat> *nnet_deriv);
+
+ private:
+  // Const reference: the constructor receives the options as a const
+  // reference, which cannot bind to a non-const reference member.
+  const CctcTrainingOptions &opts_;
+  const CctcTransitionModel &trans_model_;
+
+  // CUDA copy of trans_model_.GetWeights().
+  // NOTE(review): the constructor does not populate this yet.
+  CuMatrix<BaseFloat> weights_;
+};
+
+
+
+}  // namespace ctc
+}  // namespace kaldi
+
+#endif  // KALDI_CTC_CCTC_TRAINING_H_
+
--- a/src/ctc/cctc-transition-model.cc
+++ b/src/ctc/cctc-transition-model.cc
@@ -217,18 +217,17 @@ void CctcTransitionModel::ComputeWeights() {
  int32 num_history_states = history_state_info_.size(),
      num_output_indexes = num_output_indexes_,
      num_phones = num_phones_;
-  Matrix<BaseFloat> weights(num_history_states,
-                            num_output_indexes);
+  weights_.Resize(num_history_states,
+                  num_output_indexes);
  for (int32 h = 0; h < num_history_states; h++) {
    const HistoryStateInfo &info = history_state_info_[h];
-    SubVector<BaseFloat> row(weights, h);
+    SubVector<BaseFloat> row(weights_, h);
    for (int32 p = 0; p <= num_phones; p++) {
      int32 output_index = info.output_index[p];
      BaseFloat lm_prob = info.phone_lm_prob(p);
      row(output_index) += lm_prob;
    }
  }
-  weights_.Swap(&weights);
 }

 CctcTransitionModelCreator::CctcTransitionModelCreator(

--- a/src/ctc/cctc-transition-model.h
+++ b/src/ctc/cctc-transition-model.h
-// lat/cctc-transition-model.h
+// ctc/cctc-transition-model.h

 // Copyright       2015  Johns Hopkins University (Author: Daniel Povey)

@@ -30,7 +30,7 @@
 #include "fstext/fstext-lib.h"
 #include "tree/context-dep.h"
 #include "lat/kaldi-lattice.h"
-#include "cudamatrix/cu-matrix.h"
+#include "matrix/kaldi-matrix.h"
 #include "ctc/language-model.h"

 namespace kaldi {
@@ -76,7 +76,7 @@ class CctcTransitionModel {
  // probabilities: row index is history-state index from 0 to
  // NumHistoryStates() - 1, column index is neural-net output index, from 0 to
  // NumOutputIndexes() - 1.
-  const CuMatrix<BaseFloat> &GetWeights() const { return weights_; }
+  const Matrix<BaseFloat> &GetWeights() const { return weights_; }
  
  // A graph-label is a similar concept to a transition-id in HMM-based models;
  // it's a one-based index that appears on the input side of a decoding graph
@@ -195,7 +195,7 @@ class CctcTransitionModel {
  // and its column dimension is the output dimension of the neural net.
  // A row of this will be dotted with the output of the neural net to
  // get the denominator of the probability value.
-  CuMatrix<BaseFloat> weights_;
+  Matrix<BaseFloat> weights_;
  
  friend class CctcTransitionModelCreator;
 };

--- a/src/doc/ctc.dox
+++ b/src/doc/ctc.dox
@@ -39,7 +39,15 @@ namespace kaldi {
  "Fast and Accurate Recurrent Neural Network Acoustic Models for Speech
  Recognition" by Hasim Sak, Andrew Senior et al.

- Todo: finish this intro.
+ What we are implementing is something we call context-dependent CTC (CCTC).
+ It is an extension of the original CTC model in which the next symbol depends
+ not only on the acoustic history but also on the phonetic history.  We train
+ against a phone-level language model in order to force the model to address
+ only the acoustic side of things.  We also don't allow repeats of non-blank
+ symbols.  The phonetic history is not given as an input to the neural net;
+ instead, it determines the set of symbols over which we normalize the
+ probabilities.  We'll have to create a more extensive introduction.
+


  \section Context-dependent CTC