Skip to content
Snippets Groups Projects
Commit cb05ff96 authored by Daniel Povey's avatar Daniel Povey
Browse files

Merge pull request #145 from david-ryan-snyder/nnet-am-compute-chunked

Adding nnet2 function NnetComputationChunked
parents 0628d2db ab57be7b
No related branches found
No related tags found
No related merge requests found
// nnet2/nnet-compute-test.cc
// Copyright 2014 Johns Hopkins University (author: Daniel Povey)
// Copyright 2015 David Snyder
// See ../../COPYING for clarification regarding multiple authors
//
......@@ -80,6 +81,41 @@ void UnitTestNnetCompute() {
delete nnet;
}
void UnitTestNnetComputeChunked() {
int32 input_dim = 10 + rand() % 40, output_dim = 100 + rand() % 500;
bool pad_input = true;
Nnet *nnet = GenRandomNnet(input_dim, output_dim);
int32 num_feats = 100 + rand() % 500;
int32 chunk_size = num_feats / (2 + rand() % 10);
CuMatrix<BaseFloat> input(num_feats, input_dim);
input.SetRandn();
KALDI_LOG << "Left context = " << nnet->LeftContext()
<< ", right context = " << nnet->RightContext()
<< ", chunk size = " << chunk_size;
KALDI_LOG << "NNet info is " << nnet->Info();
int32 num_output_rows = num_feats;
CuMatrix<BaseFloat> cu_output1(num_output_rows, output_dim);
Matrix<BaseFloat> output2(num_output_rows, output_dim);
NnetComputation(*nnet, input, pad_input, &cu_output1);
NnetComputationChunked(*nnet, Matrix<BaseFloat>(input), chunk_size,
&output2);
Matrix<BaseFloat> output1(cu_output1);
AssertEqual(output1, output2);
for (int32 i = 0; i < output1.NumRows(); i++) {
// just double-check that the frames near the end are right, in case
// the test above somehow passed despite that.
if (i < 10 || output1.NumRows() - i < 10) {
SubVector<BaseFloat> vec1(output1, i), vec2(output2, i);
AssertEqual(vec1, vec2);
}
}
KALDI_LOG << "OK";
delete nnet;
}
} // namespace nnet2
} // namespace kaldi
......@@ -92,6 +128,7 @@ int main() {
for (int32 i = 0; i < 10; i++)
UnitTestNnetCompute();
UnitTestNnetComputeChunked();
return 0;
}
// nnet2/nnet-compute.cc
// Copyright 2012 Johns Hopkins University (author: Daniel Povey)
// Copyright 2015 David Snyder
// See ../../COPYING for clarification regarding multiple authors
//
......@@ -165,6 +166,45 @@ void NnetComputation(const Nnet &nnet,
output->CopyFromMat(nnet_computer.GetOutput());
}
void NnetComputationChunked(const Nnet &nnet,
const Matrix<BaseFloat> &input, // features
int32 chunk_size,
Matrix<BaseFloat> *output) {
int32 num_rows,
num_chunks = ceil((BaseFloat)input.NumRows() / chunk_size),
dim = input.NumCols(),
left_context = nnet.LeftContext(),
right_context = nnet.RightContext();
Matrix<BaseFloat> full_input;
num_rows = left_context + input.NumRows() + right_context;
full_input.Resize(num_rows, dim);
full_input.Range(left_context, input.NumRows(),
0, dim).CopyFromMat(input);
for (int32 i = 0; i < left_context; i++)
full_input.Row(i).CopyFromVec(input.Row(0));
int32 last_row = input.NumRows() - 1;
for (int32 i = 0; i < right_context; i++)
full_input.Row(num_rows - i - 1).CopyFromVec(input.Row(last_row));
for (int32 i = 0; i < num_chunks; i++) {
int32 index = i * chunk_size,
offset = std::min(num_rows - chunk_size * i,
left_context + chunk_size + right_context);
SubMatrix<BaseFloat> chunk_input(full_input, index, offset, 0, dim);
CuMatrix<BaseFloat> cu_chunk_input(chunk_input);
// Note: we have already accounted for input padding, so we pass
// pad_input==false to the NnetComputer.
NnetComputer nnet_computer(nnet, cu_chunk_input, false, NULL);
nnet_computer.Propagate();
CuMatrix<BaseFloat> cu_chunk_output(nnet_computer.GetOutput());
SubMatrix<BaseFloat> chunk_out(*output, i * chunk_size,
cu_chunk_output.NumRows(), 0,
cu_chunk_output.NumCols());
chunk_out.CopyFromMat(cu_chunk_output);
}
}
BaseFloat NnetGradientComputation(const Nnet &nnet,
const CuMatrixBase<BaseFloat> &input,
bool pad_input,
......
// nnet2/nnet-compute.h
// Copyright 2012 Johns Hopkins University (author: Daniel Povey)
// Copyright 2015 David Snyder
// See ../../COPYING for clarification regarding multiple authors
//
......@@ -45,6 +46,19 @@ void NnetComputation(const Nnet &nnet,
const CuMatrixBase<BaseFloat> &input, // features
bool pad_input,
CuMatrixBase<BaseFloat> *output); // posteriors.
/**
Does the basic neural net computation, on a sequence of data (e.g.
an utterance). This variant of NnetComputation chunks the input
according to chunk_size and does the posterior computation chunk
by chunk. This allows the computation to be performed on the GPU
when the input matrix is very large. Input is padded with enough
frames of context so that the output will be a matrix with
input.NumRows() rows.
*/
void NnetComputationChunked(const Nnet &nnet,
const Matrix<BaseFloat> &input, // features
int32 chunk_size,
Matrix<BaseFloat> *output); // posteriors.
/** Does the neural net computation and backprop, given input and labels.
Note: if pad_input==true the number of rows of input should be the
......
......@@ -2,6 +2,7 @@
// Copyright 2012 Johns Hopkins University (author: Daniel Povey)
// 2015 Johns Hopkins University (author: Daniel Garcia-Romero)
// 2015 David Snyder
// See ../../COPYING for clarification regarding multiple authors
//
......@@ -44,6 +45,7 @@ int main(int argc, char *argv[]) {
bool apply_log = false;
bool pad_input = true;
std::string use_gpu = "no";
int32 chunk_size = 0;
ParseOptions po(usage);
po.Register("apply-log", &apply_log, "Apply a log to the result of the computation "
"before outputting.");
......@@ -52,6 +54,9 @@ int main(int argc, char *argv[]) {
"of output being less than those of input.");
po.Register("use-gpu", &use_gpu,
"yes|no|optional|wait, only has effect if compiled with CUDA");
po.Register("chunk-size", &chunk_size, "Process the feature matrix in chunks. "
"This is useful when processing large feature files in the GPU. "
"If chunk-size > 0, pad-input must be true.");
po.Read(argc, argv);
......@@ -59,6 +64,9 @@ int main(int argc, char *argv[]) {
po.PrintUsage();
exit(1);
}
// If chunk_size is greater than 0, pad_input needs to be true.
// NOTE(review): the assert below tests 'chunk_size < 0', which also requires
// pad_input when chunk_size == 0; it likely should be 'chunk_size <= 0'.
KALDI_ASSERT(chunk_size < 0 || pad_input);
#if HAVE_CUDA==1
CuDevice::Instantiate().SelectGpuId(use_gpu);
#endif
......@@ -79,12 +87,12 @@ int main(int argc, char *argv[]) {
Nnet &nnet = am_nnet.GetNnet();
int64 num_done = 0, num_frames = 0;
SequentialBaseFloatCuMatrixReader feature_reader(features_rspecifier);
BaseFloatCuMatrixWriter writer(features_or_loglikes_wspecifier);
SequentialBaseFloatMatrixReader feature_reader(features_rspecifier);
BaseFloatMatrixWriter writer(features_or_loglikes_wspecifier);
for (; !feature_reader.Done(); feature_reader.Next()) {
std::string utt = feature_reader.Key();
const CuMatrix<BaseFloat> &feats = feature_reader.Value();
const Matrix<BaseFloat> &feats = feature_reader.Value();
int32 output_frames = feats.NumRows(), output_dim = nnet.OutputDim();
if (!pad_input)
......@@ -94,8 +102,16 @@ int main(int argc, char *argv[]) {
<< "would be empty.";
continue;
}
CuMatrix<BaseFloat> output(output_frames, output_dim);
NnetComputation(nnet, feats, pad_input, &output);
Matrix<BaseFloat> output(output_frames, output_dim);
if (chunk_size > 0 && chunk_size < feats.NumRows()) {
NnetComputationChunked(nnet, feats, chunk_size, &output);
} else {
CuMatrix<BaseFloat> cu_feats(feats);
CuMatrix<BaseFloat> cu_output(output);
NnetComputation(nnet, cu_feats, pad_input, &cu_output);
output.CopyFromMat(cu_output);
}
if (apply_log) {
output.ApplyFloor(1.0e-20);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment