Skip to content
Snippets Groups Projects
Commit 7d4bbace authored by Karel Vesely's avatar Karel Vesely
Browse files

remove the compile-time dependency on the libcuda.so, use dynamic loading instead


git-svn-id: https://svn.code.sf.net/p/kaldi/code/trunk@1758 5e6a8d80-dfce-4ca6-a32a-6e07a63d50c8
parent 56f5d5cc
No related branches found
No related tags found
No related merge requests found
......@@ -23,6 +23,7 @@
#include <cuda.h>
#include <vector>
#include <dlfcn.h>
#include "cudamatrix/cu-common.h"
#include "cudamatrix/cu-device.h"
......@@ -48,7 +49,7 @@ CuDevice::CuDevice()
|| gpu_prop.computeMode == cudaComputeModeExclusiveProcess) {
cudaDeviceSynchronize();
char gpu_name[128];
cuDeviceGetName(gpu_name, 128, gpu_id);
DeviceGetName(gpu_name, 128, gpu_id);
std::string mem_stats = GetFreeMemory(NULL, NULL);
KALDI_LOG << "CUDA setup operating under Compute Exclusive Mode.\n"
<< " Using device " << gpu_id << ": " << gpu_name << "\t" << mem_stats;
......@@ -70,7 +71,7 @@ CuDevice::CuDevice()
cudaThreadSynchronize(); //deprecated, but for legacy reason...
//get GPU name
char name[128];
cuDeviceGetName(name,128,n);
DeviceGetName(name,128,n);
//get GPU memory stats
int64 free, total;
std::string mem_stats;
......@@ -202,13 +203,41 @@ void CuDevice::PrintProfile() {
std::string CuDevice::GetFreeMemory(int64* free, int64* total) {
// WARNING! the CUDA API is inconsistent across versions!
#if (CUDA_VERSION >= 3020)
//define the function signature type
size_t mem_free, mem_total;
#else
unsigned int mem_free, mem_total;
#endif
// get the free memory stats
cuMemGetInfo(&mem_free, &mem_total);
// post them outside
{
//we will load the cuMemGetInfo dynamically from libcuda.so
//cuMemGetInfo(&mem_free, &mem_total);
//pre-fill ``safe'' values that will not cause problems
mem_free = 1; mem_total = 1;
//open libcuda.so
void* libcuda = dlopen("libcuda.so",RTLD_LAZY);
if(NULL == libcuda) {
KALDI_WARN << "cannot open libcuda.so";
} else {
//define the function signature type
//and get the symbol
#if (CUDA_VERSION >= 3020)
typedef CUresult (*cu_fun_ptr)(size_t*, size_t*);
cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo_v2");
#else
typedef CUresult (*cu_fun_ptr)(int*, int*);
cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo");
#endif
if(NULL == dl_cuMemGetInfo) {
KALDI_WARN << "cannot load cuMemGetInfo from libcuda.so";
} else {
//call the function
dl_cuMemGetInfo(&mem_free, &mem_total);
}
//close the library
dlclose(libcuda);
}
}
// copy the output values outside
if(NULL != free) *free = mem_free;
if(NULL != total) *total = mem_total;
// prepare the text output
......@@ -221,6 +250,29 @@ std::string CuDevice::GetFreeMemory(int64* free, int64* total) {
}
void CuDevice::DeviceGetName(char* name, int32 len, int32 dev) {
  // Write a human-readable name of GPU device `dev` into `name`
  // (at most `len` bytes, always NUL-terminated).
  //
  // The driver-API symbol cuDeviceGetName is resolved at runtime via
  // dlopen/dlsym, so the binary carries no link-time dependency on
  // libcuda.so (hosts without the NVIDIA driver can still run CPU paths).
  if (NULL == name || len <= 0) return;  // nothing we can safely write into
  // prefill with something reasonable;
  // NOTE: strncpy does not NUL-terminate when the source string is longer
  // than `len`, so terminate explicitly to avoid an unterminated buffer.
  strncpy(name, "Unknown GPU", len);
  name[len-1] = '\0';
  // open libcuda.so
  void* libcuda = dlopen("libcuda.so", RTLD_LAZY);
  if (NULL == libcuda) {
    KALDI_WARN << "cannot open libcuda.so";
  } else {
    // define the function signature type
    typedef CUresult (*cu_fun_ptr)(char*, int, CUdevice);
    // get the symbol
    cu_fun_ptr cuDeviceGetName_ptr = (cu_fun_ptr)dlsym(libcuda, "cuDeviceGetName");
    if (NULL == cuDeviceGetName_ptr) {
      KALDI_WARN << "cannot load cuDeviceGetName from libcuda.so";
    } else {
      // call the function; CUDA_SUCCESS == 0, so on any non-zero status we
      // restore the "Unknown GPU" fallback rather than expose garbage.
      if (0 != cuDeviceGetName_ptr(name, len, dev)) {
        strncpy(name, "Unknown GPU", len);
      }
      // the driver API does not promise termination at the length limit
      name[len-1] = '\0';
    }
    // close the library
    dlclose(libcuda);
  }
}
////////////////////////////////////////////////
// The instance of the static singleton
......
......@@ -79,6 +79,8 @@ class CuDevice {
/// Get the actual GPU memory use stats
std::string GetFreeMemory(int64* free = NULL, int64* total = NULL);
/// Get the name of the GPU
void DeviceGetName(char* name, int32 len, int32 dev);
private:
......
......@@ -4,5 +4,5 @@ CUDA_FLAGS = -g -Xcompiler -fPIC --verbose --machine 32 -DHAVE_CUDA
CXXFLAGS += -DHAVE_CUDA -I$(CUDATKDIR)/include
LDFLAGS += -L$(CUDATKDIR)/lib -Wl,-rpath=$(CUDATKDIR)/lib
LDFLAGS += -lcublas -lcudart -lcuda
LDFLAGS += -lcublas -lcudart
......@@ -4,5 +4,5 @@ CUDA_FLAGS = -g -Xcompiler -fPIC --verbose --machine 64 -DHAVE_CUDA
CXXFLAGS += -DHAVE_CUDA -I$(CUDATKDIR)/include
LDFLAGS += -L$(CUDATKDIR)/lib64 -Wl,-rpath,$(CUDATKDIR)/lib64
LDFLAGS += -lcublas -lcudart -lcuda
LDFLAGS += -lcublas -lcudart
// gmmbin/transf-to-nnet.cc
// gmmbin/cmvn-to-nnet.cc
// Copyright 2012 Brno University of Technology
......@@ -33,9 +33,11 @@ int main(int argc, char *argv[]) {
bool binary_write = false;
bool tied_normalzation = false;
ParseOptions po(usage);
po.Register("binary", &binary_write, "Write output in binary mode");
po.Register("tied-normalization", &tied_normalzation, "The normalization is tied accross all the input dimensions");
po.Read(argc, argv);
......@@ -68,8 +70,19 @@ int main(int argc, char *argv[]) {
for(int32 d=0; d<cmvn_stats.NumCols()-1; d++) {
BaseFloat mean = cmvn_stats(0,d)/count;
BaseFloat var = cmvn_stats(1,d)/count - mean*mean;
shift(d) = -mean;
scale(d) = 1.0 / sqrt(var);
shift(d) = -mean * scale(d);
}
if(tied_normalzation) {
//just average the variances
BaseFloat sum_var = 0.0;
for(int32 i=0; i<scale.Dim(); i++) {
sum_var += 1.0 / (scale(i)*scale(i));
}
BaseFloat mean_var = sum_var / scale.Dim();
BaseFloat tied_scale = 1.0 / sqrt(mean_var);
scale.Set(tied_scale);
}
//we will put the shift and scale to the nnet
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment