Commit 130d2ce9 authored by Julien Demouth, committed by Frederic

Add support for CNMeM library.

Update the code to support CNMeM (formerly known as Cumem).
Parent 389c4aba
......@@ -2,6 +2,7 @@ global-include *.txt
global-include *.c
global-include *.cu
global-include *.cuh
global-include *.cpp
global-include *.h
global-include *.sh
global-include *.pkl
......
......@@ -164,7 +164,7 @@ def do_setup():
install_requires=['numpy>=1.6.2', 'scipy>=0.11', 'six>=1.9.0'],
package_data={
'': ['*.txt', '*.rst', '*.cu', '*.cuh', '*.c', '*.sh', '*.pkl',
'*.h', 'ChangeLog'],
'*.h', '*.cpp', 'ChangeLog'],
'theano.misc': ['*.sh']
},
scripts=['bin/theano-cache', 'bin/theano-nose', 'bin/theano-test'],
......
......@@ -54,8 +54,8 @@ AddConfigVar('cublas.lib',
"""Name of the cuda blas library for the linker.""",
StrParam('cublas'))
AddConfigVar('lib.cumem',
"""Do we enable cumem or not.""",
AddConfigVar('lib.cnmem',
"""Do we enable cnmem or not.""",
# We should not mix both allocators, so we can't override
BoolParam(False, allow_override=False),
in_c_key=False)
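A minimal usage sketch (not part of the diff): with the new lib.cnmem flag, the pool allocator is turned on before Theano initializes the GPU, typically via THEANO_FLAGS. Only the flag name and the printed message come from this commit; the rest of the snippet is illustrative.

    import os

    # lib.cnmem is a BoolParam with allow_override=False, so it has to be
    # decided before theano is imported and the GPU is initialized.
    os.environ["THEANO_FLAGS"] = "device=gpu,lib.cnmem=True"

    import theano
    # During initialization, stderr now shows something like:
    #   Using gpu device 0: <device name> (cnmem is enabled)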
......@@ -385,7 +385,7 @@ def use(device,
try:
if (device != 'gpu') and not pycuda_init_dev:
assert isinstance(device, int)
gpu_init(device, config.lib.cumem)
gpu_init(device, config.lib.cnmem)
use.device_number = device
assert active_device_number() == device
else:
......@@ -398,7 +398,7 @@ def use(device,
cuda_ndarray.cuda_ndarray.select_a_gpu()
use.device_number = active_device_number()
# This is needed to initialize the cublas handle.
gpu_init(use.device_number, config.lib.cumem)
gpu_init(use.device_number, config.lib.cnmem)
if test_driver:
import theano.sandbox.cuda.tests.test_driver
......@@ -411,8 +411,9 @@ def use(device,
" this property")
if config.print_active_device:
print("Using gpu device %d: %s" % (
active_device_number(), active_device_name()), file=sys.stderr)
cnmem_enabled = "enabled" if config.lib.cnmem else "disabled"
print("Using gpu device %d: %s (cnmem is %s)" % (
active_device_number(), active_device_name(), cnmem_enabled), file=sys.stderr)
if device_properties(use.device_number)['regsPerBlock'] < 16384:
# We will try to use too many registers per block at many places
# when there is only 8k register per multi-processor.
......
Diff collapsed.
......@@ -9,8 +9,8 @@
#include "cuda_ndarray.cuh"
#include "cumem.h"
#include "cumem.cpp"
#include "cnmem.h"
#include "cnmem.cpp"
//If true, when there is a gpu malloc or free error, we print the size of allocated memory on the device.
#define COMPUTE_GPU_MEM_USED 0
......@@ -71,20 +71,20 @@ void * device_malloc(size_t size)
}
///@TODO: thejaswi: link this option to a theano config variable?
static bool g_use_cumem = false;
static bool g_use_cnmem = false;
static const int g_max_devices = 8;
int initCumem(int card_number_provided, int card_nb) {
static bool cumemInitialized = false;
if(cumemInitialized) {
int initCnmem(int card_number_provided, int card_nb) {
static bool cnmemInitialized = false;
if(cnmemInitialized) {
return 0;
}
// On stderr to be at the same place as "Using gpu device..."
fprintf(stderr, "Initializing cumem...\n");
fprintf(stderr, "Initializing cnmem...\n");
int numDevices = 0;
cumemDevice_t devices[g_max_devices];
cnmemDevice_t devices[g_max_devices];
if(cudaGetDeviceCount(&numDevices) != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError,
"initCumem: 'cudaGetDeviceCount' failed! Reason=%s\n",
"initCnmem: 'cudaGetDeviceCount' failed! Reason=%s\n",
cudaGetErrorString(cudaGetLastError()));
return -1;
}
......@@ -97,7 +97,6 @@ int initCumem(int card_number_provided, int card_nb) {
///@TODO: thejaswi: add support for multiple streams
devices[i].numStreams = 0;
devices[i].streams = NULL;
devices[i].granularity = 0;
}else{
for(int i=0;i<numDevices;++i) {
......@@ -107,19 +106,18 @@ int initCumem(int card_number_provided, int card_nb) {
///@TODO: thejaswi: add support for multiple streams
devices[i].numStreams = 0;
devices[i].streams = NULL;
devices[i].granularity = 0;
}
}
///@TODO: thejaswi: passing custom cumem flags?
cumemStatus_t status = cumemInit(numDevices, devices, CUMEM_FLAGS_DEFAULT);
if(status != CUMEM_STATUS_SUCCESS) {
///@TODO: thejaswi: passing custom cnmem flags?
cnmemStatus_t status = cnmemInit(numDevices, devices, CNMEM_FLAGS_DEFAULT);
if(status != CNMEM_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"initCumem: cumemInit call failed! Reason=%s. numdev=%d\n",
cumemGetErrorString(status), numDevices);
"initCnmem: cnmemInit call failed! Reason=%s. numdev=%d\n",
cnmemGetErrorString(status), numDevices);
return -1;
}
cumemInitialized = true;
cnmemInitialized = true;
return 0;
}
......@@ -138,12 +136,15 @@ void * device_malloc(size_t size, int verbose)
#endif
void * rval=NULL;
///@TODO: thejaswi: support for multiple-streams?
if(g_use_cumem) {
cumemStatus_t status = cumemMalloc(&rval, size, NULL);
if(status != CUMEM_STATUS_SUCCESS) {
if(g_use_cnmem) {
cnmemStatus_t status = CNMEM_STATUS_SUCCESS;
if( size != 0 ) {
status = cnmemMalloc(&rval, size, NULL);
}
if(status != CNMEM_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError,
"Error allocating %zd bytes of device memory (%s).",
size, cumemGetErrorString(status));
size, cnmemGetErrorString(status));
return NULL;
}
}
......@@ -271,11 +272,11 @@ int device_free(void *ptr)
}
///@TODO: thejaswi: multi-stream support
if(g_use_cumem) {
cumemStatus_t status = cumemFree(ptr, NULL);
if(status != CUMEM_STATUS_SUCCESS) {
fprintf(stderr, "device_free: cumemFree call failed! Reason=%s\n",
cumemGetErrorString(status));
if(g_use_cnmem) {
cnmemStatus_t status = cnmemFree(ptr, NULL);
if(status != CNMEM_STATUS_SUCCESS) {
fprintf(stderr, "device_free: cnmemFree call failed! Reason=%s\n",
cnmemGetErrorString(status));
}
}
else {
......@@ -3134,22 +3135,22 @@ CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
static int cublas_init();
static void cublas_shutdown();
// Initialize the gpu.
// Takes two optional parameters, the device number and if we should use cumem.
// Takes two optional parameters, the device number and if we should use cnmem.
// If the device number is provided, it sets that device to be the active device.
// If not provided (usually just to test whether the gpu is available at all),
// it does not set an active device.
// Raises EnvironmentError or ValueError (as appropriate) if the initialization failed.
// cumem is treated like a bool. If converted to 0, don't use cumem. Otherwise, use it.
// cnmem is treated like a bool. If converted to 0, don't use cnmem. Otherwise, use it.
PyObject *
CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
{
int card_nb = 0;
int card_number_provided = 1;
int cumem = 0; // 0 False, 1 True
int cnmem = 0; // 0 False, 1 True
// if we're given something wildly invalid, this will throw a TypeError
PyArg_ParseTuple(args, "|ii", &card_nb, &cumem);
if(cumem)
g_use_cumem = true;
PyArg_ParseTuple(args, "|ii", &card_nb, &cnmem);
if(cnmem)
g_use_cnmem = true;
if(PyTuple_Size(args) == 0) {
card_number_provided = 0;
......@@ -3204,8 +3205,8 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
if (cublas_init() == -1)
return NULL;
}
if(card_number_provided && g_use_cumem) {
if(initCumem(card_number_provided, card_nb) == -1){
if(card_number_provided && g_use_cnmem) {
if(initCnmem(card_number_provided, card_nb) == -1){
return NULL;
}
}
......@@ -3240,13 +3241,13 @@ CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) {
// Don't handle errors here
cublas_shutdown();
g_gpu_context_active = 0; // context has now been closed down
if(g_use_cumem) {
fprintf(stderr, "Shutting down cumem...\n");
cumemStatus_t status = cumemFinalize();
if(status != CUMEM_STATUS_SUCCESS) {
fprintf(stderr, "CudaNdarray_gpu_shutdown: cumemFinalize failed! Reason=%s\n",
cumemGetErrorString(status));
if(status == CUMEM_STATUS_CUDA_ERROR) {
if(g_use_cnmem) {
fprintf(stderr, "Shutting down cnmem...\n");
cnmemStatus_t status = cnmemFinalize();
if(status != CNMEM_STATUS_SUCCESS && status != CNMEM_STATUS_MEMORY_LEAK) {
fprintf(stderr, "CudaNdarray_gpu_shutdown: cnmemFinalize failed! Reason=%s\n",
cnmemGetErrorString(status));
if(status == CNMEM_STATUS_CUDA_ERROR) {
fprintf(stderr, " Cuda-Reason=%s\n",
cudaGetErrorString(cudaGetLastError()));
}
......
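A hedged sketch of how the Python layer drives this entry point: CudaNdarray_gpu_init parses "|ii", so the card number and the cnmem flag are both optional ints, and any non-zero second value sets g_use_cnmem. Normally theano.sandbox.cuda.use() makes this call (see the __init__.py hunks above); the attribute path below is an assumption based on gpu_init being a module-level name in that file.

    # Hedged sketch only; use() normally performs this call for you.
    import theano.sandbox.cuda as cuda

    # gpu_init maps to CudaNdarray_gpu_init above. The second argument is
    # treated as a bool: non-zero means allocate through CNMeM from now on.
    cuda.gpu_init(0, 1)  # card 0, cnmem enabled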
Diff collapsed.