提交 7eb58359 authored 作者: Frederic's avatar Frederic

First cumem version

上级 03e77233
...@@ -9,6 +9,9 @@ ...@@ -9,6 +9,9 @@
#include "cuda_ndarray.cuh" #include "cuda_ndarray.cuh"
#include "cumem.h"
#include "cumem.cpp"
//If true, when there is a gpu malloc or free error, we print the size of allocated memory on the device. //If true, when there is a gpu malloc or free error, we print the size of allocated memory on the device.
#define COMPUTE_GPU_MEM_USED 0 #define COMPUTE_GPU_MEM_USED 0
...@@ -67,6 +70,41 @@ void * device_malloc(size_t size) ...@@ -67,6 +70,41 @@ void * device_malloc(size_t size)
return device_malloc(size, VERBOSE_DEVICE_MALLOC); return device_malloc(size, VERBOSE_DEVICE_MALLOC);
} }
///@TODO: thejaswi: link this option to a theano config variable?
// When true, device_malloc/device_free route through the cumem pool
// allocator instead of raw cudaMalloc/cudaFree.
static bool g_use_cumem = true;
// Capacity of the stack-allocated device-descriptor array in initCumem().
static const int g_max_devices = 8;

// Initialize the cumem pool allocator for the visible CUDA devices.
// Idempotent: after the first successful call, later calls return 0
// immediately without re-initializing.
// Returns 0 on success, -1 on failure (diagnostics go to stderr).
int initCumem() {
    static bool cumemInitialized = false;
    if(cumemInitialized) {
        return 0;
    }
    printf("Initializing cumem...\n");
    int numDevices = 0;
    cumemDevice_t devices[g_max_devices];
    if(cudaGetDeviceCount(&numDevices) != cudaSuccess) {
        fprintf(stderr, "initCumem: 'cudaGetDeviceCount' failed! Reason=%s\n",
                cudaGetErrorString(cudaGetLastError()));
        return -1;
    }
    // Clamp to the descriptor array's capacity: the original code indexed
    // devices[i] up to numDevices with no bound check, overflowing the
    // stack buffer on machines with more than g_max_devices GPUs.
    if(numDevices > g_max_devices) {
        fprintf(stderr,
                "initCumem: %d devices found, only initializing the first %d\n",
                numDevices, g_max_devices);
        numDevices = g_max_devices;
    }
    for(int i=0;i<numDevices;++i) {
        devices[i].device = i;
        ///@TODO: thejaswi: support for choosing mem size to be allocated before-hand?
        devices[i].size = 0;  // 0 = let cumem pick the initial pool size
        ///@TODO: thejaswi: add support for multiple streams
        devices[i].numStreams = 0;
        devices[i].streams = NULL;
    }
    ///@TODO: thejaswi: passing custom cumem flags?
    cumemStatus_t status = cumemInit(numDevices, devices, CUMEM_FLAGS_DEFAULT);
    if(status != CUMEM_STATUS_SUCCESS) {
        fprintf(stderr, "initCumem: cumemInit call failed! Reason=%s\n",
                cumemGetErrorString(status));
        return -1;
    }
    cumemInitialized = true;
    return 0;
}
void * device_malloc(size_t size, int verbose) void * device_malloc(size_t size, int verbose)
{ {
#if PRECHECK_ERROR #if PRECHECK_ERROR
...@@ -81,6 +119,16 @@ void * device_malloc(size_t size, int verbose) ...@@ -81,6 +119,16 @@ void * device_malloc(size_t size, int verbose)
} }
#endif #endif
void * rval=NULL; void * rval=NULL;
///@TODO: thejaswi: support for multiple-streams?
if(g_use_cumem) {
cumemStatus_t status = cumemMalloc(&rval, size, NULL);
if(status != CUMEM_STATUS_SUCCESS) {
fprintf(stderr, "device_malloc: cumemMallocAysnc call failed! Reason=%s\n",
cumemGetErrorString(status));
return NULL;
}
}
else {
cudaError_t err = cudaMalloc(&rval, size); cudaError_t err = cudaMalloc(&rval, size);
if (cudaSuccess != err) if (cudaSuccess != err)
{ {
...@@ -118,6 +166,7 @@ void * device_malloc(size_t size, int verbose) ...@@ -118,6 +166,7 @@ void * device_malloc(size_t size, int verbose)
size, cudaGetErrorString(err)); size, cudaGetErrorString(err));
return NULL; return NULL;
} }
}
if (rval != NULL){ if (rval != NULL){
// Can it happen that cudaMalloc return cudaSuccess, but return a NULL ptr? // Can it happen that cudaMalloc return cudaSuccess, but return a NULL ptr?
// Could this be what happen if size is 0? // Could this be what happen if size is 0?
...@@ -202,6 +251,15 @@ int device_free(void *ptr) ...@@ -202,6 +251,15 @@ int device_free(void *ptr)
return 0; return 0;
} }
///@TODO: thejaswi: multi-stream support
if(g_use_cumem) {
cumemStatus_t status = cumemFree(ptr, NULL);
if(status != CUMEM_STATUS_SUCCESS) {
fprintf(stderr, "device_free: cumemFree call failed! Reason=%s\n",
cumemGetErrorString(status));
}
}
else {
// We need sync as the Theano's GC could remove intermediate variable that // We need sync as the Theano's GC could remove intermediate variable that
// are still needed as the gpu kernel are running or in the queue. // are still needed as the gpu kernel are running or in the queue.
CNDA_BEGIN_ALLOW_THREADS CNDA_BEGIN_ALLOW_THREADS
...@@ -259,6 +317,7 @@ int device_free(void *ptr) ...@@ -259,6 +317,7 @@ int device_free(void *ptr)
cudaGetErrorString(err)); cudaGetErrorString(err));
return -1; return -1;
} }
}
_outstanding_mallocs[0] -= (ptr != NULL); _outstanding_mallocs[0] -= (ptr != NULL);
#if COMPUTE_GPU_MEM_USED #if COMPUTE_GPU_MEM_USED
int i=0; int i=0;
...@@ -3096,6 +3155,10 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args) ...@@ -3096,6 +3155,10 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
if (cublas_init() == -1) if (cublas_init() == -1)
return NULL; return NULL;
} }
if(g_use_cumem) {
if(initCumem() == -1)
return NULL;
}
Py_INCREF(Py_None); Py_INCREF(Py_None);
return Py_None; return Py_None;
...@@ -3126,8 +3189,21 @@ PyObject * ...@@ -3126,8 +3189,21 @@ PyObject *
CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) { CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) {
// Don't handle errors here // Don't handle errors here
cublas_shutdown(); cublas_shutdown();
cudaThreadExit();
g_gpu_context_active = 0; // context has now been closed down g_gpu_context_active = 0; // context has now been closed down
if(g_use_cumem) {
printf("Shutting down cumem...\n");
cumemStatus_t status = cumemFinalize();
if(status != CUMEM_STATUS_SUCCESS) {
fprintf(stderr, "CudaNdarray_gpu_shutdown: cumemFinalize failed! Reason=%s\n",
cumemGetErrorString(status));
if(status == CUMEM_STATUS_CUDA_ERROR) {
fprintf(stderr, " Cuda-Reason=%s\n",
cudaGetErrorString(cudaGetLastError()));
}
}
}
cudaThreadExit();
Py_INCREF(Py_None); Py_INCREF(Py_None);
return Py_None; return Py_None;
} }
......
差异被折叠。
差异被折叠。
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论