cuda - added new device_malloc and device_free wrappers that count device allocations

163deb4b · James Bergstra · 93c8954c · 163deb4b · 163deb4b
--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
@@ -6,6 +6,45 @@

 #include "cuda_ndarray.cuh"

+/////////////////////////
+// Alloc and Free
+/////////////////////////
+
+/**
+ *
+ * In the test program I'm using, the _outstanding_mallocs decreases with every call.
+ * This suggests there are more free() calls being made than alloc(), but I can't figure out why.
+ *
+ */
+int _outstanding_mallocs = 0;
+void * device_malloc(size_t size)
+{
+    void * rval=NULL;
+    if (cudaSuccess != cudaMalloc(&rval, size))
+    {
+        fprintf(stderr, "Error allocating %i bytes of device memory.\n", size);
+        PyErr_Format(PyExc_MemoryError, "error allocating %i bytes of device memory", size);
+        return NULL;
+    }
+    _outstanding_mallocs += (rval != NULL);
+    return rval;
+}
+int device_free(void *ptr)
+{
+    if (cudaSuccess != cudaFree(ptr))
+    {
+        PyErr_Format(PyExc_MemoryError, "error freeing device pointer %p", ptr);
+        return -1;
+    }
+    _outstanding_mallocs -= (ptr != NULL);
+    return 0;
+}
+static PyObject *
+outstanding_mallocs(PyObject* self, PyObject * args)
+{
+    return PyInt_FromLong(_outstanding_mallocs);
+}
+
 /////////////////////////
 // Static helper methods
 /////////////////////////
@@ -28,8 +67,7 @@ CudaNdarray_uninit(CudaNdarray*self)
    int rval = 0;
    if (self->data_allocated) {
        assert(self->devdata);
-        cublasFree(self->devdata);
-        if (CUBLAS_STATUS_SUCCESS != cublasGetError())
+        if (device_free(self->devdata))
        {
            std::cerr << "!!!! error freeing device memory\n";
            rval = -1;
@@ -39,8 +77,7 @@ CudaNdarray_uninit(CudaNdarray*self)
    }
    if (self->dev_structure)
    {
-        cublasFree(self->dev_structure);
-        if (CUBLAS_STATUS_SUCCESS != cublasGetError())
+        if (device_free(self->dev_structure))
        {
            std::cerr << "!!!! error freeing device memory\n";
            rval = -1;
@@ -269,6 +306,8 @@ PyObject * CudaNdarray_ReduceSum(CudaNdarray * self, PyObject * py_reduce_mask)
    {
        return NULL;
    }
+    //TODO: allocate a fixed size dimshuffle_pattern_cache on the stack,
+    //      and use it if it is big enough.
    int * dimshuffle_pattern = (int*)malloc(len * 2 * sizeof(int));
    int * sum_dims = dimshuffle_pattern + len;
    int n_remaining_dims = 0;
@@ -1453,7 +1492,7 @@ filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, s
                return NULL;
            }
        }
-        if (CudaNdarray_Check(storage))
+        if (storage && CudaNdarray_Check(storage))
        {
            rval = (CudaNdarray*) storage;
            Py_INCREF(rval);
@@ -1462,14 +1501,17 @@ filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, s
        {
            rval = (CudaNdarray*) CudaNdarray_new_null();
        }
-        if (CudaNdarray_CopyFromArray(rval, data))
+        if (rval)
        {
-            Py_DECREF(rval);
-            rval = NULL;
+            if (CudaNdarray_CopyFromArray(rval, data))
+            {
+                Py_DECREF(rval);
+                rval = NULL;
+            }
+            Py_DECREF(data);
+            Py_DECREF(py_data);
+            Py_DECREF(broadcastable);
        }
-        Py_DECREF(data);
-        Py_DECREF(py_data);
-        Py_DECREF(broadcastable);
        return (PyObject*)rval;
    }
 }
@@ -1478,6 +1520,7 @@ static PyMethodDef module_methods[] = {
    {"dot", CudaNdarray_Dot, METH_VARARGS, "Returns the matrix product of two CudaNdarray arguments."},
    {"gpu_init", CudaNdarray_gpu_init, METH_VARARGS, "Allow to select the gpu card to use."},
    {"filter", filter, METH_VARARGS, "filter(obj, broadcastable, strict, storage) returns a CudaNdarray initialized to obj if it matches the constraints of broadcastable.  strict=True prevents any numeric casting. If storage is a CudaNdarray it may be overwritten and used as the return value."},    
+    {"outstanding_mallocs", outstanding_mallocs, METH_VARARGS, "how many more mallocs have been called than free's"},
    {NULL, NULL, NULL, NULL}  /* Sentinel */
 };

@@ -1670,10 +1713,8 @@ int CudaNdarray_set_device_data(CudaNdarray * self, float * data, CudaNdarray *
    if (self->data_allocated)
    {
        assert(self->devdata);
-        cublasFree(self->devdata);
-        if (CUBLAS_STATUS_SUCCESS != cublasGetError())
+        if (device_free(self->devdata))
        {
-            PyErr_SetString(PyExc_MemoryError, "error freeing device memory");
            self->devdata = NULL;
            self->data_allocated = 0;
            return -1;

--- a/theano/sandbox/cuda/cuda_ndarray.cuh
+++ b/theano/sandbox/cuda/cuda_ndarray.cuh
@@ -30,6 +30,15 @@ typedef float real;
 #define SHARED_SIZE (16*1024)
 #endif

+/**
+ * Allocation and freeing of device memory should go through these functions so that the lib can track memory usage.
+ *
+ * device_malloc will set the Python error message before returning None.
+ * device_free will return nonzero on failure (after setting the python error message)
+ */
+void * device_malloc(size_t size);
+int device_free(void * ptr);
+
 template <typename T>
 static T ceil_intdiv(T a, T b)
 {
@@ -248,10 +257,8 @@ int CudaNdarray_set_nd(CudaNdarray * self, const int nd)
    {
        if (self->dev_structure)
        {
-            cublasFree(self->dev_structure);
-            if (CUBLAS_STATUS_SUCCESS != cublasGetError())
+            if (device_free(self->dev_structure))
            {
-                PyErr_SetString(PyExc_MemoryError, "error freeing device memory");
                return -1;
            }
            self->dev_structure = NULL;
@@ -272,14 +279,17 @@ int CudaNdarray_set_nd(CudaNdarray * self, const int nd)
            PyErr_SetString(PyExc_MemoryError, "Failed to allocate dim or str");
            return -1;
        }
-        cublasAlloc(cnda_structure_size(nd), sizeof(int), (void**)&self->dev_structure);
-        if (CUBLAS_STATUS_SUCCESS != cublasGetError())
+        int struct_size = cnda_structure_size(nd);
+        if (struct_size)
        {
-            PyErr_SetString(PyExc_MemoryError, "error allocating device memory");
-            free(self->host_structure);
-            self->host_structure = NULL;
-            self->dev_structure = NULL;
-            return -1;
+            self->dev_structure = (int*)device_malloc(struct_size* sizeof(int));
+            if (NULL == self->dev_structure)
+            {
+                free(self->host_structure);
+                self->host_structure = NULL;
+                self->dev_structure = NULL;
+                return -1;
+            }
        }
        self->nd = nd;
        self->dev_structure_fresh = 0;
@@ -317,20 +327,15 @@ int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd, const inttype

    if (self->data_allocated != size)
    {
-        //std::cerr << "resizing from  " << self->data_allocated << " to " << size << '\n';
-        cublasFree(self->devdata);
-        if (CUBLAS_STATUS_SUCCESS != cublasGetError())
-        {// Does this ever happen??  Do we need to set data_allocated or devdata to 0?
-            PyErr_SetString(PyExc_MemoryError, "error freeing device memory");
+        if (device_free(self->devdata))
+        {
+            // Does this ever happen??  Do we need to set data_allocated or devdata to 0?
            return -1;
        }
        assert(size>0);
-        cublasAlloc(size, sizeof(real), (void**)&(self->devdata));
-        //std::cerr << "cublasAlloc returned " << self->devdata << "\n";
-        //We must do both checks as the first one is not enough in some cases!
-        if (CUBLAS_STATUS_SUCCESS != cublasGetError() || !self->devdata)
+        self->devdata = (float*)device_malloc(size*sizeof(real));
+        if (!self->devdata)
        {
-            PyErr_Format(PyExc_MemoryError, "error allocating %i bytes device memory",size);
            CudaNdarray_set_nd(self,-1);
            self->data_allocated = 0;
            self->devdata = 0;