Merge pull request #1966 from abergeron/cublas_v2

Switch to cublas v2

Merge pull request #1966 from abergeron/cublas_v2
0f07b4a5 · Frédéric Bastien · a9553206 · bcf62111 · 0f07b4a5 · 0f07b4a5
--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
@@ -41,6 +41,8 @@
 #define CNDA_END_ALLOW_THREADS
 #endif
+cublasHandle_t handle;
 /////////////////////////
 // Alloc and Free
 /////////////////////////
@@ -537,14 +539,15 @@ PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self, PyObject *args)
    npy_intp rval_size = PyArray_SIZE(rval);
    void *rval_data = PyArray_DATA(rval);
+    cublasStatus_t err;
    CNDA_BEGIN_ALLOW_THREADS
-    cublasGetVector(rval_size, sizeof(real),
+    err = cublasGetVector(rval_size, sizeof(real),
                          contiguous_self->devdata, 1,
                          rval_data, 1);
    //CNDA_THREAD_SYNC;  // unneeded because cublasGetVector is blocking anyway
    CNDA_END_ALLOW_THREADS
-    if (CUBLAS_STATUS_SUCCESS != cublasGetError())
+    if (CUBLAS_STATUS_SUCCESS != err)
    {
        PyErr_SetString(PyExc_RuntimeError, "error copying data to host");
        Py_DECREF(rval);
@@ -3007,7 +3010,7 @@ CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
                            "CudaNdarray_ptr_int_size: Can't allocate memory on the gpu.");
    }
    get_gpu_ptr_size<<<1,1>>>(gpu_data);
-    if (cudaSuccess != cublasGetError()){
+    if (cudaSuccess != cudaGetLastError()){
        device_free(gpu_data);
        return PyErr_Format(PyExc_RuntimeError,
@@ -3016,16 +3019,19 @@ CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
    // Transfer the result to cpu
    int gpu_sizes[] = {-1,-1};
-    cublasGetVector(2, sizeof(int), gpu_data, 1, gpu_sizes, 1);
+    cublasStatus_t err;
+    err = cublasGetVector(2, sizeof(int), gpu_data, 1, gpu_sizes, 1);
    device_free(gpu_data);
-    if (CUBLAS_STATUS_SUCCESS != cublasGetError()){
+    if (CUBLAS_STATUS_SUCCESS != err){
        PyErr_SetString(PyExc_RuntimeError, "error copying data to from memory");
        return NULL;
    }
    return Py_BuildValue("iiii", gpu_sizes[0], sizeof(float*), sizeof(int), gpu_sizes[1]);
 }
+static int cublas_init();
+static int cublas_shutdown();
 // Initialize the gpu.
 // Takes one optional parameter, the device number.
 // If provided, it sets that device to be the active device.
@@ -3092,6 +3098,11 @@ CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
        }
    }
+    // Initialize cublas
+    if (handle != NULL)
+        cublas_shutdown();
+    cublas_init();
    Py_INCREF(Py_None);
    return Py_None;
 }
@@ -3535,22 +3546,28 @@ CudaNdarray_New(int nd)
 //
 //////////////////////////////
-int
+static int
 cublas_init()
 {
-    cublasInit();
+    if (CUBLAS_STATUS_SUCCESS != cublasCreate(&handle))
-    if (CUBLAS_STATUS_SUCCESS != cublasGetError())
    {
        PyErr_SetString(PyExc_RuntimeError, "error initializing device");
        return -1;
    }
+    // Set the default stream as the one to execute on (default)
+    cublasSetStream(handle, NULL);
+    // Pointer to scalars are on the host (also default)
+    cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
+    // atomics can be used in kernels to speed up operations (not default)
+    // This may lead to a slight variance from run to run in some operations
+    cublasSetAtomicsMode(handle, CUBLAS_ATOMICS_ALLOWED);
    return 0;
 }
-int
+static int
 cublas_shutdown()
 {
-    cublasShutdown();
+    if (CUBLAS_STATUS_SUCCESS != cublasDestroy(handle))
-    if (CUBLAS_STATUS_SUCCESS != cublasGetError())
    {
        PyErr_SetString(PyExc_RuntimeError, "error shutting down device");
        return -1;
@@ -3579,14 +3596,15 @@ CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj)
    }
    npy_intp py_src_size = PyArray_SIZE(py_src);
    void *py_src_data = PyArray_DATA(py_src);
+    cublasStatus_t cerr;
    CNDA_BEGIN_ALLOW_THREADS
-    cublasSetVector(py_src_size,
+    cerr = cublasSetVector(py_src_size,
                           sizeof(real),
                           py_src_data, 1,
                           self->devdata, 1);
    //CNDA_THREAD_SYNC;  // unneeded because cublasSetVector is blocking anyway
    CNDA_END_ALLOW_THREADS
-    if (CUBLAS_STATUS_SUCCESS != cublasGetError())
+    if (CUBLAS_STATUS_SUCCESS != cerr)
    {
        PyErr_SetString(PyExc_RuntimeError, "error copying data to device memory");
        Py_DECREF(py_src);
@@ -3750,11 +3768,12 @@ int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self,
        if (verbose)
            fprintf(stderr, "Copying contiguous vector with cublasScopy\n");
-        cublasScopy(size, CudaNdarray_DEV_DATA(other), 1,
+        cublasStatus_t err;
+        err = cublasScopy(handle, size, CudaNdarray_DEV_DATA(other), 1,
                          CudaNdarray_DEV_DATA(self), 1);
        CNDA_THREAD_SYNC;
        Py_XDECREF(new_other);
-        if (CUBLAS_STATUS_SUCCESS != cublasGetError())
+        if (CUBLAS_STATUS_SUCCESS != err)
        {
            PyErr_SetString(PyExc_RuntimeError, "Error copying memory");
            return -1;
@@ -3920,22 +3939,6 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
        return -1;
    }
-#if PRECHECK_ERROR
-    cublasStatus prevErr = cublasGetError();
-    if (CUBLAS_STATUS_SUCCESS != prevErr)
-    {
-        //I don't know why, but I need to remove the cuda error too.
-        //Otherwise, the clean up before raising the Python error cause error too!
-        //So we don't see this python error.
-        fprintf(stderr,
-                "CudaNdarray_sgemm: Prev cublas error %s",
-                cublasGetErrorString(prevErr));
-        PyErr_Format(PyExc_RuntimeError,
-                     "CudaNdarray_sgemm: Prev cublas error %s",
-                     cublasGetErrorString(prevErr));
-        return -1;
-    }
-#endif
    // We must allow dimensions to be zeros.
    if ((CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(B)[0])
            || (CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(C)[0])
@@ -4055,8 +4058,8 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
    float* a = CudaNdarray_DEV_DATA(A);
    float* b = CudaNdarray_DEV_DATA(B);
    float* c = CudaNdarray_DEV_DATA(C);
-    char N = 'N';
+    cublasOperation_t N = CUBLAS_OP_N;
-    char T = 'T';
+    cublasOperation_t T = CUBLAS_OP_T;
    //std::cerr << (unit/256) MOD 16 << (unit / 16) MOD 16 << unit MOD 16<< '\\n';
    // There should be no negative stride at that point
 #define CHK_STRIDE_SGEMM(T0, T1, D0, D1, D2, a, x, sx, y, sy, b, z, sz) \
@@ -4064,7 +4067,7 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
    if (sy == 0){sy = 1;}\
    if (sz == 0){sz = 1;}\
    if ((sx > 0) && (sy > 0) && (sz > 0)) { \
-        cublasSgemm(T0, T1, D0, D1, D2, a, x, sx, y, sy, b, z, sz); \
+        err = cublasSgemm(handle, T0, T1, D0, D1, D2, &a, x, sx, y, sy, &b, z, sz); \
    } else { \
        PyErr_SetString(PyExc_AssertionError, "negative stride to sGemm");\
        Py_XDECREF(A_new);\
@@ -4072,6 +4075,7 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
        return -1; \
    }
+    cublasStatus_t err;
    switch(unit)
    {
        case 0x000: CHK_STRIDE_SGEMM(N, N, CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(A)[1], alpha, b, sb_0, a, sa_0, beta, c, sc_0); break;
@@ -4089,12 +4093,11 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
    Py_XDECREF(A_new);
    Py_XDECREF(B_new);
-    cublasStatus err = cublasGetError();
    if (CUBLAS_STATUS_SUCCESS != err)
    {
        PyErr_Format(PyExc_RuntimeError,
                     "cublasSgemm failed (%i) %s\n"
-                     " unit=%h N=%d, c.dims=[%d %d], a.dim=[%d %d], alpha=%f, beta=%f, a=%f, b=%f, c=%f"
+                     " unit=%h N=%d, c.dims=[%d %d], a.dim=[%d %d], alpha=%f, beta=%f, a=%p, b=%p, c=%p"
                     " sa_0=%d, sa_1=%d, sb_0=%d, sb_1=%d, sc_0=%d, sc_1=%d",
                     err,  cublasGetErrorString(err),
                     unit, N, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1],
@@ -4187,29 +4190,31 @@ int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B,
    if (sa_1 == 0)
        sa_1 = 1;
+    // This is important because we can end up not calling Sgemv at all
+    cublasStatus_t err = CUBLAS_STATUS_SUCCESS;
    if (CudaNdarray_SIZE(C)) {
        if ((CudaNdarray_HOST_DIMS(A)[0] <= 1)
            || ((CudaNdarray_HOST_STRIDES(A)[0] == 1)
                && (CudaNdarray_HOST_STRIDES(A)[1] > 0)))
        {
-            cublasSgemv('N',
+            err = cublasSgemv(handle, CUBLAS_OP_N,
                    CudaNdarray_HOST_DIMS(A)[0], CudaNdarray_HOST_DIMS(A)[1],
-                    alpha,
+                    &alpha,
                    CudaNdarray_DEV_DATA(A), sa_1,
                    CudaNdarray_DEV_DATA(B), sb_0,
-                    beta,
+                    &beta,
                    CudaNdarray_DEV_DATA(C), sc_0);
        }
        else if ((CudaNdarray_HOST_DIMS(A)[1] <= 1)
                || ((CudaNdarray_HOST_STRIDES(A)[1] == 1)
                    && (CudaNdarray_HOST_STRIDES(A)[0] > 0)))
        {
-            cublasSgemv('T',
+            err = cublasSgemv(handle, CUBLAS_OP_T,
                    CudaNdarray_HOST_DIMS(A)[1], CudaNdarray_HOST_DIMS(A)[0],
-                    alpha,
+                    &alpha,
                    CudaNdarray_DEV_DATA(A), sa_0,
                    CudaNdarray_DEV_DATA(B), sb_0,
-                    beta,
+                    &beta,
                    CudaNdarray_DEV_DATA(C), sc_0);
        }
        else
@@ -4235,7 +4240,6 @@ int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B,
    Py_XDECREF(A_new);
    Py_XDECREF(B_new);
-    cublasStatus err = cublasGetError();
    if (CUBLAS_STATUS_SUCCESS != err)
    {
        PyErr_Format(PyExc_RuntimeError,
@@ -4303,13 +4307,15 @@ int CudaNdarray_sger(float alpha, const CudaNdarray * x, const CudaNdarray * y,
    int sa_1 = (CudaNdarray_HOST_DIMS(A)[1] > 1) ? CudaNdarray_HOST_STRIDES(A)[1]
                                                 : CudaNdarray_HOST_DIMS(A)[0];
+    // This is important because we can end up not calling Sger at all
+    cublasStatus_t err = CUBLAS_STATUS_SUCCESS;
    if(CudaNdarray_SIZE(A)){
        // If A is in col-major
        if ((CudaNdarray_HOST_DIMS(A)[0] <= 1)
            || ((CudaNdarray_HOST_STRIDES(A)[0] == 1)
                && (CudaNdarray_HOST_STRIDES(A)[1] > 0)))
        {
-            cublasSger(CudaNdarray_HOST_DIMS(x)[0], CudaNdarray_HOST_DIMS(y)[0], alpha,
+            err = cublasSger(handle, CudaNdarray_HOST_DIMS(x)[0], CudaNdarray_HOST_DIMS(y)[0], &alpha,
                       CudaNdarray_DEV_DATA(x), x_strides,
                       CudaNdarray_DEV_DATA(y), y_strides,
                       CudaNdarray_DEV_DATA(A), sa_1);
@@ -4319,7 +4325,7 @@ int CudaNdarray_sger(float alpha, const CudaNdarray * x, const CudaNdarray * y,
                || ((CudaNdarray_HOST_STRIDES(A)[1] == 1)
                    && (CudaNdarray_HOST_STRIDES(A)[0] > 0)))
        {
-            cublasSger(CudaNdarray_HOST_DIMS(y)[0], CudaNdarray_HOST_DIMS(x)[0], alpha,
+            err = cublasSger(handle, CudaNdarray_HOST_DIMS(y)[0], CudaNdarray_HOST_DIMS(x)[0], &alpha,
                       CudaNdarray_DEV_DATA(y), y_strides,
                       CudaNdarray_DEV_DATA(x), x_strides,
                       CudaNdarray_DEV_DATA(A), sa_0);
@@ -4338,7 +4344,6 @@ int CudaNdarray_sger(float alpha, const CudaNdarray * x, const CudaNdarray * y,
    Py_XDECREF(x_new);
    Py_XDECREF(y_new);
-    cublasStatus err = cublasGetError();
    if (CUBLAS_STATUS_SUCCESS != err)
    {
        PyErr_Format(PyExc_RuntimeError,
@@ -4973,14 +4978,12 @@ cnda_copy_structure_to_device(const CudaNdarray * self)
            }
        }
    }
-    cublasSetVector(cnda_structure_size(self->nd),
+    if (cublasSetVector(cnda_structure_size(self->nd),
                        sizeof(int),
                        self->host_structure,
                        1,
                        self->dev_structure,
-                    1);
+                        1) != CUBLAS_STATUS_SUCCESS)
-    //CNDA_THREAD_SYNC;  // unneeded because cublasSetVector is blocking anyway
-    if (CUBLAS_STATUS_SUCCESS != cublasGetError())
    {
        PyErr_SetString(PyExc_RuntimeError, "error copying structure to device memory");
        return -1;

--- a/theano/sandbox/cuda/cuda_ndarray.cuh
+++ b/theano/sandbox/cuda/cuda_ndarray.cuh
@@ -40,7 +40,7 @@
 #endif
-#include <cublas.h>
+#include <cublas_v2.h>
 #ifdef _WIN32
 #ifdef _CUDA_NDARRAY_C
@@ -81,6 +81,9 @@ typedef float real;
 #define VERBOSE_DEVICE_MALLOC 1
 #define NO_VERBOSE_DEVICE_MALLOC 0
+/* Use this handle to make cublas calls */
+extern cublasHandle_t handle;
 /**
 * Allocation and freeing of device memory should go through these functions so that the lib can track memory usage.
 *
@@ -365,8 +368,8 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd,
            //Detect overflow on unsigned integer
            if (dim[i] != 0 && size > (SIZE_MAX / dim[i])) {
                PyErr_Format(PyExc_AssertionError,
-                             "Can't store in size_t for the bytes requested",
+                             "Can't store in size_t for the bytes requested %llu",
-                             size);
+                             (unsigned long long)size);
                return -1;
            }
            size = size * dim[i];
@@ -382,8 +385,8 @@ static int CudaNdarray_alloc_contiguous(CudaNdarray *self, const int nd,
            //Detect overflow on unsigned integer
            if (dim[i] != 0 && size > (SIZE_MAX / dim[i])) {
                PyErr_Format(PyExc_AssertionError,
-                             "Can't store in size_t for the bytes requested",
+                             "Can't store in size_t for the bytes requested %llu",
-                             size);
+                             (unsigned long long)size);
                return -1;
            }
            size = size * dim[i];
@@ -583,23 +586,33 @@ DllExport int CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_othe
 DllExport int CudaNdarray_prep_output(CudaNdarray ** arr, int nd,
                                      const int * dims, int fortran = 0);
-DllExport inline const char* ALWAYS_INLINE cublasGetErrorString(cublasStatus err){
+DllExport inline const char* ALWAYS_INLINE cublasGetErrorString(cublasStatus_t err){
-    if(CUBLAS_STATUS_SUCCESS == err)
+    switch(err) {
+    case CUBLAS_STATUS_SUCCESS:
        return "success";
-    else if(CUBLAS_STATUS_NOT_INITIALIZED == err)
+    case CUBLAS_STATUS_NOT_INITIALIZED:
        return "the library was not initialized";
-    else if(CUBLAS_STATUS_ALLOC_FAILED == err)
+    case CUBLAS_STATUS_ALLOC_FAILED:
        return "the resource allocation failed";
-    else if(CUBLAS_STATUS_INVALID_VALUE == err)
+    case CUBLAS_STATUS_INVALID_VALUE:
        return "the parameters n<0 or incx,incy=0";
-    else if(CUBLAS_STATUS_MAPPING_ERROR == err)
+#ifdef CUBLAS_STATUS_ARCH_MISMATCH
+    case CUBLAS_STATUS_ARCH_MISMATCH:
+        return "required device feature not present";
+#endif
+    case CUBLAS_STATUS_MAPPING_ERROR:
        return "an access to GPU memory space failed";
-    else if(CUBLAS_STATUS_EXECUTION_FAILED == err)
+    case CUBLAS_STATUS_EXECUTION_FAILED:
        return "the function failed to launch on the GPU";
-    else if(CUBLAS_STATUS_INTERNAL_ERROR == err)
+    case CUBLAS_STATUS_INTERNAL_ERROR:
        return "an internal operation failed";
-    else
+#ifdef CUBLAS_STATUS_NOT_SUPPORTED
+    case CUBLAS_STATUS_NOT_SUPPORTED:
+        return "unsupported function";
+#endif
+    default:
        return "unknow code";
+    }
 }
 #endif