提交 bb533b5e authored 作者: abergeron

Merge pull request #3219 from nouiz/2g

cuda bugfix with array over 2g
......@@ -625,17 +625,20 @@ PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self, PyObject *args)
npy_intp rval_size = PyArray_SIZE(rval);
void *rval_data = PyArray_DATA(rval);
cublasStatus_t err;
CNDA_BEGIN_ALLOW_THREADS
err = cublasGetVector(rval_size, sizeof(real),
contiguous_self->devdata, 1,
rval_data, 1);
//CNDA_THREAD_SYNC; // unneeded because cublasGetVector is blocking anyway
CNDA_END_ALLOW_THREADS
cudaError_t err;
CNDA_BEGIN_ALLOW_THREADS;
if (CUBLAS_STATUS_SUCCESS != err)
err = cudaMemcpy(rval_data, contiguous_self->devdata,
rval_size * sizeof(real),
cudaMemcpyDeviceToHost
);
//CNDA_THREAD_SYNC; // unneeded because cudaMemcpy is blocking anyway
CNDA_END_ALLOW_THREADS;
if (cudaSuccess != err)
{
PyErr_SetString(PyExc_RuntimeError, "error copying data to host");
PyErr_Format(PyExc_RuntimeError, "error (%s)copying data to host",
cudaGetErrorString(err));
Py_DECREF(rval);
rval = NULL;
}
......@@ -3754,20 +3757,19 @@ CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj)
}
npy_intp py_src_size = PyArray_SIZE(py_src);
void *py_src_data = PyArray_DATA(py_src);
cublasStatus_t cerr;
CNDA_BEGIN_ALLOW_THREADS
cerr = cublasSetVector(py_src_size,
sizeof(real),
py_src_data, 1,
self->devdata, 1);
//CNDA_THREAD_SYNC; // unneeded because cublasSetVector is blocking anyway
CNDA_END_ALLOW_THREADS
if (CUBLAS_STATUS_SUCCESS != cerr)
cudaError_t cerr;
CNDA_BEGIN_ALLOW_THREADS;
cerr = cudaMemcpy(self->devdata, py_src_data,
py_src_size * sizeof(real),
cudaMemcpyHostToDevice);
//CNDA_THREAD_SYNC; // unneeded because cudaMemcpy is blocking anyway
CNDA_END_ALLOW_THREADS;
if (cudaSuccess != cerr)
{
PyErr_Format(PyExc_RuntimeError,
"CUBLAS error '%s' while copying %lli data element"
"Cuda error '%s' while copying %lli data element"
" to device memory",
cublasGetErrorString(cerr),
cudaGetErrorString(cerr),
(long long)py_src_size);
Py_DECREF(py_src);
return -1;
......
......@@ -813,10 +813,10 @@ class Subtensor(Op):
assert (slicelength <= length);
xview_offset += %(c_prefix)s_STRIDES(%(x)s)[outer_ii] * start *
%(strides_mul)s;
xview_offset += (npy_intp)%(c_prefix)s_STRIDES(%(x)s)[outer_ii]
* start * %(strides_mul)s;
xview_dims[inner_ii] = slicelength;
xview_strides[inner_ii] = %(c_prefix)s_STRIDES(%(x)s)[outer_ii] * step;
xview_strides[inner_ii] = (npy_intp)%(c_prefix)s_STRIDES(%(x)s)[outer_ii] * step;
inner_ii += 1;
spec_pos += 3;
......@@ -829,7 +829,7 @@ class Subtensor(Op):
{
if (idx < %(c_prefix)s_DIMS(%(x)s)[outer_ii])
{
xview_offset += %(c_prefix)s_STRIDES(%(x)s)[outer_ii] * idx *
xview_offset += (npy_intp)%(c_prefix)s_STRIDES(%(x)s)[outer_ii] * idx *
%(strides_mul)s;
}
else
......@@ -863,7 +863,7 @@ class Subtensor(Op):
@staticmethod
def helper_c_code_cache_version():
return (8,)
return (9,)
def c_code(self, node, name, inputs, outputs, sub): # DEBUG
if not isinstance(node.inputs[0].type, theano.tensor.TensorType):
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论