Merge pull request #3169 from nouiz/GpuAdvancedIncSubtensor1_dev20

[WIP, BUG, CRASH] Fix GpuAdvancedIncSubtensor1_dev20 with negative index

Merge pull request #3169 from nouiz/GpuAdvancedIncSubtensor1_dev20
f40bf432 · Frédéric Bastien · cd2272fe · 1f5f607b · f40bf432 · f40bf432
--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -2974,7 +2974,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
        return Apply(self, [x_, y_, ilist_], [x_.type()])
    def c_code_cache_version(self):
-        return (3,)
+        return (6,)
    def c_code(self, node, name, inputs, outputs, sub):
        active_device_no = theano.sandbox.cuda.active_device_number()
@@ -3023,15 +3023,23 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
                                          int stridesY1,
                                          float *Y ,
                                          long *d_indices_arr,
-                                          int num)
+                                          int num,
+                                          int* err)
        {
             for (int i = (blockIdx.x); i < num; i += gridDim.x)
             {
                  for(int j = (threadIdx.x); j < numColsX;j += blockDim.x)
                  {
                      int x_row = d_indices_arr[i];
+                      if(x_row < 0)
+                          x_row += numRowsX;
                      int y_row = i;
-                      atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]);
+                      if(x_row < numRowsX && x_row >= 0){
+                        atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)],
+                                  Y[(y_row * stridesY0) + (j * stridesY1)]);
+                      } else {
+                        *err = 1;
+                      }
                  }
             }
             return;
@@ -3039,64 +3047,78 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
        int CudaNdarray_vector_add_fast(CudaNdarray* py_self,
            CudaNdarray* py_other, PyArrayObject *indices_arr)
-	{
+        {
-     		const int *shapeX = CudaNdarray_HOST_DIMS(py_self);
+            if(init_err_var()!= 0) return -1;
-     		const int *shapeY = CudaNdarray_HOST_DIMS(py_other);
-     		const int *strX   = CudaNdarray_HOST_STRIDES(py_self);
+            const int *shapeX = CudaNdarray_HOST_DIMS(py_self);
-     		const int *strY   = CudaNdarray_HOST_STRIDES(py_other);
+            const int *shapeY = CudaNdarray_HOST_DIMS(py_other);
-     		unsigned int size = (unsigned int)PyArray_SIZE(indices_arr);
+            const int *strX   = CudaNdarray_HOST_STRIDES(py_self);
-                if(size == 0){
+            const int *strY   = CudaNdarray_HOST_STRIDES(py_other);
-                    return 0;
+            unsigned int size = (unsigned int)PyArray_SIZE(indices_arr);
-                }
+            if(size == 0){
-     		unsigned int numcolsX = shapeX[1];
-     		unsigned int num_threads_per_block = std::min(numcolsX, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
-     		unsigned int num_blocks = std::min(size ,(unsigned int)NUM_VECTOR_OP_BLOCKS);
-     		dim3 n_blocks(num_blocks);
-     		dim3 n_threads(num_threads_per_block);
-     		long *d_indices_arr = NULL;
-     		PyArrayObject *cpu_indices_arr = PyArray_GETCONTIGUOUS(indices_arr);
-     		d_indices_arr = (long*)device_malloc(PyArray_NBYTES(cpu_indices_arr));
-                if(!d_indices_arr)
-                    return -1;
-     		cudaError_t err = cudaMemcpy(d_indices_arr,
-                                             PyArray_DATA(cpu_indices_arr),
-                                             PyArray_NBYTES(cpu_indices_arr),
-                                             cudaMemcpyHostToDevice);
-                if(err != cudaSuccess){
-                    PyErr_Format(
-                        PyExc_RuntimeError,
-                        "GpuAdvancedIncSubtensor1_dev20: cudaMemcpy returned an error: %%s",
-                        cudaGetErrorString(err));
-                    return -1;
-                }
-     		k_vector_add_fast<<<n_blocks, n_threads>>>(shapeX[0],
-                                                           shapeX[1],
-                                                           strX[0],
-                                                           strX[1],
-                                                           CudaNdarray_DEV_DATA(py_self),
-                                                           shapeY[0],
-                                                           shapeY[1],
-                                                           strY[0],
-                                                           strY[1],
-                                                           CudaNdarray_DEV_DATA(py_other),
-                                                           d_indices_arr,
-                                                           PyArray_SIZE(indices_arr)
-                                                          );
-                device_free(d_indices_arr);
-                Py_XDECREF(cpu_indices_arr);
-                err = cudaGetLastError();
-                if(err != cudaSuccess){
-                    PyErr_Format(
-                        PyExc_RuntimeError,
-                        "GpuAdvancedIncSubtensor1_dev20: cuda error: %%s",
-                        cudaGetErrorString(err));
-                    return -1;
-                }
                return 0;
+            }
+            unsigned int numcolsX = shapeX[1];
+            unsigned int num_threads_per_block = std::min(
+                numcolsX, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
+            unsigned int num_blocks = std::min(
+                size, (unsigned int)NUM_VECTOR_OP_BLOCKS);
+            dim3 n_blocks(num_blocks);
+            dim3 n_threads(num_threads_per_block);
+            long *d_indices_arr = NULL;
+            PyArrayObject *cpu_indices_arr = PyArray_GETCONTIGUOUS(
+                indices_arr);
+            d_indices_arr = (long*)device_malloc(
+                PyArray_NBYTES(cpu_indices_arr));
+            if(!d_indices_arr)
+                return -1;
+            cudaError_t err = cudaMemcpy(d_indices_arr,
+                                         PyArray_DATA(cpu_indices_arr),
+                                         PyArray_NBYTES(cpu_indices_arr),
+                                         cudaMemcpyHostToDevice);
+            if(err != cudaSuccess){
+                PyErr_Format(
+                    PyExc_RuntimeError,
+                    "GpuAdvancedIncSubtensor1_dev20:"
+                    " cudaMemcpy returned an error: %%s",
+                    cudaGetErrorString(err));
+                return -1;
+            }
+            k_vector_add_fast<<<n_blocks, n_threads>>>(
+                shapeX[0],
+                shapeX[1],
+                strX[0],
+                strX[1],
+                CudaNdarray_DEV_DATA(py_self),
+                shapeY[0],
+                shapeY[1],
+                strY[0],
+                strY[1],
+                CudaNdarray_DEV_DATA(py_other),
+                d_indices_arr,
+                PyArray_SIZE(indices_arr),
+                err_var
+            );
+            int index_err = check_err_var();
+            device_free(d_indices_arr);
+            Py_XDECREF(cpu_indices_arr);
+            if(index_err != 0) return -1;
+            err = cudaGetLastError();
+            if(err != cudaSuccess){
+                PyErr_Format(
+                    PyExc_RuntimeError,
+                    "GpuAdvancedIncSubtensor1_dev20: cuda error: %%s",
+                    cudaGetErrorString(err));
+                return -1;
+            }
+            return 0;
        }
        """ % locals()

--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
@@ -27,22 +27,8 @@
 //if you want this to work.
 #define PRECHECK_ERROR 0
-//If true, we release the GIL around blocking GPU calls, to allow other Python
-//threads to run in the meantime. For a single-threaded program, the overhead
-//is neglectible (about 20ms for 1 million GIL release/reclaim cycles). Can
-//still be overridden on compilation with -DRELEASE_GIL=0 in nvcc.flags.
-#ifndef RELEASE_GIL
-#define RELEASE_GIL 1
-#endif
-#if RELEASE_GIL
-#define CNDA_BEGIN_ALLOW_THREADS Py_BEGIN_ALLOW_THREADS
-#define CNDA_END_ALLOW_THREADS Py_END_ALLOW_THREADS
-#else
-#define CNDA_BEGIN_ALLOW_THREADS
-#define CNDA_END_ALLOW_THREADS
-#endif
 cublasHandle_t handle = NULL;
+int* err_var = NULL;
 /////////////////////////
 // Alloc and Free
@@ -976,13 +962,6 @@ __global__ void k_take_3(const int d0, const int d1, const int d2,
    }
 }
-// Pointor to 1 int on the device
-// Used in CudaNdarray_TakeFrom to tell that there is an out of bound error
-// When it is allocated, it should always be 0
-// So if there is an error, we must reset it to 0 BEFORE we raise the error
-// This prevent us from setting it to 0 before each use
-static int* err_var = NULL;
 // We try to be similar to the PyArray_TakeFrom function
 //http://docs.scipy.org/doc/numpy/reference/c-api.array.html
 //TODO: support other clip mode then raise(clip, wrap)
@@ -1163,30 +1142,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
    k3 = k_take_3<CPY>;
    // Create the memory place that will store the error information.
-    if (err_var == NULL) {
+    if(init_err_var() != 0) return NULL;
-        err_var = (int*)device_malloc(sizeof(int));
-        if (!err_var) { // PyErr set by device_malloc
-            Py_DECREF(indices);
-            Py_DECREF(out);
-            free(dims);
-            return NULL;
-        }
-        cudaError_t err = cudaMemset((void*)err_var, 0, sizeof(int));
-        if (cudaSuccess != err) {
-            // Clear the error flag, cudaMemset doesn't do it.
-            // Currently this returns the same thing as err, but if in future
-            // it returns something else I still don't see why we should ignore
-            // it.  All we want to do here is reset the flag.
-            cudaGetLastError();
-            PyErr_Format(PyExc_RuntimeError,
-                         "Error setting device error code to 0. %s",
-                         cudaGetErrorString(err));
-            Py_DECREF(indices);
-            Py_DECREF(out);
-            free(dims);
-            return NULL;
-        }
-    }
    dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(out)[0],65535),1,1);
    if(CudaNdarray_HOST_DIMS(out)[0] == 0){
@@ -1298,47 +1254,14 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
        Py_DECREF(out);
        return NULL;
    }
-    //-10 could be any value different then 0.
-    int cpu_err_var=-10;
-    CNDA_BEGIN_ALLOW_THREADS
+    int index_err = check_err_var();
-    // As we execute cudaMemcpy on the default stream, it waits for all
+    Py_DECREF(indices);
-    // kernels (on all streams) to be finished before starting to copy
+    if (index_err != 0) {
-    err = cudaMemcpy(&cpu_err_var, err_var, sizeof(int),
-                     cudaMemcpyDeviceToHost);
-    CNDA_END_ALLOW_THREADS
-    if (cudaSuccess != err) {
-        PyErr_Format(
-            PyExc_RuntimeError,
-            "Cuda error: %s: %s when trying to get the error value.\n",
-            "CudaNdarray_TakeFrom",
-            cudaGetErrorString(err));
-        Py_DECREF(indices);
-        Py_DECREF(out);
-        return NULL;
-    }
-    if (cpu_err_var != 0) {
-        PyErr_Format(
-            PyExc_IndexError,
-            "CudaNdarray_TakeFrom: One of the index value is out of bound. Error code: %i.\n",
-            cpu_err_var);
-        // Must reset it to 0 to don't reset it before each use.
-        err = cudaMemset((void*)err_var, 0, sizeof(int));
-        if (cudaSuccess != err) {
-            PyErr_Format(PyExc_MemoryError, "Error setting device error code to 0 after having an index error. %s", cudaGetErrorString(err));
-            Py_DECREF(indices);
-            Py_DECREF(out);
-            return NULL;
-        }
-        Py_DECREF(indices);
        Py_DECREF(out);
        return NULL;
    }
-    Py_DECREF(indices);
    if (verbose) printf("TAKE SUCCEDED\n");
    return (PyObject *)out;
 }

--- a/theano/sandbox/cuda/cuda_ndarray.cuh
+++ b/theano/sandbox/cuda/cuda_ndarray.cuh
@@ -76,6 +76,21 @@ typedef float real;
 #define CNDA_THREAD_SYNC cudaThreadSynchronize();
 #endif
+//If true, we release the GIL around blocking GPU calls, to allow other Python
+//threads to run in the meantime. For a single-threaded program, the overhead
+//is neglectible (about 20ms for 1 million GIL release/reclaim cycles). Can
+//still be overridden on compilation with -DRELEASE_GIL=0 in nvcc.flags.
+#ifndef RELEASE_GIL
+#define RELEASE_GIL 1
+#endif
+#if RELEASE_GIL
+#define CNDA_BEGIN_ALLOW_THREADS Py_BEGIN_ALLOW_THREADS
+#define CNDA_END_ALLOW_THREADS Py_END_ALLOW_THREADS
+#else
+#define CNDA_BEGIN_ALLOW_THREADS
+#define CNDA_END_ALLOW_THREADS
+#endif
 #ifndef SHARED_SIZE
 #define SHARED_SIZE (16*1024)
@@ -101,6 +116,76 @@ DllExport void * device_malloc(size_t size, int verbose);
 DllExport int device_free(void * ptr);
 DllExport void *get_work_mem(size_t sz);
+// Pointor to 1 int on the device
+// Used in CudaNdarray_TakeFrom and in an op
+// to tell that there is an out of bound error
+// When it is allocated, it should always be 0
+// So if there is an error, we must reset it to 0 BEFORE we raise the error
+// This prevent us from setting it to 0 before each use
+extern DllExport int* err_var;
+static inline int init_err_var(){
+    if (err_var == NULL) {
+        err_var = (int*)device_malloc(sizeof(int));
+        if (!err_var) { // PyErr set by device_malloc
+            return -1;
+        }
+        cudaError_t err = cudaMemset((void*)err_var, 0,
+                                     sizeof(int));
+        if (cudaSuccess != err) {
+            // Clear the error flag, cudaMemset doesn't do it.
+            cudaGetLastError();
+            PyErr_Format(
+                PyExc_RuntimeError,
+                "Error setting device error code to 0. %s",
+                cudaGetErrorString(err));
+            return -1;
+        }
+    }
+    return 0;
+}
+static inline int check_err_var(){
+    //-10 could be any value different then 0.
+    int cpu_err_var=-10;
+    cudaError_t err;
+    CNDA_BEGIN_ALLOW_THREADS
+    // As we execute cudaMemcpy on the default stream, it waits
+    // for all kernels (on all streams) to be finished before
+    // starting to copy
+    err = cudaMemcpy(&cpu_err_var, err_var, sizeof(int),
+                     cudaMemcpyDeviceToHost);
+    CNDA_END_ALLOW_THREADS
+    if (cudaSuccess != err) {
+        PyErr_Format(
+            PyExc_RuntimeError,
+            "Cuda error: %s when trying to get the error"
+            " value.\\n",
+            cudaGetErrorString(err));
+        return -1;
+    }
+    if (cpu_err_var != 0) {
+        PyErr_Format(
+            PyExc_IndexError,
+            "One of the index value is out of bound. Error code: %i.\\n",
+            cpu_err_var);
+        // Must reset it to 0 to don't reset it before each use.
+        err = cudaMemset((void*)err_var, 0, sizeof(int));
+        if (cudaSuccess != err) {
+            PyErr_Format(PyExc_MemoryError,
+                "Error setting device error code to 0 after having"
+                " an index error. %s", cudaGetErrorString(err));
+            return -1;
+        }
+        return -1;
+    }
+    return 0;
+}
 template <typename T>
 static T ceil_intdiv(T a, T b)
 {

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -1016,8 +1016,9 @@ def local_gpu_advanced_incsubtensor1(node):
            return [gpu_op(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(y), *coords)]
    # Should not execute for GpuAdvancedIncSubtensor1
-    if node.op.__class__ is tensor.AdvancedIncSubtensor1 and \
+    if (node.op.__class__ is tensor.AdvancedIncSubtensor1 and
-       node.inputs[0].dtype == "float32":
+            node.inputs[0].dtype == "float32" and
+            node.inputs[1].dtype == "float32"):
        x, y = node.inputs[0:2]
        coords = node.inputs[2:]
        go_gpu = False

--- a/theano/sandbox/gpuarray/subtensor.py
+++ b/theano/sandbox/gpuarray/subtensor.py
@@ -489,7 +489,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
        return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
    def c_code_cache_version(self):
-        return (3,)
+        return (4,)
    def c_headers(self):
        return ['cuda.h', '<gpuarray/extension.h>', '<numpy_compat.h>',
@@ -587,6 +587,8 @@ __device__ npy_float16 atomicAdd(npy_float16 *addr, npy_float16 val) {
                  for(int j = (threadIdx.x); j < numColsX;j += blockDim.x)
                  {
                      int x_row = indices_arr[i * stridesIndices];
+                      if(x_row < 0)
+                          x_row += numRowsX;
                      int y_row = i;
                      atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]);
                  }

--- a/theano/tensor/tests/test_subtensor.py
+++ b/theano/tensor/tests/test_subtensor.py
@@ -437,7 +437,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
    def test_ok_list(self):
        for data, idx in [(rand(4), [1, 0]),
-                          (rand(4, 5), [2, 3]),
+                          (rand(4, 5), [2, 3, -1]),
                          (rand(4, 2, 3), [0, 3]),
                          (rand(4, 2, 3), [3, 3, 1, 1, 2, 2, 0, 0]),
                          (rand(4, 2, 3), [3, 3, 1, 1, 2, 2, 0, 0,
@@ -479,6 +479,15 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
                out2 = test_out[0][0]
                assert out1 is out2
+            # test the grad
+            gn = theano.grad(t.sum(), n)
+            g = self.function([], gn, op=self.adv_incsub1)
+            utt.verify_grad(lambda m: m[[1, 3]],
+                            [numpy.random.rand(5, 5).astype(self.dtype)])
+            g_0 = g()
+            utt.verify_grad(lambda m: m[idx],
+                            [data])
    def test_err_invalid_list(self):
        n = self.shared(numpy.asarray(5, dtype=self.dtype))
        self.assertRaises(TypeError, n.__getitem__, [0, 0])
@@ -495,13 +504,15 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
        self.assertTrue(isinstance(t.owner.op, tensor.AdvancedSubtensor1))
        f = self.function([l], t, op=self.adv_sub1)
-        topo = f.maker.fgraph.toposort()
-        topo_ = [node for node in topo if not isinstance(node.op,
+        # the grad
-             self.ignore_topo)]
+        g = self.function([l],
-        assert len(topo_) == 1
+                          inc_subtensor(t, numpy.asarray([[1.]], self.dtype)),
-        self.assertTrue(isinstance(topo_[0].op, self.adv_sub1))
+                          op=self.adv_incsub1)
        for shp in [[0, 4], [0, -3], [-10]]:
            self.assertRaises(IndexError, f, shp)
+            self.assertRaises(IndexError, g, shp)
    def test_adv_sub1_broadcast(self):
        ones = numpy.ones((1, 3), dtype=self.dtype)