Make GpuAdvancedIncSubtensor1_dev20 return user index error

574e7965 · Frederic · ebbaae5b · 574e7965 · 574e7965 · 574e7965
--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -2974,7 +2974,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
        return Apply(self, [x_, y_, ilist_], [x_.type()])
    def c_code_cache_version(self):
-        return (4,)
+        return (5,)
    def c_code(self, node, name, inputs, outputs, sub):
        active_device_no = theano.sandbox.cuda.active_device_number()
@@ -3023,7 +3023,8 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
                                          int stridesY1,
                                          float *Y ,
                                          long *d_indices_arr,
-                                          int num)
+                                          int num,
+                                          int* err)
        {
             for (int i = (blockIdx.x); i < num; i += gridDim.x)
             {
@@ -3033,7 +3034,12 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
                      if(x_row < 0)
                          x_row += numRowsX;
                      int y_row = i;
-                      atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]);
+                      if(x_row < numRowsX && x_row >= 0){
+                        atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)],
+                                  Y[(y_row * stridesY0) + (j * stridesY1)]);
+                      } else {
+                        *err = 1;
+                      }
                  }
             }
             return;
@@ -3042,6 +3048,23 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
        int CudaNdarray_vector_add_fast(CudaNdarray* py_self,
            CudaNdarray* py_other, PyArrayObject *indices_arr)
        {
+                if (err_var == NULL) {
+                    err_var = (int*)device_malloc(sizeof(int));
+                    if (!err_var) { // PyErr set by device_malloc
+                        return -1;
+                    }
+                    cudaError_t err = cudaMemset((void*)err_var, 0,
+                                                 sizeof(int));
+                    if (cudaSuccess != err) {
+                        // Clear the error flag, cudaMemset doesn't do it.
+                        cudaGetLastError();
+                        PyErr_Format(
+                            PyExc_RuntimeError,
+                            "Error setting device error code to 0. %%s",
+                            cudaGetErrorString(err));
+                        return -1;
+                    }
+                }
     		const int *shapeX = CudaNdarray_HOST_DIMS(py_self);
     		const int *shapeY = CudaNdarray_HOST_DIMS(py_other);
     		const int *strX   = CudaNdarray_HOST_STRIDES(py_self);
@@ -3086,8 +3109,47 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
                                                           strY[1],
                                                           CudaNdarray_DEV_DATA(py_other),
                                                           d_indices_arr,
-                                                           PyArray_SIZE(indices_arr)
+                                                           PyArray_SIZE(indices_arr),
+                                                           err_var
                                                          );
+                //-10 could be any value different then 0.
+                int cpu_err_var=-10;
+                CNDA_BEGIN_ALLOW_THREADS
+                // As we execute cudaMemcpy on the default stream, it waits
+                // for all kernels (on all streams) to be finished before
+                // starting to copy
+                err = cudaMemcpy(&cpu_err_var, err_var, sizeof(int),
+                                 cudaMemcpyDeviceToHost);
+                CNDA_END_ALLOW_THREADS
+                if (cudaSuccess != err) {
+                    PyErr_Format(
+                        PyExc_RuntimeError,
+                        "Cuda error: %%s: %%s when trying to get the error"
+                        " value.\\n",
+                        "GpuAdvancedIncSubtensor1_dev20",
+                        cudaGetErrorString(err));
+                    return NULL;
+                }
+                if (cpu_err_var != 0) {
+                    PyErr_Format(
+                        PyExc_IndexError,
+                        "GpuAdvancedIncSubtensor1_dev20: One of the index"
+                        " value is out of bound. Error code: %%i.\\n",
+                        cpu_err_var);
+                    // Must reset it to 0 to don't reset it before each use.
+                    err = cudaMemset((void*)err_var, 0, sizeof(int));
+                    if (cudaSuccess != err) {
+                        PyErr_Format(PyExc_MemoryError,
+                            "Error setting device error code to 0 after having"
+                            " an index error. %%s", cudaGetErrorString(err));
+                        return -1;
+                    }
+                    return -1;
+                }
                device_free(d_indices_arr);
                Py_XDECREF(cpu_indices_arr);
                err = cudaGetLastError();

--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
@@ -27,21 +27,6 @@
 //if you want this to work.
 #define PRECHECK_ERROR 0
-//If true, we release the GIL around blocking GPU calls, to allow other Python
-//threads to run in the meantime. For a single-threaded program, the overhead
-//is neglectible (about 20ms for 1 million GIL release/reclaim cycles). Can
-//still be overridden on compilation with -DRELEASE_GIL=0 in nvcc.flags.
-#ifndef RELEASE_GIL
-#define RELEASE_GIL 1
-#endif
-#if RELEASE_GIL
-#define CNDA_BEGIN_ALLOW_THREADS Py_BEGIN_ALLOW_THREADS
-#define CNDA_END_ALLOW_THREADS Py_END_ALLOW_THREADS
-#else
-#define CNDA_BEGIN_ALLOW_THREADS
-#define CNDA_END_ALLOW_THREADS
-#endif
 cublasHandle_t handle = NULL;
 /////////////////////////
@@ -976,13 +961,6 @@ __global__ void k_take_3(const int d0, const int d1, const int d2,
    }
 }
-// Pointor to 1 int on the device
-// Used in CudaNdarray_TakeFrom to tell that there is an out of bound error
-// When it is allocated, it should always be 0
-// So if there is an error, we must reset it to 0 BEFORE we raise the error
-// This prevent us from setting it to 0 before each use
-static int* err_var = NULL;
 // We try to be similar to the PyArray_TakeFrom function
 //http://docs.scipy.org/doc/numpy/reference/c-api.array.html
 //TODO: support other clip mode then raise(clip, wrap)

--- a/theano/sandbox/cuda/cuda_ndarray.cuh
+++ b/theano/sandbox/cuda/cuda_ndarray.cuh
@@ -76,6 +76,21 @@ typedef float real;
 #define CNDA_THREAD_SYNC cudaThreadSynchronize();
 #endif
+//If true, we release the GIL around blocking GPU calls, to allow other Python
+//threads to run in the meantime. For a single-threaded program, the overhead
+//is neglectible (about 20ms for 1 million GIL release/reclaim cycles). Can
+//still be overridden on compilation with -DRELEASE_GIL=0 in nvcc.flags.
+#ifndef RELEASE_GIL
+#define RELEASE_GIL 1
+#endif
+#if RELEASE_GIL
+#define CNDA_BEGIN_ALLOW_THREADS Py_BEGIN_ALLOW_THREADS
+#define CNDA_END_ALLOW_THREADS Py_END_ALLOW_THREADS
+#else
+#define CNDA_BEGIN_ALLOW_THREADS
+#define CNDA_END_ALLOW_THREADS
+#endif
 #ifndef SHARED_SIZE
 #define SHARED_SIZE (16*1024)
@@ -101,6 +116,14 @@ DllExport void * device_malloc(size_t size, int verbose);
 DllExport int device_free(void * ptr);
 DllExport void *get_work_mem(size_t sz);
+// Pointor to 1 int on the device
+// Used in CudaNdarray_TakeFrom and in an op
+// to tell that there is an out of bound error
+// When it is allocated, it should always be 0
+// So if there is an error, we must reset it to 0 BEFORE we raise the error
+// This prevent us from setting it to 0 before each use
+static int* err_var = NULL;
 template <typename T>
 static T ceil_intdiv(T a, T b)
 {

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -1016,8 +1016,9 @@ def local_gpu_advanced_incsubtensor1(node):
            return [gpu_op(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(y), *coords)]
    # Should not execute for GpuAdvancedIncSubtensor1
-    if node.op.__class__ is tensor.AdvancedIncSubtensor1 and \
+    if (node.op.__class__ is tensor.AdvancedIncSubtensor1 and
-       node.inputs[0].dtype == "float32":
+        node.inputs[0].dtype == "float32" and
+        node.inputs[1].dtype == "float32"):
        x, y = node.inputs[0:2]
        coords = node.inputs[2:]
        go_gpu = False

--- a/theano/tensor/tests/test_subtensor.py
+++ b/theano/tensor/tests/test_subtensor.py
@@ -482,7 +482,6 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
            # test the grad
            gn = theano.grad(t.sum(), n)
            g = self.function([], gn, op=self.adv_incsub1)
-            theano.printing.debugprint(g)
            utt.verify_grad(lambda m: m[[1, 3]],
                            [numpy.random.rand(5, 5).astype(self.dtype)])
            g_0 = g()
@@ -505,13 +504,15 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
        self.assertTrue(isinstance(t.owner.op, tensor.AdvancedSubtensor1))
        f = self.function([l], t, op=self.adv_sub1)
-        topo = f.maker.fgraph.toposort()
-        topo_ = [node for node in topo if not isinstance(node.op,
+        # the grad
-             self.ignore_topo)]
+        g = self.function([l],
-        assert len(topo_) == 1
+                          inc_subtensor(t, numpy.asarray([[1.]], self.dtype)),
-        self.assertTrue(isinstance(topo_[0].op, self.adv_sub1))
+                          op=self.adv_incsub1)
        for shp in [[0, 4], [0, -3], [-10]]:
            self.assertRaises(IndexError, f, shp)
+            self.assertRaises(IndexError, g, shp)
    def test_adv_sub1_broadcast(self):
        ones = numpy.ones((1, 3), dtype=self.dtype)