Merge pull request #2075 from nouiz/gpu_inc_sub_crash

[CRASH] fix crash on the GPU with {inc,set}_subtensor and broadcasting t...

Merge pull request #2075 from nouiz/gpu_inc_sub_crash
a1eef12d · Pascal Lamblin · c85d1953 · 607bf90c · a1eef12d · a1eef12d
--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -2888,7 +2888,9 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
            returns a C code expression to copy source into view, and
            return 0 on success
        """
-        return """CudaNdarray_CopyFromCudaNdarray(%(view)s, %(source)s)""" % locals()
+        # On the CPU, unbroadcasting is based on the run-time shapes. We
+        # need the same behavior on the GPU.
+        return """CudaNdarray_CopyFromCudaNdarray(%(view)s, %(source)s, 1)""" % locals()
    def add_to_zview(self, name, x, fail):
@@ -2910,7 +2912,7 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
    def c_code_cache_version(self):
        parent_version = super(GpuIncSubtensor, self).c_code_cache_version()
        if parent_version:
-            return parent_version + (0,)
+            return parent_version + (1,)
        return ()

--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
@@ -1002,7 +1002,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
            return NULL;
        indices = (CudaNdarray*) CudaNdarray_New();
-        if (verbose) printf("ndarray after new\n");
+        if (verbose) printf("\nndarray after new\n");
        if (! indices){
            Py_DECREF(indices_float32);
            return NULL;
@@ -1140,6 +1140,13 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
    }
    dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(out)[0],65535),1,1);
+    if(CudaNdarray_HOST_DIMS(out)[0] == 0){
+        // We take 0 elements, so there is no need for the rest of the code.
+        // This speeds up that case AND fixes a crash that would otherwise occur.
+        free(dims);
+        Py_DECREF(indices);
+        return (PyObject *)out;
+    }
    switch (self->nd) {
        case 1:
@@ -1149,7 +1156,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
                    printf("cudaGetLastError=%d, nd=%d"
                           " kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
                           " n_threads.x=%i, n_threads.y=%i)\n",
-                           self->nd, cudaGetLastError(),
+                           cudaGetLastError(), self->nd,
                           n_blocks.x, n_blocks.y, n_threads.x, n_threads.y);
                k3<<<n_blocks, n_threads>>>(
                        dims[0],
@@ -1205,7 +1212,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
                    printf("cudaGetLastError=%d, nd=%d"
                           " kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
                           " n_threads.x=%i, n_threads.y=%i)\n",
-                           self->nd, cudaGetLastError(),
+                           cudaGetLastError(), self->nd,
                           n_blocks.x, n_blocks.y, n_threads.x, n_threads.y);
                k3<<<n_blocks, n_threads>>>(
                        dims[0], //dimensions

--- a/theano/sandbox/cuda/tests/test_basic_ops.py
+++ b/theano/sandbox/cuda/tests/test_basic_ops.py
@@ -967,6 +967,8 @@ class T_subtensor(theano.tensor.tests.test_subtensor.T_subtensor):
        # version when we should. Users should not use it.
        for shape, idx, fast in [((70000,), range(70000), True),
                                 ((70000, 5), range(70000), True),
+                                 ((70000, 5),  numpy.zeros((0,), 'int64'),
+                                  True),
                                 ((70000, 2, 3), range(70000), True),
                                 ((1025, 1025), [5, 10], True),
                                 ((3, 1025, 1026), [1, 2], True),

--- a/theano/tensor/tests/test_subtensor.py
+++ b/theano/tensor/tests/test_subtensor.py
@@ -863,7 +863,25 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
            inc_slice(2, 1),
            (numpy.asarray([[0, 1], [2, 3], [4, 5.]]), numpy.asarray(9.),))
-    def test_advanced_inc_and_set(self):
+    def test_inc_and_set_subtensor(self):
+        """
+        Test increment and set with broadcast
+        """
+        X = tensor.matrix(dtype=self.dtype)
+        y = set_subtensor(X[1::, 1::],  0)
+        f = self.function([X], [y],
+                          op=self.inc_sub,
+                          N=1)
+        x_ = numpy.ones((9, 9))
+        out = f(x_.astype('float32'))
+        res = x_.copy()
+        res[1::, 1::] = 0
+        assert numpy.allclose(out, res)
+    def test_advanced1_inc_and_set(self):
        """
        Test advanced increment and set.
        """