提交 14713c9d authored 作者: Marc-Alexandre Cote's avatar Marc-Alexandre Cote

Add tests for different strides.

Fix one problem related to strides by copying the strides infos from the input to the output.
上级 3454ea2d
...@@ -75,7 +75,7 @@ class GpuCumsum(CumsumOp, GpuOp): ...@@ -75,7 +75,7 @@ class GpuCumsum(CumsumOp, GpuOp):
compute_map, no_recycling) compute_map, no_recycling)
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) return (2,)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
return """ return """
...@@ -289,12 +289,21 @@ class GpuCumsum(CumsumOp, GpuOp): ...@@ -289,12 +289,21 @@ class GpuCumsum(CumsumOp, GpuOp):
if (CudaNdarray_HOST_DIMS(%(x)s)[i] == CudaNdarray_HOST_DIMS(%(z)s)[i]) { if (CudaNdarray_HOST_DIMS(%(x)s)[i] == CudaNdarray_HOST_DIMS(%(z)s)[i]) {
needAllocation = true; needAllocation = true;
} }
if (CudaNdarray_HOST_STRIDES(%(x)s)[i] == CudaNdarray_HOST_STRIDES(%(z)s)[i]) {
needAllocation = true;
}
} }
} }
if (needAllocation){ if (needAllocation){
Py_XDECREF(%(z)s); Py_XDECREF(%(z)s);
%(z)s = (CudaNdarray*) CudaNdarray_NewDims(CudaNdarray_NDIM(%(x)s), shape); %(z)s = (CudaNdarray*) CudaNdarray_NewDims(CudaNdarray_NDIM(%(x)s), shape);
// Copy strides information
for (int i= 0; i < CudaNdarray_NDIM(%(x)s); ++i) {
CudaNdarray_set_stride(%(z)s, i, CudaNdarray_HOST_STRIDES(%(x)s)[i]);
}
} }
if (!%(z)s) { if (!%(z)s) {
......
...@@ -44,6 +44,19 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp): ...@@ -44,6 +44,19 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
self.max_threads_dim0 = prop['maxThreadsDim0'] self.max_threads_dim0 = prop['maxThreadsDim0']
self.max_grid_size1 = prop['maxGridSize1'] self.max_grid_size1 = prop['maxGridSize1']
def test_Strides1D(self):
    """Check that GPU cumsum handles non-contiguous 1D inputs.

    Covers two stride patterns: a stepped view (stride 2) and a
    reversed view (negative stride), comparing against NumPy's cumsum.
    """
    x = T.vector('x')

    # Same check for each non-contiguous slicing of the input vector.
    for slicing in (slice(None, None, 2),    # stepped strides
                    slice(None, None, -1)):  # negative strides
        fn = theano.function([x], cumsum(x[slicing]))
        data = np.random.randint(10, size=(42,)).astype(config.floatX)
        assert np.allclose(np.cumsum(data[slicing]), fn(data))
def test_GpuCumsum1D(self): def test_GpuCumsum1D(self):
block_max_size = self.max_threads_dim0 * 2 block_max_size = self.max_threads_dim0 * 2
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论