added copy on negative strides

904a523d · Kelvin Xu · Kelvin Xu · 013bc89d · 904a523d · 904a523d
--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -3407,6 +3407,76 @@ class GpuAlloc(GpuOp):
 gpu_alloc = GpuAlloc()


+class CopyOnNegativeStrides(GpuOp):
+    """
+    Return the input as is, or a C-contiguous copy if a stride is negative.
+    """
+    view_map = {0: [0]}  # the output may be the input object itself
+    check_input = False
+    __props__ = ()
+
+    def grad(self, inputs, dout):
+        """Values are never modified, so the gradient passes through."""
+        dout, = dout
+        return [as_cuda_ndarray_variable(dout)]
+
+    def make_node(self, input):
+        """Build the Apply node; the output has the same type as the input."""
+        input = as_cuda_ndarray_variable(input)
+        return Apply(self, [input], [input.type()])
+
+    def perform(self, node, inp, out):
+        """Python implementation: copy only when some stride is negative."""
+        value = inp[0]
+        if any(s < 0 for s in value.strides):
+            value = value.copy()
+        out[0][0] = value
+
+    def c_code(self, node, name, inp, out, sub):
+        """Return C code that aliases the input or copies it into the output."""
+        (in_name,), (out_name,) = inp, out
+        names = {'input': in_name, 'z': out_name, 'fail': sub['fail']}
+        # An existing output is reused only when every dimension matches.
+        dim_check = "\n|| (CudaNdarray_HOST_DIMS(%(input)s)[%(i)s] != CudaNdarray_HOST_DIMS(%(z)s)[%(i)s])"
+        ndim = node.inputs[0].type.ndim
+        dims = "".join(dim_check % dict(names, i=i) for i in xrange(ndim))
+        head = """
+        {
+            bool strides_all_positive = true;
+            for (int i = 0; i < CudaNdarray_NDIM(%(input)s); i++){
+                if (CudaNdarray_HOST_STRIDES(%(input)s)[i] < 0){
+                    strides_all_positive = false;
+                    break;
+                }
+            }
+            if (strides_all_positive){
+                Py_XDECREF(%(z)s);
+                %(z)s = %(input)s;
+                Py_INCREF(%(z)s);
+
+            } else if ((NULL == %(z)s)""" % names
+        tail = """
+                || !CudaNdarray_is_c_contiguous(%(z)s))
+            {
+                Py_XDECREF(%(z)s);
+                %(z)s = (CudaNdarray*)CudaNdarray_Copy(%(input)s);
+                if (!%(z)s)
+                {
+                    %(fail)s;
+                }
+            }else if(CudaNdarray_CopyFromCudaNdarray(%(z)s,%(input)s)){
+                %(fail)s;
+            }
+        }
+        """ % names
+        return head + dims + tail
+
+    def c_code_cache_version(self):
+        return (0,)
+
+cp_on_negative_strides = CopyOnNegativeStrides()
+
+
 class GpuContiguous(GpuOp):
    """
    Always return a c contiguous output. Copy the input only if it is

--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
@@ -14,7 +14,8 @@ from theano.tensor.basic import ShapeError
 from theano.sandbox.cuda.type import CudaNdarrayType
 from theano.sandbox.cuda import GpuOp
 from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
-                                           gpu_contiguous, HostFromGpu)
+                                           gpu_contiguous, HostFromGpu,
+                                           cp_on_negative_strides)
 from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax,
                                      GpuDownsampleFactorMaxGrad)
 from theano.sandbox.cuda.nnet import GpuSoftmax
@@ -630,7 +631,9 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
        return GpuDnnConvGradI()(kerns, img, desc, shape2, shape3)

    # Standard case: We use GpuDnnConv with suitable padding.
-    img = gpu_contiguous(img)
+    # cp_on_negative_strides passes img through unchanged, or returns a
+    # C-contiguous copy of it when img has negative strides
+    img = cp_on_negative_strides(img)
    kerns = gpu_contiguous(kerns)
    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
                          conv_mode=conv_mode)(img.shape, kerns.shape)