提交 137ce6a3 authored 作者: Pascal Lamblin's avatar Pascal Lamblin

Use provided output memory if not contiguous

It may be slower than allocating new memory and calling memset, but if that non-contiguous memory was provided, it is probable that the output would otherwise end up being copied into it anyway, which would be even slower.
上级 10aa713a
......@@ -2721,7 +2721,7 @@ class GpuAlloc(GpuOp):
str += "if(%(out)s==NULL\n" % locals()
for idx, sh in enumerate(shps):
str += "||CudaNdarray_HOST_DIMS(%(out)s)[%(idx)s]!=dims[%(idx)s]" % locals()
str += """||(%(memset_0)s && !CudaNdarray_is_c_contiguous(%(out)s))){
str += """){
Py_XDECREF(%(out)s);
%(out)s = (CudaNdarray*)CudaNdarray_New();
if (!%(out)s)
......@@ -2737,7 +2737,7 @@ class GpuAlloc(GpuOp):
%(fail)s;
}
}
if (%(memset_0)s)
if (%(memset_0)s && CudaNdarray_is_c_contiguous(%(out)s))
{
if (cudaSuccess != cudaMemset(%(out)s->devdata, 0,
CudaNdarray_SIZE(%(out)s) * 4))
......@@ -2769,7 +2769,7 @@ class GpuAlloc(GpuOp):
return [None for i in inputs]
def c_code_cache_version(self):
    """Return the version tag used to cache the generated C code.

    Bumped from ``(6,)`` to ``(7,)`` because the generated ``c_code``
    changed: it now reuses a provided non-contiguous output buffer
    (only calling ``cudaMemset`` when the output is C-contiguous)
    instead of always reallocating, so stale compiled code must be
    invalidated.

    :returns: a tuple of ints identifying the C code version.
    """
    # NOTE(review): the diff extraction left a stale, unreachable
    # `return (6,)` before this line; only the bumped version is kept.
    return (7,)
def do_constant_folding(self, node):
for client in node.outputs[0].clients:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论