Merge pull request #583 from lamblin/preallocated_out_ops

Fix ops to accept any kind of preallocated output memory

Merge pull request #583 from lamblin/preallocated_out_ops
6465523c · nouiz · 1ede3c8b · b17aefbc · 6465523c · 6465523c
--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -30,7 +30,7 @@ class GpuDot22(GpuOp):
        return Apply(self, [x, y], [otype()])

    def c_code_cache_version(self):
-        return (1, 1)
+        return (1, 2)

    def c_code(self, node, nodename, inputs, outputs, sub):
        x, y = inputs
@@ -51,9 +51,14 @@ class GpuDot22(GpuOp):
            || (CudaNdarray_HOST_DIMS(%(z)s)[0] !=
                  CudaNdarray_HOST_DIMS(%(x)s)[0])
            || (CudaNdarray_HOST_DIMS(%(z)s)[1] !=
-                  CudaNdarray_HOST_DIMS(%(y)s)[1]))
+                  CudaNdarray_HOST_DIMS(%(y)s)[1])
+            || (CudaNdarray_HOST_STRIDES(%(z)s)[0] < 0)
+            || (CudaNdarray_HOST_STRIDES(%(z)s)[1] < 0)
+            || ((CudaNdarray_HOST_DIMS(%(z)s)[0] > 1)
+                && (CudaNdarray_HOST_STRIDES(%(z)s)[0] != 1)
+                && (CudaNdarray_HOST_DIMS(%(z)s)[1] > 1)
+                && (CudaNdarray_HOST_STRIDES(%(z)s)[1] != 1)))
        {
-            //if (%(z)s) Py_DECREF(%(z)s);
            Py_XDECREF(%(z)s);
            npy_intp dims[2];
            dims[0] = CudaNdarray_HOST_DIMS(%(x)s)[0];
@@ -108,7 +113,7 @@ class GpuDot22Scalar(GpuOp):
        return Apply(self, [x, y, a], [otype()])

    def c_code_cache_version(self):
-        return (1, 1)
+        return (1, 2)

    def c_code(self, node, name, inputs, outputs, sub):
        x, y, a = inputs
@@ -135,7 +140,13 @@ class GpuDot22Scalar(GpuOp):
            (CudaNdarray_HOST_DIMS(%(z)s)[0] !=
              CudaNdarray_HOST_DIMS(%(x)s)[0]) ||
            (CudaNdarray_HOST_DIMS(%(z)s)[1] !=
-              CudaNdarray_HOST_DIMS(%(y)s)[1]))
+              CudaNdarray_HOST_DIMS(%(y)s)[1])
+            || (CudaNdarray_HOST_STRIDES(%(z)s)[0] < 0)
+            || (CudaNdarray_HOST_STRIDES(%(z)s)[1] < 0)
+            || ((CudaNdarray_HOST_DIMS(%(z)s)[0] > 1)
+                && (CudaNdarray_HOST_STRIDES(%(z)s)[0] != 1)
+                && (CudaNdarray_HOST_DIMS(%(z)s)[1] > 1)
+                && (CudaNdarray_HOST_STRIDES(%(z)s)[1] != 1)))
        {
            //if (%(z)s) Py_DECREF(%(z)s);
            Py_XDECREF(%(z)s);
@@ -790,7 +801,7 @@ class GpuDownsampleFactorMax(GpuOp):
    #def perform(self, node, input_storage, output_storage):
        #raise NotImplementedError('only C is implemented')
    def c_code_cache_version(self):
-        return (3)
+        return (4)

    def c_code(self, node, nodename, inp, out, sub):
        x, = inp
@@ -856,7 +867,11 @@ class GpuDownsampleFactorMax(GpuOp):
                CudaNdarray_HOST_STRIDES(%(x)s)[1],
                CudaNdarray_HOST_STRIDES(%(x)s)[2],
                CudaNdarray_HOST_STRIDES(%(x)s)[3],
-                CudaNdarray_DEV_DATA(%(z)s));
+                CudaNdarray_DEV_DATA(%(z)s),
+                CudaNdarray_HOST_STRIDES(%(z)s)[0],
+                CudaNdarray_HOST_STRIDES(%(z)s)[1],
+                CudaNdarray_HOST_STRIDES(%(z)s)[2],
+                CudaNdarray_HOST_STRIDES(%(z)s)[3]);
            CNDA_THREAD_SYNC;
            cudaError_t err = cudaGetLastError();
            if( cudaSuccess != err)
@@ -883,7 +898,7 @@ class GpuDownsampleFactorMax(GpuOp):
        __global__ void kMaxPool_%(nodename)s(
           int D0, int D1, int D2, int D3, int xD2, int xD3,
           const float * x, int xS0, int xS1, int xS2, int xS3,
-           float *z)
+           float *z, int zS0, int zS1, int zS2, int zS3)
        {
            float cur_max, cur_x;
            int i0 = blockIdx.x %% D0;
@@ -932,7 +947,7 @@ class GpuDownsampleFactorMax(GpuOp):
            }

            //store the result to global memory
-            z[i0 * D1*D2*D3 + i1*D2*D3 + i2*D3 + threadIdx.x] = cur_max;
+            z[i0*zS0 + i1*zS1 + i2*zS2 + threadIdx.x*zS3] = cur_max;
        }
        """ % locals()


--- a/theano/sandbox/cuda/elemwise.py
+++ b/theano/sandbox/cuda/elemwise.py
@@ -36,8 +36,8 @@ class SupportCodeError(Exception):

 class NaiveAlgo(object):
    verbose = 0 # 1, 2 or 3 for more verbose output.
-    cache_version = ()
-    cache_version = (14, verbose)
+    #cache_version = ()
+    cache_version = (15, verbose)

    def __init__(self, scalar_op, sync=True, inplace_pattern={}):
        """
@@ -541,7 +541,7 @@ class NaiveAlgo(object):
        if nb_inputs > 0 and nd > 0:
            print >> sio, """
            int local_str[%(nb_inputs)s][%(nd)s];
-            int local_ostr[%(nb_inputs)s][%(nd)s];
+            int local_ostr[%(nb_outputs)s][%(nd)s];
            """ % locals()
        else:
            print >> sio, """
@@ -928,6 +928,11 @@ nd_collapse_[i]=0;
                %(oname)s = NULL;
            }
        }
+        if (%(oname)s && !CudaNdarray_is_c_contiguous(%(oname)s))
+        {
+            Py_XDECREF(%(oname)s);
+            %(oname)s = NULL;
+        }
        if (NULL == %(oname)s)
        {
            %(oname)s = (CudaNdarray*)CudaNdarray_New();

--- a/theano/sandbox/cuda/nnet.py
+++ b/theano/sandbox/cuda/nnet.py
@@ -199,8 +199,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
    def make_node(self, dy, sm, y_idx):
        return Apply(self, [dy, sm, y_idx],[sm.type()])
    def c_code_cache_version(self):
-        return (4,)
        #return ()
+        return (5,)
    def c_code(self, node, nodename, inp, out, sub):
        dnll, sm, y_idx = inp
        dx, = out
@@ -257,7 +257,9 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
                        CudaNdarray_DEV_DATA(%(y_idx)s),
                        CudaNdarray_HOST_STRIDES(%(y_idx)s)[0],

-                        CudaNdarray_DEV_DATA(%(dx)s)       //guaranteed c-contiguous
+                        CudaNdarray_DEV_DATA(%(dx)s),
+                        CudaNdarray_HOST_STRIDES(%(dx)s)[0],
+                        CudaNdarray_HOST_STRIDES(%(dx)s)[1]
                );
            CNDA_THREAD_SYNC;
            cudaError_t err = cudaGetLastError();
@@ -277,7 +279,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
           const float * dnll, const int dnll_s0,
           const float * sm, const int sm_s0, const int sm_s1,
           const float * y_idx, const int y_idx_s0,
-           float * dx)
+           float * dx, const int dx_s0, const int dx_s1)
        {
            for (int i = blockIdx.x; i < N; i += gridDim.x)
            {
@@ -288,14 +290,14 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
                {
                    if (y_i == j)
                    {
-                        dx[i * K + j] = dnll_i * (sm[i * sm_s0 + j * sm_s1]-1.0);
+                        dx[i * dx_s0 + j * dx_s1] = dnll_i * (sm[i * sm_s0 + j * sm_s1]-1.0);
                    }
                    else
                    {
-                        dx[i * K + j] = dnll_i * sm[i * sm_s0 + j * sm_s1];
+                        dx[i * dx_s0 + j * dx_s1] = dnll_i * sm[i * sm_s0 + j * sm_s1];
                    }
-                    //dx[i * K + j] = dnll_i * sm[i * sm_s0 + j * sm_s1];
-                    //dx[i*K+j] = 0;
+                    //dx[i * dx_s0 + j * dx_s1] = dnll_i * sm[i * sm_s0 + j * sm_s1];
+                    //dx[i*dx_s0+j*dx_s1] = 0;
                }
            }
        }
@@ -319,7 +321,7 @@ class GpuSoftmax (GpuOp):
        return shape
    def c_code_cache_version(self):
        #return ()
-        return (4,) + inline_softmax.code_version
+        return (5,) + inline_softmax.code_version
    def c_code(self, node, nodename, inp, out, sub):
        x, = inp
        z, = out
@@ -364,7 +366,9 @@ class GpuSoftmax (GpuOp):
                        CudaNdarray_HOST_STRIDES(%(x)s)[0],
                        CudaNdarray_HOST_STRIDES(%(x)s)[1],

-                        CudaNdarray_DEV_DATA(%(z)s)  //guarantee c contig
+                        CudaNdarray_DEV_DATA(%(z)s),
+                        CudaNdarray_HOST_STRIDES(%(z)s)[0],
+                        CudaNdarray_HOST_STRIDES(%(z)s)[1]
                );
            CNDA_THREAD_SYNC;
            cudaError_t err = cudaGetLastError();
@@ -381,7 +385,7 @@ class GpuSoftmax (GpuOp):
        return nvcc_kernel("kSoftmax_%s"%nodename,
                params=['int M', 'int N',
                    'const float * x', 'const int sx0', 'const int sx1',
-                    'float * sm'],
+                    'float * sm', 'const int sm_s0', 'const int sm_s1'],
                body=[
                    "extern __shared__ float buf[]",
                    "float * buf2 = buf + N",
@@ -393,7 +397,7 @@ class GpuSoftmax (GpuOp):
                      "__syncthreads()",
                      inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'),
                      "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
-                        "sm[blockIDX * N + tx] = buf[tx]",# This set all value correctly
+                        "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",# This set all value correctly
                      "}",
                      "__syncthreads()",
                    "}",
@@ -419,7 +423,7 @@ class GpuSoftmaxWithBias (GpuOp):
        return  [shape[0]]
    def c_code_cache_version(self):
        #return ()
-        return (5,) + inline_softmax.code_version
+        return (6,) + inline_softmax.code_version

    def c_code(self, node, nodename, inp, out, sub):
        x, b = inp
@@ -481,7 +485,9 @@ class GpuSoftmaxWithBias (GpuOp):
                        CudaNdarray_DEV_DATA(%(b)s),
                        CudaNdarray_HOST_STRIDES(%(b)s)[0],

-                        CudaNdarray_DEV_DATA(%(z)s)  //guarantee c contig
+                        CudaNdarray_DEV_DATA(%(z)s),
+                        CudaNdarray_HOST_STRIDES(%(z)s)[0],
+                        CudaNdarray_HOST_STRIDES(%(z)s)[1]
                    );
                CNDA_THREAD_SYNC;
                cudaError_t err = cudaGetLastError();
@@ -503,7 +509,7 @@ class GpuSoftmaxWithBias (GpuOp):
                params=['int M', 'int N',
                    'const float * x', 'const int sx0', 'const int sx1',
                    'const float * b', 'const int sb0',
-                    'float * sm'],
+                    'float * sm', 'const int ssm0', 'const int ssm1'],
                body=[
                    "extern __shared__ float buf[]",
                    "float * buf2 = buf + N",
@@ -516,7 +522,7 @@ class GpuSoftmaxWithBias (GpuOp):
                       "__syncthreads()",
                       inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x'),
                      "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
-                         "sm[blockIDX * N + tx] = buf[tx]",
+                         "sm[blockIDX * ssm0 + tx * ssm1] = buf[tx]",
                      "}",
                      "__syncthreads()",
                    "}",

--- a/theano/sandbox/cuda/tests/test_nnet.py
+++ b/theano/sandbox/cuda/tests/test_nnet.py
@@ -33,7 +33,7 @@ def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
    batch_size = 4097
    n_out = 1250

-    if theano.config.mode != "DEBUG_MODE":
+    if not isinstance(mode_with_gpu, theano.compile.DebugMode):
        n_in = 4098
        n_out = 4099


--- a/theano/sandbox/multinomial.py
+++ b/theano/sandbox/multinomial.py
@@ -147,19 +147,21 @@ class GpuMultinomialFromUniform(MultinomialFromUniform, GpuOp):
        return Apply(self, [pvals, unis], [pvals.type()])

    def c_code_cache_version(self):
-        return (7,)
+        return (8,)

    def c_support_code_apply(self, node, nodename):
        return """
        static __global__ void k_multi_warp_%(nodename)s(
            const int nb_multi,
            const int nb_outcomes,
-            const int pvals_row_strides,
-            const int pvals_col_strides,
-            const int unis_stride,
            float * global_pvals,
+            const int pvals_row_stride,
+            const int pvals_col_stride,
            float * global_unis,
-            float * global_outs
+            const int unis_stride,
+            float * global_outs,
+            const int outs_row_stride,
+            const int outs_col_stride
        )
        {
            // each thread takes care of one multinomial draw
@@ -174,7 +176,7 @@ class GpuMultinomialFromUniform(MultinomialFromUniform, GpuOp):
                float current_out = 0.;
                if (!done)
                {
-                    cummul += global_pvals[m * pvals_col_strides + n * pvals_row_strides];
+                    cummul += global_pvals[m * pvals_col_stride + n * pvals_row_stride];
                    if (unis_n < cummul)
                    {
                        current_out = 1.;
@@ -182,7 +184,7 @@ class GpuMultinomialFromUniform(MultinomialFromUniform, GpuOp):
                    }
                }
                //write out transposed for speed.
-                global_outs[n + m * nb_multi] = current_out;
+                global_outs[n * outs_col_stride + m * outs_row_stride] = current_out;
            }
            }
        }
@@ -262,12 +264,14 @@ class GpuMultinomialFromUniform(MultinomialFromUniform, GpuOp):
            k_multi_warp_%(name)s<<<n_blocks, n_threads, n_shared>>>(
                CudaNdarray_HOST_DIMS(%(z)s)[1],
                CudaNdarray_HOST_DIMS(%(z)s)[0],
+                CudaNdarray_DEV_DATA(%(pvals)s),
                CudaNdarray_HOST_STRIDES(%(pvals)s)[0],
                CudaNdarray_HOST_STRIDES(%(pvals)s)[1],
-                CudaNdarray_HOST_STRIDES(%(unis)s)[0],
-                CudaNdarray_DEV_DATA(%(pvals)s),
                CudaNdarray_DEV_DATA(%(unis)s),
-                CudaNdarray_DEV_DATA(%(z)s)
+                CudaNdarray_HOST_STRIDES(%(unis)s)[0],
+                CudaNdarray_DEV_DATA(%(z)s),
+                CudaNdarray_HOST_STRIDES(%(z)s)[0],
+                CudaNdarray_HOST_STRIDES(%(z)s)[1]
            );
            CNDA_THREAD_SYNC;
            cudaError_t sts = cudaGetLastError();

--- a/theano/sandbox/neighbours.py
+++ b/theano/sandbox/neighbours.py
@@ -314,7 +314,7 @@ class GpuImages2Neibs(Images2Neibs, GpuOp):
                                      dtype=ten4.type.dtype)()])

    def c_code_cache_version(self):
-        return (7,)
+        return (8,)

    def c_support_code_apply(self, node, nodename):
        mode = self.mode
@@ -333,6 +333,7 @@ class GpuImages2Neibs(Images2Neibs, GpuOp):
            const int grid_d,
            const int stride0, const int stride1, const int stride2, const int stride3,
            float * global_ten4,
+            const int out_s0, const int out_s1,
            float * global_out
        )
        {
@@ -375,7 +376,7 @@ class GpuImages2Neibs(Images2Neibs, GpuOp):
                                    int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n;

                                    int z_col = j + d * i;
-                                    int z_idx = z_col + c*d*z_row;
+                                    int z_idx = z_col * out_s1 + z_row * out_s0;
                                    global_out[z_idx] = global_ten4[ten4_idx];
                                }
                            }
@@ -395,6 +396,7 @@ class GpuImages2Neibs(Images2Neibs, GpuOp):
            const int grid_d,
            const int stride0, const int stride1, const int stride2, const int stride3,
            float * global_ten4,
+            const int out_s0, const int out_s1,
            float * global_out
        )
        {
@@ -437,7 +439,7 @@ class GpuImages2Neibs(Images2Neibs, GpuOp):
                                    int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n;

                                    int z_col = j + d * i;
-                                    int z_idx = z_col + c*d*z_row;
+                                    int z_idx = z_col * out_s1 + z_row * out_s0;
                                    global_out[z_idx] = global_ten4[ten4_idx];
                                }
                            }
@@ -573,7 +575,9 @@ class GpuImages2Neibs(Images2Neibs, GpuOp):
                      int, int, int ,int,
                      int, int,
                      int, int, int, int,
-                      float*, float*);
+                      float*,
+                      int, int,
+                      float*);
            if(n_threads.x==d && n_threads.y==c){
                f = k_multi_warp_less_%(name)s;
            }else{
@@ -591,6 +595,8 @@ class GpuImages2Neibs(Images2Neibs, GpuOp):
                CudaNdarray_HOST_STRIDES(%(ten4)s)[2],
                CudaNdarray_HOST_STRIDES(%(ten4)s)[3],
                CudaNdarray_DEV_DATA(%(ten4)s),
+                CudaNdarray_HOST_STRIDES(%(z)s)[0],
+                CudaNdarray_HOST_STRIDES(%(z)s)[1],
                CudaNdarray_DEV_DATA(%(z)s)
            );
            CNDA_THREAD_SYNC;

--- a/theano/sandbox/rng_mrg.py
+++ b/theano/sandbox/rng_mrg.py
@@ -239,7 +239,9 @@ class mrg_uniform(mrg_uniform_base):
        npy_intp odims[%(ndim)s];
        int n_elements = 1;
        int n_streams = 0;
-        int must_alloc_sample = ((NULL == %(o_sample)s) || (%(o_sample)s->nd != %(ndim)s));
+        int must_alloc_sample = ((NULL == %(o_sample)s)
+                                 || (%(o_sample)s->nd != %(ndim)s)
+                                 || !(PyArray_ISCONTIGUOUS(%(o_sample)s)));
        %(otype)s * sample_data;
        npy_int32 * state_data;

@@ -280,7 +282,6 @@ class mrg_uniform(mrg_uniform_base):
            n_elements *= odims[i];
            must_alloc_sample = must_alloc_sample || (%(o_sample)s->dimensions[i] != odims[i]);
            //fprintf(stderr, "size %%i %%i\\n", i, (int)odims[i]);
-            // TODO CHECK STRIDES OF o_sample?
        }
        if (must_alloc_sample)
        {
@@ -370,9 +371,11 @@ class mrg_uniform(mrg_uniform_base):
            state_data_i[5]= x23;
        }
        //////// </ code generated by mrg_uniform>
-        """ %locals()
+        """ % locals()
+
    def c_code_cache_version(self):
-        return (1,)
+        return (2,)
+

 class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
    #GPU VERSION
@@ -496,6 +499,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
        int n_streams, n_streams_used_in_this_call;
        int must_alloc_sample = ((NULL == %(o_sample)s)
                || !CudaNdarray_Check(py_%(o_sample)s)
+                || !CudaNdarray_is_c_contiguous(%(o_sample)s)
                || (%(o_sample)s->nd != %(ndim)s));

        if (%(size)s->nd != 1)
@@ -590,7 +594,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
        //////// </ code generated by mrg_uniform>
        """ %locals()
    def c_code_cache_version(self):
-        return (5,)
+        return (6,)


 def guess_n_streams(size, warn=True):

--- a/theano/sparse/basic.py
+++ b/theano/sparse/basic.py
@@ -1847,6 +1847,9 @@ class StructuredDotGradCSC(gof.Op):
                g_a_data[i_idx] = dot_val
        out[0] = g_a_data

+    def c_code_cache_version(self):
+        return (1,)
+
    def c_code(self, node, name, (_indices, _indptr, _d, _g), (_zout, ), sub):

        if node.inputs[2].type.dtype in ('complex64', 'complex128'):
@@ -1870,17 +1873,13 @@ class StructuredDotGradCSC(gof.Op):
        if( %(_d)s->dimensions[1] != %(_g)s->dimensions[1])
        {PyErr_SetString(PyExc_NotImplementedError, "d and g have different numbers of columns"); %(fail)s;}

-        if (!%(_zout)s)
+        if (!%(_zout)s
+            || (%(_zout)s->dimensions[0] != %(_indices)s->dimensions[0]))
        {
+            Py_XDECREF(%(_zout)s);
            %(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1, %(_indices)s->dimensions, %(_g)s->descr->type_num);
        }

-        if (%(_zout)s->dimensions[0] != %(_indices)s->dimensions[0])
-        {
-            PyErr_SetString(PyExc_NotImplementedError, "somehow _zout got the wrong size.. and I don't know how to resize it.");
-            %(fail)s;
-        }
-
        {   //makes it compile even though labels jump over variable definitions.
            npy_intp nnz = %(_indices)s->dimensions[0];
            npy_intp N =  %(_indptr)s->dimensions[0]-1; //TODO: error checking with this
@@ -1971,6 +1970,9 @@ class StructuredDotGradCSR(gof.Op):
                g_a_data[j_idx] = dot_val
        out[0] = g_a_data

+    def c_code_cache_version(self):
+        return (1,)
+
    def c_code(self, node, name, (_indices, _indptr, _d, _g), (_zout, ), sub):

        if node.inputs[2].type.dtype in ('complex64', 'complex128'):
@@ -1994,17 +1996,13 @@ class StructuredDotGradCSR(gof.Op):
        if( %(_d)s->dimensions[1] != %(_g)s->dimensions[1])
        {PyErr_SetString(PyExc_NotImplementedError, "d and g have different numbers of columns"); %(fail)s;}

-        if (!%(_zout)s)
+        if (!%(_zout)s
+            || (%(_zout)s->dimensions[0] != %(_indices)s->dimensions[0]))
        {
+            Py_XDECREF(%(_zout)s);
            %(_zout)s = (PyArrayObject*) PyArray_SimpleNew(1, %(_indices)s->dimensions, %(_g)s->descr->type_num);
        }

-        if (%(_zout)s->dimensions[0] != %(_indices)s->dimensions[0])
-        {
-            PyErr_SetString(PyExc_NotImplementedError, "somehow _zout got the wrong size.. and I don't know how to resize it.");
-            %(fail)s;
-        }
-
        {   //makes it compile even though labels jump over variable definitions.
            npy_intp nnz = %(_indices)s->dimensions[0];
            // extract number of rows
@@ -2411,8 +2409,6 @@ class UsmmCscDense(gof.Op):
            npy_intp K = %(y)s->dimensions[0];

            // pointers to access actual data in the arrays passed as params.
-            dtype_%(z)s* __restrict__ Dz   = (dtype_%(z)s*)%(z)s->data;
-            dtype_%(zn)s* __restrict__ Dzn   = (dtype_%(zn)s*)%(zn)s->data;
            const dtype_%(x_val)s* __restrict__ Dval = (dtype_%(x_val)s*)%(x_val)s->data;
            const npy_int32 * __restrict__ Dind = (npy_int32*)%(x_ind)s->data;
            const npy_int32 * __restrict__ Dptr = (npy_int32*)%(x_ptr)s->data;
@@ -2428,7 +2424,11 @@ class UsmmCscDense(gof.Op):

            if (!(%(inplace)s))
            {
-                memcpy(Dzn, Dz, M*N*sizeof(dtype_%(zn)s));
+                if (PyArray_CopyInto(%(zn)s, %(z)s))
+                {
+                    Py_XDECREF(%(zn)s);
+                    %(fail)s;
+                }
            }

            for (npy_int32 k = 0; k < K; ++k)
@@ -2439,9 +2439,16 @@ class UsmmCscDense(gof.Op):

                    const dtype_%(x_val)s Amk = alpha * Dval[m_idx * Sval]; // actual value at that location

-                    const dtype_%(y)s* y_row = (dtype_%(y)s*)(%(y)s->data + %(y)s->strides[0] * k);
+                    dtype_%(y)s* y_row = (dtype_%(y)s*)(%(y)s->data + %(y)s->strides[0] * k);
+                    // axpy expects pointer to the beginning of memory arrays,
+                    // so when the stride is negative, we need to get the
+                    // last element
+                    if (Sy < 0)
+                        y_row += (K - 1) * Sy;

-                    const dtype_%(zn)s* z_row = (dtype_%(zn)s*)(%(zn)s->data + %(zn)s->strides[0] * m);
+                    dtype_%(zn)s* z_row = (dtype_%(zn)s*)(%(zn)s->data + %(zn)s->strides[0] * m);
+                    if (Szn < 0)
+                        z_row += (N - 1) * Szn;

                    %(axpy)s((int*)&N, (%(conv_type)s*)&Amk, (%(conv_type)s*)y_row, (int*)&Sy, (%(conv_type)s*)z_row, (int*)&Szn);
                }
@@ -2451,6 +2458,9 @@ class UsmmCscDense(gof.Op):

        return rval

+    def c_code_cache_version(self):
+        return (1,)
+

 usmm_csc_dense = UsmmCscDense(inplace=False)
 usmm_csc_dense_inplace = UsmmCscDense(inplace=True)

--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -2849,40 +2849,47 @@ class Alloc(gof.Op):
            out[0][...] = v  # broadcast v to fill us up

    def c_code(self, node, name, inp, out, sub):
-        # TODO: use the elemwise code generator here
-        if python_all(node.inputs[0].broadcastable):
-            # filling with a scalar is a common use of alloc
-            # that we can implement relatively easily
-            vv = inp[0]
-            zz, = out
-            fail = sub['fail']
-            if node.outputs[0].ndim == 1:
-                N0 = inp[1]
-                return """
-                npy_intp N0 = ((dtype_%(N0)s*)%(N0)s->data)[0];
-                dtype_%(vv)s vv;
-                dtype_%(zz)s* zz;
-                if ((NULL == %(zz)s) || (%(zz)s->dimensions[0] != N0))
-                {
-                    if (%(zz)s) Py_XDECREF(%(zz)s);
-                    %(zz)s = (PyArrayObject*)PyArray_SimpleNew(1,
-                        &N0, type_num_%(vv)s);
-                    if(!%(zz)s) {
-                        PyErr_SetString(PyExc_MemoryError, "alloc failed");
-                        %(fail)s
-                    }
-                }
-                vv = ((dtype_%(vv)s*)%(vv)s->data)[0];
-                zz = ((dtype_%(zz)s*)%(zz)s->data);
-                assert (%(zz)s->strides[0] == sizeof(dtype_%(zz)s));
-                for (int i = 0; i < N0; ++i)
+        vv = inp[0]
+        ndim = len(inp[1:])
+        zz, = out
+        fail = sub['fail']
+
+        code = """
+            npy_intp shape[%(ndim)s];
+            """ % dict(ndim=ndim)
+
+        # Initialize shape
+        for i, shp_i in enumerate(inp[1:]):
+            code += """
+                shape[%(i)s] = ((dtype_%(shp_i)s*) %(shp_i)s->data)[0];
+                """ % dict(i=i, shp_i=shp_i)
+
+        code += """
+            int need_new_out = (NULL == %(zz)s);
+            for (int i = 0; i < %(ndim)s; i++)
+                need_new_out = (need_new_out
+                                || (%(zz)s->dimensions[i] != shape[i]));
+
+            if (need_new_out)
+            {
+                Py_XDECREF(%(zz)s);
+                %(zz)s = (PyArrayObject*) PyArray_SimpleNew(%(ndim)s,
+                    shape, type_num_%(vv)s);
+                if (!%(zz)s)
                {
-                    zz[i] = vv;
+                    PyErr_SetString(PyExc_MemoryError, "alloc failed");
+                    %(fail)s
                }
-                """ % locals()
+            }
+
+            // This function takes care of broadcasting
+            PyArray_CopyInto(%(zz)s, %(vv)s);
+            """ % dict(vv=vv, ndim=ndim, zz=zz, fail=fail)

-        # else pretend this never happened
-        return super(Alloc, self).c_code(node, name, inp, out, sub)
+        return code
+
+    def c_code_cache_version(self):
+        return (1,)

    def infer_shape(self, node, input_shapes):
        return [node.inputs[1:]]

--- a/theano/tensor/blas.py
+++ b/theano/tensor/blas.py
@@ -845,9 +845,15 @@ class Gemm(GemmRelated):
    setup_z_Nz_Sz_outplace = """
        if ((NULL == %(_zout)s)
            || (%(_zout)s->dimensions[0] != %(_z)s->dimensions[0])
-            || (%(_zout)s->dimensions[1] != %(_z)s->dimensions[1]))
+            || (%(_zout)s->dimensions[1] != %(_z)s->dimensions[1])
+            || (%(_zout)s->strides[0] <= 0)
+            || (%(_zout)s->strides[1] <= 0)
+            || (%(_zout)s->strides[0] MOD type_size)
+            || (%(_zout)s->strides[1] MOD type_size)
+            || ((%(_zout)s->strides[0] != type_size)
+                && (%(_zout)s->strides[1] != type_size)))
        {
-            if (%(_zout)s) Py_XDECREF(%(_zout)s);
+            Py_XDECREF(%(_zout)s);
            npy_intp dims[2];
            dims[0] = %(_z)s->dimensions[0];
            dims[1] = %(_z)s->dimensions[1];
@@ -862,42 +868,44 @@ class Gemm(GemmRelated):
        }
        Nz = %(_zout)s->dimensions;
        Sz = %(_zout)s->strides;
-        if (1) // COPY z -> zout
+
+        if (%(_zout)s->descr->type_num == PyArray_FLOAT)
        {
-            if (%(_zout)s->descr->type_num == PyArray_FLOAT)
+            float * zoutdata = (float*)%(_zout)s->data;
+            int zoi = Sz[0] / sizeof(float);
+            int zoj = Sz[1] / sizeof(float);
+            const float * zdata = (float*)%(_z)s->data;
+            int zi = %(_z)s->strides[0]/sizeof(float);
+            int zj = %(_z)s->strides[1]/sizeof(float);
+            for (int i = 0; i < Nz[0]; ++i)
            {
-                float * zoutdata = (float*)%(_zout)s->data;
-                const float * zdata = (float*)%(_z)s->data;
-                int zi = %(_z)s->strides[0]/sizeof(float);
-                int zj = %(_z)s->strides[1]/sizeof(float);
-                for (int i = 0; i < Nz[0]; ++i)
+                for (int j = 0; j < Nz[1]; ++j)
                {
-                    for (int j = 0; j < Nz[1]; ++j)
-                    {
-                        zoutdata[i*Nz[1]+j] = zdata[zi*i+zj*j];
-                    }
+                    zoutdata[zoi*i + zoj*j] = zdata[zi*i + zj*j];
                }
            }
-            else if (%(_zout)s->descr->type_num == PyArray_DOUBLE)
+        }
+        else if (%(_zout)s->descr->type_num == PyArray_DOUBLE)
+        {
+            double * zoutdata = (double*) %(_zout)s->data;
+            int zoi = Sz[0] / sizeof(double);
+            int zoj = Sz[1] / sizeof(double);
+            const double * zdata = (double*)%(_z)s->data;
+            int zi = %(_z)s->strides[0]/sizeof(double);
+            int zj = %(_z)s->strides[1]/sizeof(double);
+            for (int i = 0; i < Nz[0]; ++i)
            {
-                double * zoutdata = (double*) %(_zout)s->data;
-                const double * zdata = (double*)%(_z)s->data;
-                int zi = %(_z)s->strides[0]/sizeof(double);
-                int zj = %(_z)s->strides[1]/sizeof(double);
-                for (int i = 0; i < Nz[0]; ++i)
+                for (int j = 0; j < Nz[1]; ++j)
                {
-                    for (int j = 0; j < Nz[1]; ++j)
-                    {
-                        zoutdata[i*Nz[1]+j] = zdata[zi*i+zj*j];
-                    }
+                    zoutdata[zoi*i + zoj*j] = zdata[zi*i + zj*j];
                }
            }
-            else
-            {
-                PyErr_SetString(PyExc_AssertionError,
-                                "neither float nor double dtype");
-                %(fail)s
-            }
+        }
+        else
+        {
+            PyErr_SetString(PyExc_AssertionError,
+                            "neither float nor double dtype");
+            %(fail)s
        }
        """

@@ -938,7 +946,7 @@ class Gemm(GemmRelated):
    def c_code_cache_version(self):
        gv = self.build_gemm_version()
        if gv:
-            return (3,) + gv
+            return (4,) + gv
        else:
            return gv


--- a/theano/tensor/nnet/ConvTransp3D.py
+++ b/theano/tensor/nnet/ConvTransp3D.py
@@ -12,7 +12,7 @@ class ConvTransp3D(theano.Op):
        return hash(type(self))

    def c_code_cache_version(self):
-        return (2,)
+        return (3,)

    def make_node(self, W, b, d, H, RShape = None):
        """
@@ -232,23 +232,8 @@ class ConvTransp3D(theano.Op):
                                       }
                                   }

-                                   for (int i = 0; i < 3; i++)
-                                       if (%(R)s->strides[i] < %(R)s->strides[4])
-                                       {
-                                           PyErr_Format(PyExc_ValueError, "ConvTransp3D: R must have the smallest stride in its last index, but it doesn't (if this is a problem, the only part of ConvTransp3D that depends on this conditions is the memset, so this is probably easy to fix)");
-                                           %(fail)s
-                                       }
-
                                   { // for fail 6

-
-                                       memset(%(R)s->data, 0,  (batchSize-1) * %(R)s->strides[0]+ inputChannels * %(R)s->strides[4] +
-                                          (videoHeight-1) * %(R)s->strides[1] +
-                                          (videoWidth-1)  * %(R)s->strides[2] +
-                                          (videoDur-1)    * %(R)s->strides[3]);
-
-
-
                                       #define ELEM5(x, i,j,k,l,m) * ( dtype_ ## x *) ( x->data + (i)*x->strides[0]+(j)*x->strides[1]+(k)*x->strides[2]+(l)*x->strides[3]+(m)*x->strides[4] )
                                       #define ELEM_AT(x, i) * ( dtype_ ## x *) ( x->data + (i) )