Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
P
pytensor
项目
项目
详情
活动
周期分析
仓库
仓库
文件
提交
分支
标签
贡献者
图表
比较
统计图
议题
0
议题
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
CI / CD
CI / CD
流水线
作业
日程
统计图
Wiki
Wiki
代码片段
代码片段
成员
成员
折叠边栏
关闭边栏
活动
图像
聊天
创建新问题
作业
提交
问题看板
Open sidebar
testgroup
pytensor
Commits
06cdb1fa
提交
06cdb1fa
authored
1月 30, 2014
作者:
abergeron
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #1710 from nouiz/gpu_reduce
new Gpu reduce based on the version in Theano that is much faster.
上级
262f59a2
1121fe45
显示空白字符变更
内嵌
并排
正在显示
17 个修改的文件
包含
2115 行增加
和
163 行删除
+2115
-163
ctype.txt
doc/extending/ctype.txt
+17
-0
__init__.py
theano/compile/__init__.py
+1
-0
ops.py
theano/compile/ops.py
+88
-0
blas.py
theano/sandbox/cuda/blas.py
+4
-4
conv.cu
theano/sandbox/cuda/conv.cu
+1
-1
conv_kernel.cu
theano/sandbox/cuda/conv_kernel.cu
+1
-1
basic_ops.py
theano/sandbox/gpuarray/basic_ops.py
+0
-9
elemwise.py
theano/sandbox/gpuarray/elemwise.py
+1793
-12
opt.py
theano/sandbox/gpuarray/opt.py
+52
-20
test_basic_ops.py
theano/sandbox/gpuarray/tests/test_basic_ops.py
+2
-2
test_elemwise.py
theano/sandbox/gpuarray/tests/test_elemwise.py
+88
-3
test_opt.py
theano/sandbox/gpuarray/tests/test_opt.py
+2
-2
type.py
theano/sandbox/gpuarray/type.py
+17
-0
basic.py
theano/tensor/basic.py
+2
-81
nnet.py
theano/tensor/nnet/nnet.py
+1
-1
test_elemwise.py
theano/tensor/tests/test_elemwise.py
+27
-27
type.py
theano/tensor/type.py
+19
-0
没有找到文件。
doc/extending/ctype.txt
浏览文件 @
06cdb1fa
...
@@ -553,3 +553,20 @@ respectively. See an example for the type ``CudaNdarrayType`` (GPU
...
@@ -553,3 +553,20 @@ respectively. See an example for the type ``CudaNdarrayType`` (GPU
array) in the file `theano/sandbox/cuda/type.py`. The version
array) in the file `theano/sandbox/cuda/type.py`. The version
parameter is what is returned by ViewOp.c_code_cache_version(). By
parameter is what is returned by ViewOp.c_code_cache_version(). By
default, it will recompile the c code for each process.
default, it will recompile the c code for each process.
Shape and Shape_i
=================
We have 2 generic Ops Shape and Shape_i that return the shape of any
Theano Variable that have a shape attribute and Shape_i return only of
the element of the shape.
.. code-block:: python
theano.compile.ops.register_shape_c_code(YOUR_TYPE_CLASS, THE_C_CODE, version=())
theano.compile.ops.register_shape_i_c_code(YOUR_TYPE_CLASS, THE_C_CODE, version=())
The c code work as the ViewOp. Shape_i have the additional i parameter
that you can use with %(i)s.
theano/compile/__init__.py
浏览文件 @
06cdb1fa
from
theano.compile.ops
import
(
from
theano.compile.ops
import
(
DeepCopyOp
,
deep_copy_op
,
register_deep_copy_op_c_code
,
DeepCopyOp
,
deep_copy_op
,
register_deep_copy_op_c_code
,
Shape
,
shape
,
register_shape_c_code
,
Shape_i
,
register_shape_i_c_code
,
Shape_i
,
register_shape_i_c_code
,
ViewOp
,
view_op
,
register_view_op_c_code
)
ViewOp
,
view_op
,
register_view_op_c_code
)
...
...
theano/compile/ops.py
浏览文件 @
06cdb1fa
...
@@ -181,6 +181,94 @@ class DeepCopyOp(gof.Op):
...
@@ -181,6 +181,94 @@ class DeepCopyOp(gof.Op):
deep_copy_op
=
DeepCopyOp
()
deep_copy_op
=
DeepCopyOp
()
def
register_shape_c_code
(
type
,
code
,
version
=
()):
""" Tell Shape Op how to generate C code for a Theano Type
:param typ: A Theano type. It must be the Theano class itself and not an
instance of the class.
:param code: C code that deep copies the Theano type 'typ'.
Use
%(iname)
s and
%(oname)
s for the input and output C
variable names respectively.
:param version: A number indicating the version of the code, for cache.
"""
Shape
.
c_code_and_version
[
type
]
=
(
code
,
version
)
class
Shape
(
gof
.
Op
):
"""
L{Op} to return the shape of a matrix.
@note: Non-differentiable.
"""
# Mapping from Type to C code (and version) to use.
# In the C code, the name of the input variable is %(iname)s,
# the output variable is %(oname)s.
c_code_and_version
=
{}
def
__hash__
(
self
):
return
hash
(
type
(
self
))
def
__eq__
(
self
,
other
):
return
type
(
self
)
==
type
(
other
)
def
__str__
(
self
):
return
self
.
__class__
.
__name__
def
make_node
(
self
,
x
):
# Must work for all type that have a shape attribute.
# This will fail at execution time.
if
not
isinstance
(
x
,
theano
.
Variable
):
x
=
theano
.
tensor
.
as_tensor_variable
(
x
)
return
gof
.
Apply
(
self
,
[
x
],
[
theano
.
tensor
.
lvector
()])
def
perform
(
self
,
node
,
inp
,
out_
):
x
,
=
inp
out
,
=
out_
out
[
0
]
=
theano
.
_asarray
(
x
.
shape
,
dtype
=
'int64'
)
def
infer_shape
(
self
,
node
,
in_shapes
):
return
[[
len
(
in_shapes
[
0
])]]
def
connection_pattern
(
self
,
node
):
# the grad returns the gradient with respect to the
# elements of a tensor variable
# the elements of the tensor variable do not participate
# in the computation of the shape, so they are not really
# part of the graph
return
[[
False
]]
def
grad
(
self
,
inp
,
grads
):
# the grad returns the gradient with respect to the
# elements of a tensor variable
# the elements of the tensor variable do not participate
# in the computation of the shape, so they are not really
# part of the graph
return
[
DisconnectedType
()()]
def
R_op
(
self
,
inputs
,
eval_points
):
return
[
None
]
def
c_code
(
self
,
node
,
name
,
inames
,
onames
,
sub
):
iname
,
=
inames
oname
,
=
onames
fail
=
sub
[
'fail'
]
itype
=
node
.
inputs
[
0
]
.
type
.
__class__
if
itype
in
self
.
c_code_and_version
:
code
,
version
=
self
.
c_code_and_version
[
itype
]
return
code
%
locals
()
# Else, no C code
return
super
(
Shape
,
self
)
.
c_code
(
node
,
name
,
inames
,
onames
,
sub
)
def
c_code_cache_version
(
self
):
return
(
1
,)
shape
=
Shape
()
_shape
=
shape
# was used in the past, now use shape directly.
#pprint.assign(_shape, printing.MemberPrinter('shape'))
class
Shape_i
(
gof
.
Op
):
class
Shape_i
(
gof
.
Op
):
"""
"""
L{Op} to return the shape of a matrix.
L{Op} to return the shape of a matrix.
...
...
theano/sandbox/cuda/blas.py
浏览文件 @
06cdb1fa
...
@@ -523,21 +523,21 @@ class GpuConv(GpuOp):
...
@@ -523,21 +523,21 @@ class GpuConv(GpuOp):
imshp
=
None
,
imshp
=
None
,
max_threads_dim0
=
None
):
max_threads_dim0
=
None
):
"""
"""
:param version: each version of c_code implement many kernel for the
:param version: each version of c_code implement
s
many kernel for the
convolution. By default we try to guess the best one.
convolution. By default we try to guess the best one.
You can force one version with this parameter. This
You can force one version with this parameter. This
parameter is used by the tests.
parameter is used by the tests.
:param verbose: for value of 1,2 and 3. Print more information during
:param verbose: for value of 1,2 and 3. Print more information during
the execution of the convolution. Mostly used for
the execution of the convolution. Mostly used for
optimization or debugging.
optimization or debugging.
:param kshp: The size of the kernel. If provided, can genera
:param kshp: The size of the kernel. If provided, can genera
te
faster code. If the GpuConv op is automatically
faster code. If the GpuConv op is automatically
inserted,
inserted,
we take its value automatically from the Conv op.
we take its value automatically from the Conv op.
:param imshp: The size of the image. Not used for code generation but
:param imshp: The size of the image. Not used for code generation but
allow to select an experimental new version in another
allow
s
to select an experimental new version in another
repo.
repo.
:param max_threads_dim0: The maximum number of thread for the
:param max_threads_dim0: The maximum number of thread
s
for the
block size dimensions 0 (blockDim.x) used by the
block size dimensions 0 (blockDim.x) used by the
GPU function.
GPU function.
...
...
theano/sandbox/cuda/conv.cu
浏览文件 @
06cdb1fa
// REMEMBER TO
RAI
SE c_code_cache_version when changing this file
// REMEMBER TO
INCREA
SE c_code_cache_version when changing this file
//
//
enum
{
ConvMode_FULL
,
ConvMode_VALID
};
enum
{
ConvMode_FULL
,
ConvMode_VALID
};
PyObject
*
CudaNdarray_Conv
(
CudaNdarray
*
img
,
CudaNdarray
*
kern
,
CudaNdarray
*
out
,
const
int
mode
,
const
int
subsample_rows
,
const
int
subsample_cols
,
const
int
version
,
const
int
verbose
);
PyObject
*
CudaNdarray_Conv
(
CudaNdarray
*
img
,
CudaNdarray
*
kern
,
CudaNdarray
*
out
,
const
int
mode
,
const
int
subsample_rows
,
const
int
subsample_cols
,
const
int
version
,
const
int
verbose
);
...
...
theano/sandbox/cuda/conv_kernel.cu
浏览文件 @
06cdb1fa
// REMEMBER TO
RAI
SE c_code_cache_version when changing this file
// REMEMBER TO
INCREA
SE c_code_cache_version when changing this file
//
//
//implement the valid convolution only
//implement the valid convolution only
...
...
theano/sandbox/gpuarray/basic_ops.py
浏览文件 @
06cdb1fa
...
@@ -654,15 +654,6 @@ class GpuAlloc(HideC, Alloc):
...
@@ -654,15 +654,6 @@ class GpuAlloc(HideC, Alloc):
gpu_alloc
=
GpuAlloc
()
gpu_alloc
=
GpuAlloc
()
class
GpuShape
(
HideC
,
tensor
.
Shape
):
"""
Implement Shape on the gpu.
"""
def
make_node
(
self
,
x
):
return
Apply
(
self
,
[
x
],
[
tensor
.
lvector
()])
gpu_shape
=
GpuShape
()
class
GpuReshape
(
HideC
,
tensor
.
Reshape
):
class
GpuReshape
(
HideC
,
tensor
.
Reshape
):
"""
"""
Implement Reshape on the gpu.
Implement Reshape on the gpu.
...
...
theano/sandbox/gpuarray/elemwise.py
浏览文件 @
06cdb1fa
import
copy
import
copy
from
itertools
import
izip
from
itertools
import
izip
from
StringIO
import
StringIO
import
numpy
import
numpy
from
theano
import
Op
,
Apply
,
scalar
,
config
from
theano
import
Op
,
Apply
,
scalar
,
config
from
theano.tensor.elemwise
import
Elemwise
,
DimShuffle
,
CAReduceDtype
from
theano
import
scalar
as
scal
from
theano.scalar
import
Scalar
from
theano.tensor.elemwise
import
(
Elemwise
,
DimShuffle
,
CAReduce
,
CAReduceDtype
)
from
theano.sandbox.cuda.nvcc_compiler
import
NVCC_compiler
from
theano.sandbox.cuda.nvcc_compiler
import
NVCC_compiler
try
:
try
:
...
@@ -165,7 +169,8 @@ class GpuElemwise(HideC, Elemwise):
...
@@ -165,7 +169,8 @@ class GpuElemwise(HideC, Elemwise):
return
ElemwiseKernel
(
None
,
inps
+
outs
,
kop
,
preamble
=
support_code
)
return
ElemwiseKernel
(
None
,
inps
+
outs
,
kop
,
preamble
=
support_code
)
def
c_headers
(
self
):
def
c_headers
(
self
):
return
[
'cuda.h'
,
'<compyte/extension.h>'
,
'<numpy_compat.h>'
]
return
[
'cuda.h'
,
'<compyte/extension.h>'
,
'<numpy_compat.h>'
,
'<compyte/ext_cuda.h>'
]
def
c_compiler
(
self
):
def
c_compiler
(
self
):
return
NVCC_compiler
return
NVCC_compiler
...
@@ -201,8 +206,7 @@ class GpuElemwise(HideC, Elemwise):
...
@@ -201,8 +206,7 @@ class GpuElemwise(HideC, Elemwise):
#define GDIM_1 gridDim.y
#define GDIM_1 gridDim.y
#define GDIM_2 gridDim.z
#define GDIM_2 gridDim.z
"""
"""
res
=
[
"CUdeviceptr (*cuda_get_ptr)(gpudata *g);"
,
res
=
[
CLUDA_PREAMBLE
]
CLUDA_PREAMBLE
]
for
i
in
range
(
0
,
nd
+
1
):
for
i
in
range
(
0
,
nd
+
1
):
res
.
append
(
k
.
render_basic
(
i
,
name
=
"elem_"
+
str
(
i
))
+
';'
)
res
.
append
(
k
.
render_basic
(
i
,
name
=
"elem_"
+
str
(
i
))
+
';'
)
res
.
append
(
k
.
contig_src
+
';'
)
res
.
append
(
k
.
contig_src
+
';'
)
...
@@ -210,8 +214,7 @@ class GpuElemwise(HideC, Elemwise):
...
@@ -210,8 +214,7 @@ class GpuElemwise(HideC, Elemwise):
return
'
\n
'
.
join
(
res
)
return
'
\n
'
.
join
(
res
)
def
c_init_code
(
self
):
def
c_init_code
(
self
):
return
[
'cuda_get_ptr = (CUdeviceptr (*)(gpudata *g))'
return
[
'setup_ext_cuda();'
]
'compyte_get_extension("cuda_get_ptr");'
]
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
def
c_code
(
self
,
node
,
name
,
inputs
,
outputs
,
sub
):
nd
=
node
.
outputs
[
0
]
.
ndim
nd
=
node
.
outputs
[
0
]
.
ndim
...
@@ -413,7 +416,7 @@ class GpuElemwise(HideC, Elemwise):
...
@@ -413,7 +416,7 @@ class GpuElemwise(HideC, Elemwise):
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
ver
=
self
.
scalar_op
.
c_code_cache_version
()
ver
=
self
.
scalar_op
.
c_code_cache_version
()
if
ver
:
if
ver
:
return
(
1
,
ver
)
return
(
2
,
ver
)
else
:
else
:
return
ver
return
ver
...
@@ -515,10 +518,1786 @@ class GpuDimShuffle(HideC, DimShuffle):
...
@@ -515,10 +518,1786 @@ class GpuDimShuffle(HideC, DimShuffle):
return
process
return
process
def
c_code_cache_version
(
self
):
def
c_code_cache_version
(
self
):
return
(
3
,)
return
(
4
,)
class
GpuCAReduce
(
GpuKernelBase
,
HideC
,
CAReduceDtype
):
class
GpuCAReduceCuda
(
HideC
,
CAReduce
):
"""GpuCAReduceCuda is a Reduction along some dimensions by a scalar op.
The dimensions along which to reduce is specified by the
`reduce_mask` that you pass to the constructor. The `reduce_mask`
is a tuple of booleans (actually integers 0 or 1) that specify for
each input dimension, whether to reduce it (1) or not (0).
For example, when scalar_op is a theano.scalar.basic.Add instance:
- reduce_mask == (1,) sums a vector to a scalar
- reduce_mask == (1,0) computes the sum of each column in a matrix
- reduce_mask == (0,1) computes the sum of each row in a matrix
- reduce_mask == (1,1,1) computes the sum of all elements in a 3-tensor.
:note: any reduce_mask of all zeros is a sort of 'copy', and may
be removed during graph optimization
This Op is a work in progress.
This op was recently upgraded from just GpuSum a general CAReduce. Not
many code cases are supported for scalar_op being anything other than
scal.Add instances yet.
Important note: if you implement new cases for this op, be sure to
benchmark them and make sure that they actually result in a speedup.
GPUs are not especially well-suited to reduction operations so it is
quite possible that the GPU might be slower for some cases.
"""
def
__init__
(
self
,
scalar_op
,
axis
=
None
,
reduce_mask
=
None
):
if
reduce_mask
is
not
None
:
reduce_mask
=
tuple
(
reduce_mask
)
self
.
reduce_mask
=
reduce_mask
# used to make sure that calls to scalar op
# have unique name arguments
self
.
_n_scalar_op_calls
=
0
if
not
hasattr
(
scalar_op
,
'identity'
):
raise
ValueError
(
"No identity on scalar op"
)
CAReduce
.
__init__
(
self
,
scalar_op
,
axis
=
axis
)
def
__eq__
(
self
,
other
):
return
(
type
(
self
)
==
type
(
other
)
and
self
.
axis
==
other
.
axis
and
self
.
reduce_mask
==
other
.
reduce_mask
and
self
.
scalar_op
==
other
.
scalar_op
)
def
__hash__
(
self
):
return
(
hash
(
type
(
self
))
^
hash
(
self
.
axis
)
^
hash
(
self
.
reduce_mask
)
^
hash
(
type
(
self
.
scalar_op
)))
def
__str__
(
self
):
ax
=
''
if
self
.
axis
is
not
None
:
ax
=
'{
%
s}'
%
(
', '
.
join
(
str
(
x
)
for
x
in
self
.
axis
),)
return
"GpuCAReduceCuda{
%
s}
%
s"
%
(
str
(
self
.
scalar_op
),
ax
)
def
make_node
(
self
,
x
):
x
=
as_gpuarray_variable
(
x
)
assert
x
.
dtype
==
"float32"
ret
=
super
(
GpuCAReduceCuda
,
self
)
.
make_node
(
x
)
self
=
copy
.
copy
(
self
)
self
.
axis
=
ret
.
op
.
axis
if
self
.
reduce_mask
is
None
:
if
self
.
axis
is
None
:
reduce_mask
=
[
1
]
*
x
.
type
.
ndim
else
:
reduce_mask
=
[
0
]
*
x
.
type
.
ndim
for
a
in
self
.
axis
:
assert
reduce_mask
[
a
]
==
0
reduce_mask
[
a
]
=
1
self
.
reduce_mask
=
tuple
(
reduce_mask
)
if
(
x
.
type
.
ndim
!=
len
(
self
.
reduce_mask
)):
raise
TypeError
(
"x must have rank
%
i"
%
len
(
self
.
reduce_mask
))
return
Apply
(
self
,
[
x
],
[
GpuArrayType
(
x
.
dtype
,
ret
.
outputs
[
0
]
.
type
.
broadcastable
)()])
"""
This method must be commented, because there's no way
to communicate that it's OK to call for + but not for
max
def perform(self, node, inp, out):
x, = inp
z, = out
# reduce_max is declared but does nothing but
# raise NotImplementedError.
# We can't call it here anyway because it hasn't
# been added to the python bindings yet
z[0] = x.reduce_sum(self.reduce_mask)
"""
def
perform
(
self
,
node
,
inp
,
out
):
raise
MethodNotDefined
(
""
)
def
supports_c_code
(
self
,
inputs
):
""" Returns True if the current op and reduce pattern
has functioning C code """
# If we don't even have the right method, we certainly
# don't support the C code
# (This is the test that used to be implemented by
# local_gpu_sum)
pattern
=
(
''
.
join
(
str
(
i
)
for
i
in
self
.
reduce_mask
))
if
not
hasattr
(
self
,
'c_code_reduce_
%
s'
%
pattern
):
return
False
# Now that this is a general reduction op, we might
# have a method for a pattern, but that pattern
# might not be implemented for the current scalar op.
# To detect this more complicated situation, we
# make fake arguments to c_code, try to run them,
# and see if NotImplementedError gets raised.
node
=
self
.
make_node
(
*
inputs
)
name
=
'fake_name'
inp
=
[
'fake_input_name_
%
d'
%
i
for
i
in
xrange
(
len
(
inputs
))]
out
=
[
'fake_output_name_
%
d'
%
i
for
i
in
xrange
(
len
(
node
.
outputs
))]
sub
=
{
'fail'
:
'fake failure code'
}
try
:
self
.
c_code
(
node
,
name
,
inp
,
out
,
sub
)
self
.
c_support_code_apply
(
node
,
name
)
except
NotImplementedError
:
return
False
return
True
def
c_headers
(
self
):
return
[
'cuda.h'
,
'<compyte/extension.h>'
,
'<numpy_compat.h>'
,
'<compyte/ext_cuda.h>'
]
def
c_compiler
(
self
):
return
NVCC_compiler
def
c_init_code
(
self
):
return
[
'setup_ext_cuda();'
]
def
c_code
(
self
,
node
,
name
,
inp
,
out
,
sub
):
x
,
=
inp
z
,
=
out
nd_in
=
node
.
inputs
[
0
]
.
type
.
ndim
nd_out
=
node
.
outputs
[
0
]
.
type
.
ndim
assert
nd_in
-
nd_out
==
sum
(
self
.
reduce_mask
)
sio
=
StringIO
()
fail
=
sub
[
'fail'
]
#check input
print
>>
sio
,
"""
if (PyGpuArray_NDIM(
%(x)
s) !=
%(nd_in)
s)
{
PyErr_Format(PyExc_TypeError,
"required nd=
%(nd_in)
s, got nd=
%%
i", PyGpuArray_NDIM(
%(x)
s));
%(fail)
s;
}
"""
%
locals
()
# It might be nice to use a property of the op class to do this,
# but tensor.elemwise.CAReduce has this exact same check so I guess
# this is OK to do
if
self
.
scalar_op
in
[
scal
.
minimum
,
scal
.
maximum
]:
conds
=
[
"(PyGpuArray_DIMS(
%
s)[
%
d] == 0)"
%
(
x
,
i
)
for
i
in
xrange
(
nd_in
)
if
self
.
reduce_mask
[
i
]]
assert
len
(
conds
)
>
0
cond
=
"("
+
" || "
.
join
(
conds
)
+
")"
print
>>
sio
,
"""
if
%(cond)
s
{
PyErr_Format(PyExc_ValueError," tried to reduce a 0-length axis.");
%(fail)
s;
}
"""
%
locals
()
#
# alloc an output if we need one
#
# check the basics of out output
print
>>
sio
,
"""
if ( !
%(z)
s
|| (PyGpuArray_NDIM(
%(z)
s) !=
%(nd_out)
s)
"""
%
locals
()
#ensure that the output has the right non-reduced dimensions
j
=
0
for
i
in
xrange
(
nd_in
):
if
not
self
.
reduce_mask
[
i
]:
print
>>
sio
,
" || (PyGpuArray_DIMS(
%(z)
s)[
%(j)
s] != PyGpuArray_DIMS(
%(x)
s)[
%(i)
d]) "
%
locals
()
j
+=
1
print
>>
sio
,
"""
)
{
"""
%
locals
()
if
nd_out
>
0
:
print
>>
sio
,
"size_t new_dims[
%(nd_out)
s]; "
%
locals
()
else
:
print
>>
sio
,
"size_t *new_dims=NULL; "
j
=
0
for
i
in
xrange
(
nd_in
):
if
not
self
.
reduce_mask
[
i
]:
print
>>
sio
,
'new_dims[
%(j)
s] = PyGpuArray_DIMS(
%(x)
s)[
%(i)
s];'
%
locals
()
j
+=
1
out_typecode
=
dtype_to_typecode
(
node
.
outputs
[
0
]
.
dtype
)
print
>>
sio
,
"""
Py_XDECREF(
%(z)
s);
%(z)
s = pygpu_empty(
%(nd_out)
s, new_dims,
%(out_typecode)
s, GA_C_ORDER,
pygpu_default_context(),
Py_None);
if (NULL ==
%(z)
s)
{
PyErr_Format(PyExc_RuntimeError, "Failed to allocate output");
%(fail)
s;
}
}
"""
%
locals
()
# \begin bracket the reduction in a check that there is
# actually work to do
if
getattr
(
self
.
scalar_op
,
'identity'
,
None
)
==
0
:
zero_shp
=
"cudaMemset((float *)(((char *)cuda_get_ptr(
%(z)
s->ga.data))+
%(z)
s->ga.offset), 0, PyGpuArray_SIZE(
%(z)
s) * sizeof(float))"
%
locals
()
#TODO: elif getattr(self.scalar_op, 'identity', None) == 1:
else
:
scalar_op
=
self
.
scalar_op
zero_shp
=
"""
PyErr_Format(PyExc_NotImplementedError,
"GpuCAReduceCuda not implemented when input shape is 0"
" for this scalar_op:
%(scalar_op)
s");
%(fail)
s;
"""
%
locals
()
print
>>
sio
,
"""
if (PyGpuArray_SIZE(
%(z)
s) && ! PyGpuArray_SIZE(
%(x)
s)){
%(zero_shp)
s;
}
else if (PyGpuArray_SIZE(
%(z)
s))
{
"""
%
locals
()
#
# Now perform the reduction
#
if
all
(
i
==
1
for
i
in
self
.
reduce_mask
):
#check if the tensor is ccontiguous, if true, use the c_code_reduce_ccontig code.
#TODO: check if we are ccontiguous when we un-dimshuffle
#TODO: if only some dims are ccontiguous, call version with less dims.
print
>>
sio
,
'if(
%(x)
s->ga.flags & GA_C_CONTIGUOUS){'
%
locals
()
self
.
c_code_reduce_ccontig
(
sio
,
node
,
name
,
x
,
z
,
fail
)
print
>>
sio
,
"}else{"
getattr
(
self
,
'c_code_reduce_
%
s'
%
(
''
.
join
(
str
(
i
)
for
i
in
self
.
reduce_mask
)))(
sio
,
node
,
name
,
x
,
z
,
fail
)
print
>>
sio
,
"}"
else
:
getattr
(
self
,
'c_code_reduce_
%
s'
%
(
''
.
join
(
str
(
i
)
for
i
in
self
.
reduce_mask
)))(
sio
,
node
,
name
,
x
,
z
,
fail
)
# \end bracket the reduction ...
print
>>
sio
,
"""
}
"""
%
locals
()
return
sio
.
getvalue
()
def
_makecall
(
self
,
node
,
name
,
x
,
z
,
fail
,
pattern
=
None
):
"""Return a string for making a kernel call.
The return value looks something like:
.. code-block:: c
if (verbose)
printf("running kernel_reduce_10_
%(name)
s
\\
n");
int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
kernel_reduce_10_
%(name)
s<<<n_blocks, n_threads,
n_shared>>>(
PyGpuArray_DIMS(
%(x)
s)[0],
PyGpuArray_DIMS(
%(x)
s)[1],
(float *)(((char *)cuda_get_ptr(
%(x)
s->ga.data))+
%(x)
s->ga.offset),
PyGpuArray_STRIDES(
%(x)
s)[0]/4,
PyGpuArray_STRIDES(
%(x)
s)[1]/4,
(float *)(((char *)cuda_get_ptr(
%(z)
s->ga.data))+
%(z)
s->ga.offset),
PyGpuArray_STRIDES(
%(z)
s)[0]/4
);
[
if config.gpuarray.sync:
code += "GpuArray_sync(&
%(zz)
s->ga);
\n
"
%
dict(zz=zz)
]
if (cudaSuccess != cudaGetLastError())
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: ... );
%(fail)
s;
}
"""
sio
=
StringIO
()
if
pattern
is
None
:
pattern
=
''
.
join
(
str
(
c
)
for
c
in
self
.
reduce_mask
)
ndim
=
len
(
self
.
reduce_mask
)
nd_out
=
ndim
-
sum
(
self
.
reduce_mask
)
shapes_format
=
"shape=(
%
s)"
%
","
.
join
([
"
%
llu"
]
*
node
.
inputs
[
0
]
.
ndim
)
shapes_data
=
","
.
join
([
"(unsigned long long) PyGpuArray_DIMS(
%
s)[
%
d]"
%
(
x
,
i
)
for
i
in
range
(
node
.
inputs
[
0
]
.
ndim
)])
print
>>
sio
,
"""
if (verbose)
printf("running kernel_reduce_
%(pattern)
s_
%(name)
s
\\
n");
int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
if (verbose>1)
printf("n_threads.x=
%%
d, n_threads.y=
%%
d, n_threads.z=
%%
d,"
" nb_threads=
%%
d, n_blocks.x=
%%
d, n_blocks.y=
%%
d,"
" nb_block=
%%
d, n_shared=
%%
d,
%(shapes_format)
s
\\
n",
n_threads.x,n_threads.y,n_threads.z,
n_threads.x*n_threads.y*n_threads.z,
n_blocks.x,n_blocks.y,
n_blocks.x*n_blocks.y, n_shared,
%(shapes_data)
s);
kernel_reduce_
%(pattern)
s_
%(name)
s<<<n_blocks, n_threads, n_shared>>>(
"""
%
locals
()
for
i
in
xrange
(
ndim
):
print
>>
sio
,
"""
PyGpuArray_DIMS(
%(x)
s)[
%(i)
s],
"""
%
locals
()
print
>>
sio
,
"""
(float *)(((char *)cuda_get_ptr(
%(x)
s->ga.data))+
%(x)
s->ga.offset)
"""
%
locals
()
for
i
in
xrange
(
ndim
):
print
>>
sio
,
"""
,PyGpuArray_STRIDES(
%(x)
s)[
%(i)
s]/4
"""
%
locals
()
print
>>
sio
,
"""
,(float *)(((char *)cuda_get_ptr(
%(z)
s->ga.data))+
%(z)
s->ga.offset)
"""
%
locals
()
for
i
in
xrange
(
nd_out
):
print
>>
sio
,
"""
,PyGpuArray_STRIDES(
%(z)
s)[
%(i)
s]/4
"""
%
locals
()
sync
=
""
if
config
.
gpuarray
.
sync
:
sync
=
"""GpuArray_sync(&
%(z)
s->ga);"""
%
locals
()
print
>>
sio
,
"""
);
%(sync)
s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error:
%%
s:
%%
s."
" (grid:
%%
i x
%%
i; block:
%%
i x
%%
i x
%%
i)"
"
%(shapes_format)
s
\\
n",
"kernel_reduce_
%(pattern)
s_
%(name)
s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z,
%(shapes_data)
s);
%(fail)
s;
}
"""
%
locals
()
return
sio
.
getvalue
()
def
_k_decl
(
self
,
node
,
nodename
,
pattern
=
None
,
ndim
=
None
,
reduce_mask
=
None
):
"""Return a string to declare a kernel function
The result will look something like this:
.. code-block:: c
static __global__ void kernel_reduce_110_
%(nodename)
s(
const int d0,
const int d1,
const int d2,
const float *A,
const int sA0,
const int sA1,
const int sA2,
float * Z,
const int sZ0)
Since the nodename is unique, we don't need to put the name
of the scalar_op in here.
"""
if
reduce_mask
is
None
:
reduce_mask
=
self
.
reduce_mask
if
ndim
is
None
:
ndim
=
len
(
reduce_mask
)
if
pattern
is
None
:
pattern
=
''
.
join
(
str
(
i
)
for
i
in
reduce_mask
)
sio
=
StringIO
()
print
>>
sio
,
"""
static __global__ void kernel_reduce_
%(pattern)
s_
%(nodename)
s(
"""
%
locals
()
for
i
in
xrange
(
ndim
):
print
>>
sio
,
"""
const int d
%(i)
s,
"""
%
locals
()
print
>>
sio
,
"""
const float *A,
"""
%
locals
()
for
i
in
xrange
(
ndim
):
print
>>
sio
,
"""
const int sA
%(i)
s,
"""
%
locals
()
print
>>
sio
,
"""
float * Z
"""
%
locals
()
for
i
in
xrange
(
ndim
-
sum
(
reduce_mask
)):
print
>>
sio
,
"""
, const int sZ
%(i)
s
"""
%
locals
()
print
>>
sio
,
")"
return
sio
.
getvalue
()
def
_k_init
(
self
,
*
args
):
return
"""
const int threadCount = blockDim.x * blockDim.y * blockDim.z;
const int threadNum = threadIdx.z * blockDim.x * blockDim.y
+ threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ float buf[];
float myresult = 0.0f;
//This is caught in cuda/init.py when we init the gpu. I keep
//it here to ease finding code that rely on this.
if (warpSize != 32)
{
Z[0] = -666;
return;
}
"""
def
_assign_init
(
self
,
first_item
):
"""
This return the initial value for myresult.
If the scalar op have an identity value, return it.
Otherwise, check that the scalar op is maximum or minimum
and return first_item. It should be the first element of the reduction.
As the maximum and minimum of the same value don't change, this work.
"""
if
hasattr
(
self
.
scalar_op
,
'identity'
):
return
str
(
self
.
scalar_op
.
identity
)
else
:
assert
isinstance
(
self
.
scalar_op
,
(
scal
.
Maximum
,
scal
.
Minimum
))
return
first_item
def
_assign_reduce
(
self
,
node
,
name
,
left
,
right
,
sub
):
"""
node: the node argument to this op's c_code
name: the name argument to this op's c_code
left: a C code string identifying an lvalue
right: a C code string identifying an expression
sub: the sub argument to this op's c_code
returns C code to reduce left and right, assigning the
result to left."""
x
,
=
node
.
inputs
dtype
=
x
.
dtype
dummy_left
=
Scalar
(
dtype
=
dtype
)()
dummy_right
=
Scalar
(
dtype
=
dtype
)()
dummy_node
=
self
.
scalar_op
.
make_node
(
dummy_left
,
dummy_right
)
dummy_name
=
name
+
'_scalar_op'
+
str
(
self
.
_n_scalar_op_calls
)
self
.
_n_scalar_op_calls
+=
1
return
self
.
scalar_op
.
c_code
(
dummy_node
,
dummy_name
,
(
left
,
right
),
(
left
,),
sub
)
def
_k_reduce_buf
(
self
,
z_pos
,
node
,
name
,
sub
):
"""
WRITEME
node, name, sub: these should be passed through from the original
call to c_code
"""
# This code (the code in new_version) is currently ignored.
# Code produced later in this function is returned instead.
# The code here works with all nvidia driver
# But only for powers or multiples of 2!
new_version
=
"""
__syncthreads(); // some kernel do multiple reduction.
buf[threadNum] = myresult;
__syncthreads();
if (threadNum >= ((threadCount >> 1) * 2))
{
int idx = threadNum - (threadCount >> 1) * 2;"""
new_version
+=
self
.
_assign_reduce
(
node
,
name
,
'buf[idx]'
,
'buf[threadNum]'
,
sub
)
new_version
+=
"""
}
__syncthreads();
// Works for power of 2 only.
int nTotalThreads = threadCount; // Total number of active threads
while(nTotalThreads > 1)
{
int halfPoint = (nTotalThreads >> 1); // divide by two
// only the first half of the threads will be active.
if (threadNum < halfPoint)
{
// Get the shared value stored by another thread
float temp = buf[threadNum + halfPoint];
"""
new_version
+=
self
.
_assign_reduce
(
node
,
name
,
'buf[threadNum]'
,
'temp'
,
sub
)
new_version
+=
"""
}
__syncthreads();
nTotalThreads = (nTotalThreads >> 1); // divide by two.
}
__syncthreads();
if (threadNum == 0)
{
%(z_pos)
s = buf[0];
}
__syncthreads();"""
new_version
=
new_version
%
locals
()
current_version
=
"""
__syncthreads(); // some kernel do multiple reduction.
buf[threadNum] = myresult;
__syncthreads();
// rest of function is handled by one warp
if (threadNum < warpSize)
{
//round up all the partial sums into the first `warpSize` elements
for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
{
"""
current_version
+=
self
.
_assign_reduce
(
node
,
name
,
'myresult'
,
'buf[i]'
,
sub
)
+
"""
}
buf[threadNum] = myresult;
/*Comment this optimization as it don't work on Fermi GPU.
TODO: find why it don't work or put the GPU compute capability into the version
// no sync because only one warp is running
if(threadCount >32)
{"""
for
num
in
[
16
,
8
,
4
,
2
,
1
]:
current_version
+=
self
.
_assign_reduce
(
node
,
name
,
'buf[threadNum]'
,
'buf[threadNum+
%
d]'
%
num
,
sub
)
current_version
+=
"""
if (threadNum == 0)
{
%(z_pos)
s = buf[0];
}
}
else */
if (threadNum < 16)
{
//reduce so that threadNum 0 has the reduction of everything
"""
for
num
in
[
16
,
8
,
4
,
2
,
1
]:
this_if
=
"if (threadNum +
%
d < threadCount) "
%
num
+
\
self
.
_assign_reduce
(
node
,
name
,
'buf[threadNum]'
,
'buf[threadNum+
%
d]'
%
num
,
sub
)
current_version
+=
this_if
current_version
+=
"""
if (threadNum == 0)
{
%(z_pos)
s = buf[0];
}
}
}
"""
current_version
=
current_version
%
locals
()
return
current_version
#Threads must be organized as: threadNum%nb_reduce correspond to the same sum
#nb_reduce<=warpSize
def
_k_reduce_buf_multiple
(
self
,
z_pos
,
node
,
name
,
nb_reduce
):
reduce_fct
=
self
.
_assign_reduce
(
node
,
name
,
'myresult'
,
'buf[i]'
,
{})
return
"""
__syncthreads(); // some kernel do multiple reduction.
buf[threadNum] = myresult;
__syncthreads();
// rest of function is handled by one warp
if (threadNum <
%(nb_reduce)
s)
{
//round up all the partial sums into the first `nb_reduce` elements
for (int i = threadNum +
%(nb_reduce)
s; i < threadCount; i +=
%(nb_reduce)
s)
{
%(reduce_fct)
s;
}
%(z_pos)
s = myresult;
}
"""
%
locals
()
def
c_code_reduce_ccontig
(
self
,
sio
,
node
,
name
,
x
,
z
,
fail
):
"""
WRITEME
IG: I believe, based on how this is called in c_code, that it
is for the case where we are reducing on all axes and x is
C contiguous.
"""
if
getattr
(
self
.
scalar_op
,
'identity'
,
None
)
==
0
:
zero_shp
=
"cudaMemset((float *)(((char *)cuda_get_ptr(
%(z)
s->ga.data))+
%(z)
s->ga.offset), 0, PyGpuArray_SIZE(
%(z)
s) * sizeof(float))"
%
locals
()
#TODO: elif getattr(self.scalar_op, 'identity', None) == 1:
else
:
zero_shp
=
"""
PyErr_Format(PyExc_NotImplementedError,
"GpuCAReduceCuda not implemented when input shape is 0 for this scalar_op");
%(fail)
s;
"""
%
locals
()
sync
=
""
if
config
.
gpuarray
.
sync
:
sync
=
"""GpuArray_sync(&
%(z)
s->ga);"""
%
locals
()
print
>>
sio
,
"""
{
if(PyGpuArray_SIZE(
%(x)
s)==0){
%(zero_shp)
s;
}else{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_SIZE(
%(x)
s),
(size_t) 256));
dim3 n_blocks(1);
if (verbose) printf("running kernel_reduce_ccontig_
%(name)
s"
" n_threads.x=
%%
d, size=
%%
d, ndim=
%%
d
\\
n",
n_threads.x,PyGpuArray_SIZE(
%(x)
s),
PyGpuArray_NDIM(
%(x)
s));
int n_shared = sizeof(float) * n_threads.x;
kernel_reduce_ccontig_
%(name)
s<<<n_blocks, n_threads, n_shared>>>(
PyGpuArray_SIZE(
%(x)
s),
(float *)(((char *)cuda_get_ptr(
%(x)
s->ga.data))+
%(x)
s->ga.offset),
(float *)(((char *)cuda_get_ptr(
%(z)
s->ga.data))+
%(z)
s->ga.offset));
%(sync)
s
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError,
"Cuda error:
%%
s:
%%
s."
" (grid:
%%
i x
%%
i; block:
%%
i x
%%
i x
%%
i)
\\
n",
"kernel_reduce_ccontig_
%(name)
s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)
s;
}
}
}
"""
%
locals
()
def
c_code_reduce_1
(
self
,
sio
,
node
,
name
,
x
,
z
,
fail
):
makecall
=
self
.
_makecall
(
node
,
name
,
x
,
z
,
fail
)
print
>>
sio
,
"""
{
int verbose = 0;
dim3 n_threads(
std::min(PyGpuArray_DIMS(
%(x)
s)[0],
(size_t) 256));
dim3 n_blocks(1);
%(makecall)
s
}
"""
%
locals
()
def c_code_reduce_11(self, sio, node, name, x, z, fail):
    """Emit the CUDA launch code for reduce_mask (1, 1), i.e. a full
    reduction of a 2d tensor.

    Writes to `sio` a C block that sizes a single thread block: x is
    taken from dim 1 (capped at 256), then y is grown until the
    256-thread budget is exceeded and stepped back by one, then capped
    by dim 0.  A single block is launched; the kernel call itself is
    the C fragment returned by ``self._makecall``.
    """
    makecall = self._makecall(node, name, x, z, fail)
    # NOTE: Python-2-only ``print >>`` statement (file predates Py3).
    print >> sio, """
    {
        int verbose = 0;
        dim3 n_threads(
                std::min(PyGpuArray_DIMS(%(x)s)[1],
                         (size_t) 256));
        while (n_threads.y * n_threads.x <= 256) ++n_threads.y;
        n_threads.y -= 1;
        if (n_threads.y > PyGpuArray_DIMS(%(x)s)[0])
            n_threads.y = PyGpuArray_DIMS(%(x)s)[0];
        dim3 n_blocks(1);
        %(makecall)s
    }
    """ % locals()
def c_code_reduce_01X(self, sio, node, name, x, z, fail, N):
    """Emit launch code for the patterns 01, 011 and 0111 (keep axis 0,
    reduce the N trailing axes).

    :param N: the number of 1s in the pattern: N=1 -> 01, N=2 -> 011,
              N=3 -> 0111.  Works for N = 1, 2, 3.

    Thread-block layout: n_threads.x comes from the innermost reduced
    dimension (capped at 256); for 3d inputs the y thread dimension,
    and for 4d inputs the y and z thread dimensions, are then grown
    while the 256-thread budget allows.  One block is launched per
    slice of axis 0, capped at 4096 blocks.

    Changes: removed the unused locals ``N_pattern``, ``param_dim`` and
    ``strides_dim`` (referenced by no template here, and the only uses
    of the Python-2-only ``xrange``), and replaced the Python-2-only
    ``print >>`` statement with ``sio.write``.  The emitted C text is
    unchanged.
    """
    assert N in [1, 2, 3]
    makecall = self._makecall(node, name, x, z, fail)
    threads_y = """
        //get as many y threads as we can fit
        while (n_threads.x * (n_threads.y+1) <= 256)
        {
            if (n_threads.y < PyGpuArray_DIMS(%(x)s)[%(N)s-1])
                n_threads.y += 1;
            else
                break;
        }""" % locals()
    threads_z = """
        //get as many z threads as we can fit
        while (n_threads.x * n_threads.y * (n_threads.z+1) <= 256)
        {
            if (n_threads.z < PyGpuArray_DIMS(%(x)s)[%(N)s-2])
                n_threads.z += 1;
            else
                break;
        }""" % locals()
    # 2d input (pattern 01): only the x thread dimension is used.
    # 3d input (pattern 011): x and y, but not z.
    if len(self.reduce_mask) == 2:
        threads_y = ''
        threads_z = ''
    if len(self.reduce_mask) == 3:
        threads_z = ''
    sio.write("""
    {
        int verbose = 0;
        dim3 n_threads(
                std::min(PyGpuArray_DIMS(%(x)s)[%(N)s],
                         (size_t) 256));
        %(threads_y)s
        %(threads_z)s
        dim3 n_blocks(std::min(PyGpuArray_DIMS(%(x)s)[0],
                               (size_t) 4096));
        %(makecall)s
    }
    """ % locals() + "\n")
def c_code_reduce_01(self, sio, node, name, x, z, fail):
    """Emit launch code for reduce_mask (0, 1).

    Pattern 01 is the N=1 case of the shared 01X code generator.
    """
    return self.c_code_reduce_01X(sio, node, name, x, z, fail, 1)
def c_code_reduce_011(self, sio, node, name, x, z, fail):
    """Emit launch code for reduce_mask (0, 1, 1).

    Pattern 011 is the N=2 case of the shared 01X code generator.
    """
    return self.c_code_reduce_01X(sio, node, name, x, z, fail, 2)
def c_code_reduce_0111(self, sio, node, name, x, z, fail):
    """Emit launch code for reduce_mask (0, 1, 1, 1).

    Pattern 0111 is the N=3 case of the shared 01X code generator.
    """
    return self.c_code_reduce_01X(sio, node, name, x, z, fail, 3)
def c_code_reduce_10(self, sio, node, name, x, z, fail):
    """Emit launch code for reduce_mask (1, 0): reduce a 2d tensor over
    axis 0.

    No dedicated 10 kernel exists: the C code calls
    ``kernel_reduce_010_<name>`` with a dummy leading dimension of
    size 1 (and stride 1), so the 2d reduction is expressed as the
    middle axis of a 3d (0,1,0) reduction.  One thread block column is
    launched per output element (n_blocks.y from dim 1, capped at
    4096), with up to 256 threads along the reduced axis.
    """
    # Optionally synchronize after the kernel when the gpuarray.sync
    # config flag is set (eases debugging of async kernel errors).
    sync = ""
    if config.gpuarray.sync:
        sync = """GpuArray_sync(&%(z)s->ga);""" % locals()
    # NOTE: Python-2-only ``print >>`` statement (file predates Py3).
    print >> sio, """
    {
        int verbose = 0;
        dim3 n_threads(
                std::min(PyGpuArray_DIMS(%(x)s)[0],
                         (size_t) 256));
        dim3 n_blocks(1,
                      std::min(PyGpuArray_DIMS(%(x)s)[1],
                               (size_t) 4096));
        if (verbose) {
          fprintf(stderr,
            "running kernel_reduce_10_%(name)s n_blocks=(%%i,%%i)\\n",
            n_blocks.x,
            n_blocks.y);
        }
        assert( PyGpuArray_DIMS(%(x)s)[1] == PyGpuArray_DIMS(%(z)s)[0]);
        int n_shared = sizeof(float) * n_threads.x;
        kernel_reduce_010_%(name)s<<<n_blocks, n_threads, n_shared>>>(
                1,
                PyGpuArray_DIMS(%(x)s)[0],
                PyGpuArray_DIMS(%(x)s)[1],
                (float *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
                1,
                PyGpuArray_STRIDES(%(x)s)[0]/4,
                PyGpuArray_STRIDES(%(x)s)[1]/4,
                (float *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
                1,
                PyGpuArray_STRIDES(%(z)s)[0]/4
                );
        %(sync)s
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess != sts)
        {
            PyErr_Format(PyExc_RuntimeError,
                "Cuda error: %%s: %%s."
                " (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
                "kernel_reduce_010_%(name)s",
                cudaGetErrorString(sts),
                n_blocks.x,
                n_blocks.y,
                n_threads.x,
                n_threads.y,
                n_threads.z);
            %(fail)s;
        }
    }
    """ % locals()
def c_code_reduce_010(self, sio, node, name, x, z, fail):
    """Emit launch code for reduce_mask (0, 1, 0): reduce a 3d tensor
    over its middle axis.

    Two strategies exist in the emitted C code, but the first branch is
    hard-wired on (``if (1)``, per its own comment, until the
    alternative is less buggy): it implicitly reshapes the A,B,C input
    to A,B,D,32 and launches ``kernel_reduce_010_AD_<name>`` with one
    thread per output element (32-thread blocks, grid clipped to stay
    within 4096 blocks).  The dead second branch would pick between the
    "010_inner" kernel (when dim 2 has the smallest stride) and the
    plain 010 kernel, via ``self._makecall``.
    """
    makecall = self._makecall(node, name, x, z, fail)
    makecall_inner = self._makecall(node, name, x, z, fail,
                                    pattern="010_inner")
    pattern = ''.join(str(i) for i in self.reduce_mask)
    # Optionally synchronize after the kernel when the gpuarray.sync
    # config flag is set (eases debugging of async kernel errors).
    sync = ""
    if config.gpuarray.sync:
        sync = """GpuArray_sync(&%(z)s->ga);""" % locals()
    # NOTE: Python-2-only ``print >>`` statement (file predates Py3).
    print >> sio, """
    {
        //int n_summations = PyGpuArray_DIMS(%(x)s)[0] * PyGpuArray_DIMS(%(x)s)[2];
        //if ((n_summations >= 15 * 32) && (PyGpuArray_DIMS(%(x)s)[2]>=16))
        if (1) // if the alternative is less buggy, consider not using this branch
        {
            // If there are a lot of summations to do, then we can use simple parallelization -
            // use each thread to do one sum.
            // we might as well launch blocks of 32 threads because that's the warp size.
            // we could schedule more threads if we were maxing out the gridsize below, but
            // the gridsize is way more than the physical hardware and I think 32 threads
            // on a huge grid is enough to fully use the hardware.
            dim3 n_threads(32,1,1);
            // We kindof reshape the input implicitly to something 4D:
            //  the shape A,B,C    ->   A, B, D, E
            //  where C <= D*E < C+32
            //  where E==32
            int A = PyGpuArray_DIMS(%(x)s)[0];
            int B = PyGpuArray_DIMS(%(x)s)[1];
            int C = PyGpuArray_DIMS(%(x)s)[2];
            int D = C/32;
            if (32*D < C) D+= 1;
            assert ((C <= 32*D) && (32*D < C+32));
            // The gridsize would ideally be (A, D).  But we do the following logic to make
            // sure we don't ask for a grid that is too big.
            dim3 n_blocks(A,D);
            if (n_blocks.x > 4096) n_blocks.x = 4096;
            if (n_blocks.x*n_blocks.y > 4096) n_blocks.y = 4096/n_blocks.x;
            int n_shared = 0;
            kernel_reduce_010_AD_%(name)s<<<n_blocks, n_threads, n_shared>>>(
            A,B,C,D,
                    (float *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
                    PyGpuArray_STRIDES(%(x)s)[0]/4,
                    PyGpuArray_STRIDES(%(x)s)[1]/4,
                    PyGpuArray_STRIDES(%(x)s)[2]/4,
                    (float *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
                    PyGpuArray_STRIDES(%(z)s)[0]/4,
                    PyGpuArray_STRIDES(%(z)s)[1]/4
                    );
            %(sync)s
            cudaError_t sts = cudaGetLastError();
            if (cudaSuccess != sts)
            {
                PyErr_Format(PyExc_RuntimeError,
                    "Cuda error: %%s: %%s."
                    " (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
                    "kernel_reduce_010_%(name)s",
                    cudaGetErrorString(sts),
                    n_blocks.x,
                    n_blocks.y,
                    n_threads.x,
                    n_threads.y,
                    n_threads.z);
                %(fail)s;
            }
        }
        else
        {
            int verbose = 2;
            dim3 n_threads(std::min((size_t) 32,
                                    PyGpuArray_DIMS(%(x)s)[2]));
            while(    (n_threads.x*(n_threads.y+1)<=256)
                   && (n_threads.y<PyGpuArray_DIMS(%(x)s)[1])){
                n_threads.y++;
            }
            dim3 n_blocks(std::min(PyGpuArray_DIMS(%(x)s)[0],
                          (size_t)4096));
            n_blocks.y = std::min(
                ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],
                            (size_t)n_threads.x),
                (size_t)(4096 / n_blocks.x)
                );
            if(std::min(std::min(PyGpuArray_STRIDES(%(x)s)[0]/4,
                                 PyGpuArray_STRIDES(%(x)s)[1]/4),
                        PyGpuArray_STRIDES(%(x)s)[2]/4)
               ==PyGpuArray_STRIDES(%(x)s)[2]/4
              && n_blocks.y==ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],
                                         (size_t)n_threads.x)){
              if(verbose>1)
                printf("n_block.x.1=%%d, n_block.x.2=%%d, n_block.y.1=%%d, n_block.y.2=%%d,\\n",
                       PyGpuArray_DIMS(%(x)s)[0],4096,
                       ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],(size_t)n_threads.x),
                       (size_t)(4096 / n_blocks.x));
              assert(n_threads.x<=32);
              %(makecall_inner)s
            }else{
              n_threads.x = std::min(PyGpuArray_DIMS(%(x)s)[1],
                                     (size_t) 256);
              n_blocks.x = std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t)4096);
              n_blocks.y = std::min(
                  PyGpuArray_DIMS(%(x)s)[2],
                  (size_t)(4096 / n_blocks.x)
                  );
              %(makecall)s
            }
            %(sync)s
            cudaError_t sts = cudaGetLastError();
            if (cudaSuccess != sts)
            {
                PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
                    "kernel_reduce_%(pattern)s_%(name)s",
                    cudaGetErrorString(sts),
                    n_blocks.x,
                    n_blocks.y,
                    n_threads.x,
                    n_threads.y,
                    n_threads.z);
                %(fail)s;
            }
        }
    }
    """ % locals()
def c_code_reduce_0101(self, sio, node, name, x, z, fail):
    """Emit launch code for reduce_mask (0, 1, 0, 1): reduce a 4d
    tensor over axes 1 and 3.

    One block is launched per (dim 0, dim 2) output element;
    n_threads.x comes from dim 3 (capped at 256) and n_threads.y is
    grown toward dim 1 within the 256-thread budget (the loop
    overshoots by one and is stepped back).
    """
    makecall = self._makecall(node, name, x, z, fail)
    # NOTE: Python-2-only ``print >>`` statement (file predates Py3).
    print >> sio, """
    {
        int verbose = 0;
        dim3 n_threads(
                std::min(PyGpuArray_DIMS(%(x)s)[3],
                         (size_t) 256));
        while (n_threads.x * n_threads.y <= 256)
        {
            if (n_threads.y > PyGpuArray_DIMS(%(x)s)[1]) break;
            n_threads.y += 1;
        }
        n_threads.y -= 1;
        dim3 n_blocks(PyGpuArray_DIMS(%(x)s)[0], PyGpuArray_DIMS(%(x)s)[2]);
        %(makecall)s
    }
    """ % locals()
def c_code_reduce_100(self, sio, node, name, x, z, fail):
    """Emit launch code for reduce_mask (1, 0, 0): reduce a 3d tensor
    over axis 0.

    Threads along x cover the reduced axis 0 (up to 256); blocks
    cover the two kept axes, growing the grid's y dimension while the
    4096-block budget allows.
    """
    makecall = self._makecall(node, name, x, z, fail)
    # use threadIdx.x for i0
    # use blockIdx.x for i1
    # use blockIdx.y for i2
    # NOTE: Python-2-only ``print >>`` statement (file predates Py3).
    print >> sio, """
    {
        int verbose = 0;
        dim3 n_threads(
                std::min(PyGpuArray_DIMS(%(x)s)[0],
                         (size_t) 256));
        dim3 n_blocks(std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)4096));
        while (n_blocks.x * (n_blocks.y+1) <= 4096 && n_blocks.y <= PyGpuArray_DIMS(%(x)s)[2])
        {
            n_blocks.y += 1;
        }
        %(makecall)s
    }
    """ % locals()
def c_code_reduce_110(self, sio, node, name, x, z, fail):
    """Emit launch code for reduce_mask (1, 1, 0): reduce a 3d tensor
    over axes 0 and 1.

    One block is launched per element of the kept axis 2; within a
    block, x threads cover dim 1 and y threads are grown toward dim 0
    within the 256-thread budget (overshoot-and-step-back loop).
    """
    makecall = self._makecall(node, name, x, z, fail)
    # NOTE: Python-2-only ``print >>`` statement (file predates Py3).
    print >> sio, """
    {
        int verbose = 0;
        dim3 n_threads(
                std::min(PyGpuArray_DIMS(%(x)s)[1],
                         (size_t) 256));
        while (n_threads.x*n_threads.y <= 256)
        {
            if (n_threads.y > PyGpuArray_DIMS(%(x)s)[0])
                break;
            n_threads.y += 1;
        }
        n_threads.y -= 1;
        dim3 n_blocks(PyGpuArray_DIMS(%(x)s)[2]);
        %(makecall)s
    }
    """ % locals()
def c_code_reduce_001(self, sio, node, name, x, z, fail):
    """Emit launch code for reduce_mask (0, 0, 1): reduce a 3d tensor
    over its last axis.

    x threads cover the reduced dim 2 (up to 256); blocks cover the
    kept axes, with grid x from dim 0 (capped at 4096) and grid y
    grown toward dim 1 within the 4096-block budget
    (overshoot-and-step-back loop).
    """
    makecall = self._makecall(node, name, x, z, fail)
    # NOTE: Python-2-only ``print >>`` statement (file predates Py3).
    print >> sio, """
    {
        int verbose = 0;
        dim3 n_threads(
                std::min(PyGpuArray_DIMS(%(x)s)[2],
                         (size_t) 256));
        dim3 n_blocks(
                std::min(PyGpuArray_DIMS(%(x)s)[0],
                         (size_t) 4096));
        while (n_blocks.x * n_blocks.y <= 4096)
        {
            if (n_blocks.y > PyGpuArray_DIMS(%(x)s)[1])
                break;
            n_blocks.y += 1;
        }
        n_blocks.y -= 1;
        %(makecall)s
    }
    """ % locals()
def c_code_reduce_111(self, sio, node, name, x, z, fail):
    """Emit launch code for reduce_mask (1, 1, 1): full reduction of a
    3d tensor.

    A single block is launched; x threads come from dim 2 (capped at
    256), then y (toward dim 1) and z (toward dim 0) are each grown
    within the 256-thread budget (overshoot-and-step-back loops).
    """
    makecall = self._makecall(node, name, x, z, fail)
    # NOTE: Python-2-only ``print >>`` statement (file predates Py3).
    print >> sio, """
    {
        int verbose = 0;
        dim3 n_threads(
                std::min(PyGpuArray_DIMS(%(x)s)[2],
                         (size_t) 256));
        //get as many y threads as we can fit
        while (n_threads.x * n_threads.y <= 256)
        {
            if (n_threads.y > PyGpuArray_DIMS(%(x)s)[1])
                break;
            n_threads.y += 1;
        }
        n_threads.y -= 1;
        //get as many z threads as we can fit
        while (n_threads.x * n_threads.y * n_threads.z <= 256)
        {
            if (n_threads.z > PyGpuArray_DIMS(%(x)s)[0])
                break;
            n_threads.z += 1;
        }
        n_threads.z -= 1;
        dim3 n_blocks(1,1,1);
        %(makecall)s
    }
    """ % locals()
def c_code_reduce_0011(self, sio, node, name, x, z, fail):
    """Emit launch code for reduce_mask (0, 0, 1, 1): reduce a 4d
    tensor over axes 2 and 3.

    Blocks cover the kept axes (grid x from dim 0, grid y grown toward
    dim 1 within the 4096-block budget); x threads come from dim 3 and
    y threads are grown toward dim 2, additionally bounded so that the
    per-block float buffer fits in shared memory (the 15*1024-200
    bound below).
    """
    makecall = self._makecall(node, name, x, z, fail)
    # NOTE: Python-2-only ``print >>`` statement (file predates Py3).
    print >> sio, """
    {
        int verbose = 0;
        dim3 n_blocks(
                std::min(PyGpuArray_DIMS(%(x)s)[0],
                         (size_t) 4096));
        while (n_blocks.x * n_blocks.y <= 4096 &&
               n_blocks.y < PyGpuArray_DIMS(%(x)s)[1])
        {
            n_blocks.y += 1;
        }
        dim3 n_threads(
                std::min(PyGpuArray_DIMS(%(x)s)[3],
                         (size_t) 256));
        while (n_threads.x * n_threads.y <= 256
               && n_threads.y < PyGpuArray_DIMS(%(x)s)[2]
               && n_threads.x * n_threads.y * sizeof(float) <=(15*1024-200))
        {
            n_threads.y += 1;
        }
        %(makecall)s
    }
    """ % locals()
def c_code_reduce_1111(self, sio, node, name, x, z, fail):
    """Emit launch code for reduce_mask (1, 1, 1, 1): full reduction of
    a 4d tensor.

    Same single-block sizing as the 111 case (x from dim 2, then grow
    y toward dim 1 and z toward dim 0 within 256 threads), with z
    additionally clamped to 64, the hardware limit on that block
    dimension for Fermi GPUs.
    """
    makecall = self._makecall(node, name, x, z, fail)
    # NOTE: Python-2-only ``print >>`` statement (file predates Py3).
    print >> sio, """
    {
        int verbose = 0;
        dim3 n_threads(
                std::min(PyGpuArray_DIMS(%(x)s)[2],
                         (size_t) 256));
        //get as many y threads as we can fit
        while (n_threads.x * n_threads.y <= 256)
        {
            if (n_threads.y > PyGpuArray_DIMS(%(x)s)[1])
                break;
            n_threads.y += 1;
        }
        n_threads.y -= 1;
        //get as many z threads as we can fit
        while (n_threads.x * n_threads.y * n_threads.z <= 256)
        {
            if (n_threads.z > PyGpuArray_DIMS(%(x)s)[0])
                break;
            n_threads.z += 1;
        }
        n_threads.z -= 1;
        //Maximum for Fermi GPU on that dimensions.
        n_threads.z = std::min(n_threads.z, (unsigned)64);
        dim3 n_blocks(1,1,1);
        %(makecall)s
    }
    """ % locals()
def c_code_reduce_1011(self, sio, node, name, x, z, fail):
    """Emit launch code for reduce_mask (1, 0, 1, 1): reduce a 4d
    tensor over axes 0, 2 and 3.

    One block is launched per element of the kept axis 1; x threads
    come from dim 3 (capped at 256), y is grown within the budget and
    clamped to dim 2, z is grown, clamped to 64 (hardware block-dim
    limit) and to dim 0.
    """
    makecall = self._makecall(node, name, x, z, fail)
    # NOTE: Python-2-only ``print >>`` statement (file predates Py3).
    print >> sio, """
    {
        int verbose = 0;
        dim3 n_threads(
                std::min(PyGpuArray_DIMS(%(x)s)[3],
                         (size_t) 256));
        while (n_threads.x * (n_threads.y+1) <= 256) ++n_threads.y;
        if (n_threads.y > PyGpuArray_DIMS(%(x)s)[2])
            n_threads.y = PyGpuArray_DIMS(%(x)s)[2];
        while (n_threads.x * n_threads.y * (n_threads.z+1) <= 256) ++n_threads.z;
        if (n_threads.z > 64)
            n_threads.z = 64;
        if (n_threads.z > PyGpuArray_DIMS(%(x)s)[0])
            n_threads.z = PyGpuArray_DIMS(%(x)s)[0];
        dim3 n_blocks(PyGpuArray_DIMS(%(x)s)[1]);
        %(makecall)s
    }
    """ % locals()
def c_code_cache_version_apply(self, node):
    """Return the C-code cache version tuple for this apply node.

    Combines this Op's own code version with the cache version of the
    scalar op and of the Scalar type of every input and output, so the
    compiled module is invalidated whenever any of them changes.
    Returns an empty tuple (meaning: never cache) if any component
    reports no version.

    Fix: removed the dead local ``scalar_node`` — an ``Apply`` node was
    built from the scalar op and Scalar variables but never used (this
    also removes the shadowing of the builtin ``input`` inside its
    comprehension).
    """
    # The version corresponding to the c code in this Op.
    version = [8]
    # Now we insert versions for the ops on which we depend...
    version.extend(self.scalar_op.c_code_cache_version())
    for var in node.inputs + node.outputs:
        version.extend(Scalar(dtype=var.type.dtype).c_code_cache_version())
    if all(version):
        return tuple(version)
    else:
        return ()
def
c_support_code_apply
(
self
,
node
,
nodename
):
sio
=
StringIO
()
nd_in
=
len
(
self
.
reduce_mask
)
if
all
(
i
==
1
for
i
in
self
.
reduce_mask
):
#this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor
reducebuf
=
self
.
_k_reduce_buf
(
'Z[0]'
,
node
,
nodename
,
sub
=
{})
reduce_fct
=
self
.
_assign_reduce
(
node
,
nodename
,
"myresult"
,
"A[i0]"
,
{})
reduce_init
=
self
.
_assign_init
(
"A[0]"
)
print
>>
sio
,
"""
static __global__ void kernel_reduce_ccontig_
%(nodename)
s(
const unsigned int d0,
const float *A,
float * Z)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ float buf[];
float myresult =
%(reduce_init)
s;
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
{
%(reduce_fct)
s
}
%(reducebuf)
s
}
"""
%
locals
()
if
self
.
reduce_mask
==
(
1
,):
#this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor
reducebuf
=
self
.
_k_reduce_buf
(
'Z[0]'
,
node
,
nodename
,
sub
=
{})
reduce_fct
=
self
.
_assign_reduce
(
node
,
nodename
,
"myresult"
,
"A[i0 * sA0]"
,
{})
reduce_init
=
self
.
_assign_init
(
"A[0]"
)
print
>>
sio
,
"""
static __global__ void kernel_reduce_1_
%(nodename)
s(
const unsigned int d0,
const float *A, const int sA0,
float * Z)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ float buf[];
float myresult =
%(reduce_init)
s;
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
{
%(reduce_fct)
s
}
%(reducebuf)
s
}
"""
%
locals
()
if
self
.
reduce_mask
==
(
1
,
1
):
#this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor
reducebuf
=
self
.
_k_reduce_buf
(
'Z[0]'
,
node
,
nodename
,
sub
=
{})
reduce_fct
=
self
.
_assign_reduce
(
node
,
nodename
,
"myresult"
,
"A[i0 * sA0 + i1 * sA1]"
,
{})
reduce_init
=
self
.
_assign_init
(
"A[0]"
)
print
>>
sio
,
"""
static __global__ void kernel_reduce_11_
%(nodename)
s(
const int d0,
const int d1,
const float *A, const int sA0, const int sA1,
float * Z)
{
const int threadCount = blockDim.x * blockDim.y;
const int threadNum = threadIdx.y*blockDim.x + threadIdx.x;
extern __shared__ float buf[];
float myresult =
%(reduce_init)
s;
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y)
{
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
{
%(reduce_fct)
s;
}
}
%(reducebuf)
s
}
"""
%
locals
()
#01, 011, 0111
if
(
0
==
self
.
reduce_mask
[
0
]
and
all
(
self
.
reduce_mask
[
1
:])
and
nd_in
in
[
2
,
3
,
4
]):
# this kernel uses one block for each row.
# threads per block for each element per row.
N_pattern
=
''
.
join
([
'1'
]
*
(
nd_in
-
1
))
# TODO: is it faster to hardcode sA3, etc. in the later code, rather
# than have the for_* variables declare them and the later code use
# their names?
if
nd_in
==
2
:
for_i1
=
"for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)"
first_i1
=
'threadIdx.x'
sA1
=
'sA1'
for_i2
=
"int i2=0, sA2=0;"
sA2
=
'0'
first_i2
=
'0'
for_i3
=
"int i3=0, sA3=0;"
sA3
=
'0'
first_i3
=
'0'
if
nd_in
==
3
:
for_i1
=
"for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)"
first_i1
=
'threadIdx.y'
sA1
=
'sA1'
for_i2
=
"for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)"
first_i2
=
'threadIdx.x'
sA2
=
'sA2'
for_i3
=
"int i3=0, sA3=0;"
first_i3
=
0
sA3
=
'0'
if
nd_in
==
4
:
for_i1
=
"for (int i1 = threadIdx.z; i1 < d1; i1 += blockDim.z)"
first_i1
=
'threadIdx.z'
sA1
=
'sA1'
for_i2
=
"for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)"
first_i2
=
'threadIdx.y'
sA2
=
'sA2'
for_i3
=
"for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)"
first_i3
=
'threadIdx.x'
sA3
=
'sA3'
reducebuf
=
self
.
_k_reduce_buf
(
'Z[i0 * sZ0]'
,
node
,
nodename
,
sub
=
{})
param_dim
=
","
.
join
([
"const int d
%
d"
%
i
for
i
in
xrange
(
nd_in
)])
param_strides
=
","
.
join
([
"const int sA
%
d"
%
i
for
i
in
xrange
(
nd_in
)])
decl
=
self
.
_k_decl
(
node
,
nodename
)
init
=
self
.
_k_init
(
node
,
nodename
)
reduce_init
=
self
.
_assign_init
(
"A[
%(first_i3)
s *
%(sA3)
s +
%(first_i2)
s *
%(sA2)
s +
%(first_i1)
s *
%(sA1)
s + i0 * sA0]"
%
locals
())
reduce_fct
=
self
.
_assign_reduce
(
node
,
nodename
,
"myresult"
,
"A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0]"
,
{})
print
>>
sio
,
"""
%(decl)
s{
%(init)
s
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
myresult =
%(reduce_init)
s;
%(for_i1)
s{
%(for_i2)
s{
%(for_i3)
s{
%(reduce_fct)
s;
}
}
}
%(reducebuf)
s
}
}
"""
%
locals
()
if
self
.
reduce_mask
==
(
0
,
1
,
0
)
or
self
.
reduce_mask
==
(
1
,
0
):
# this kernel uses one block for each column,
# threads per block for each element per column.
#TODO: This kernel is pretty inefficient in terms of reading, because if A is
# c_contiguous (typical case) then each warp is accessing non-contigous
# memory (a segment of a column).
reducebuf
=
self
.
_k_reduce_buf
(
'Z[i0 * sZ0 + i2*sZ1]'
,
node
,
nodename
,
sub
=
{})
reduce_fct
=
self
.
_assign_reduce
(
node
,
nodename
,
"myresult"
,
"A[i0 * sA0 + i1 * sA1 + i2 * sA2]"
,
{})
reduce_init
=
self
.
_assign_init
(
"A[i0 * sA0 + threadIdx.x * sA1 + i2 * sA2]"
)
print
>>
sio
,
"""
static __global__ void kernel_reduce_010_
%(nodename)
s(
const int d0,
const int d1,
const int d2,
const float *A, const int sA0,
const int sA1, const int sA2,
float * Z, const int sZ0, const int sZ1)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ float buf[];
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
{
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
{
float myresult =
%(reduce_init)
s;
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
{
%(reduce_fct)
s;
}
%(reducebuf)
s
}
}
}
"""
%
locals
()
if
self
.
reduce_mask
==
(
0
,
1
,
0
):
reduce_fct
=
self
.
_assign_reduce
(
node
,
nodename
,
"myresult"
,
"X[a * sX0 + b * sX1 + c * sX2]"
,
{})
reduce_init
=
self
.
_assign_init
(
"X[a * sX0 + 0 * sX1 + c * sX2]"
)
print
>>
sio
,
"""
static __global__ void kernel_reduce_010_AD_
%(nodename)
s(
const int A,
const int B,
const int C,
const int D,
//const int E, // THIS is 32
const float *X, const int sX0,
const int sX1, const int sX2,
float * Z, const int sZ0, const int sZ1)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
float myresult = 0.0f;
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int a = blockIdx.x; a < A; a += gridDim.x)
{
for (int i2_D = blockIdx.y; i2_D < D; i2_D += gridDim.y)
{
int c = i2_D * 32 + threadIdx.x;
if (c < C)
{
myresult =
%(reduce_init)
s;
for (int b = 0; b < B; ++b)
{
%(reduce_fct)
s;
}
Z[a * sZ0 + c * sZ1] = myresult;
}
}
}
}
"""
%
locals
()
if
self
.
reduce_mask
==
(
0
,
1
,
0
):
#
# This kernel is optimized when the inner most dimensions
# have the smallest stride.
# this kernel uses one block for multiple column(up to 32TODO),
# threads per block for each element per column.
#thread.x = dim 2 contiguous
#thread.y = dim 1
#block.x = dim 0
#block.y = dim 1 rest
init
=
self
.
_k_init
(
node
,
nodename
)
decl
=
self
.
_k_decl
(
node
,
nodename
,
pattern
=
"010_inner"
)
reducebuf
=
self
.
_k_reduce_buf_multiple
(
'Z[i0 * sZ0 + i2*sZ1]'
,
node
,
nodename
,
'blockDim.x'
)
reduce_fct
=
self
.
_assign_reduce
(
node
,
nodename
,
"myresult"
,
"A[i0 * sA0 + i1 * sA1 + i2 * sA2]"
,
{})
reduce_init
=
self
.
_assign_init
(
"A[i0 * sA0 + 0 * sA1 + i2 * sA2]"
)
print
>>
sio
,
"""
%(decl)
s
{
if(warpSize<blockDim.x){
//TODO: set error code
Z[0] = -666;
return;
}
%(init)
s
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
{
for (int i2 = blockIdx.y*blockDim.x+threadIdx.x; i2 < d2; i2 += gridDim.y*blockDim.x)
{
myresult =
%(reduce_init)
s;
for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)
{
%(reduce_fct)
s;
}
%(reducebuf)
s
}
}
}
"""
%
locals
()
if
self
.
reduce_mask
==
(
1
,
1
,
0
):
# this kernel uses one block for each column,
# threads per block for each element per column.
#TODO: This kernel is pretty inefficient in terms of reading, because if A is
# c_contiguous (typical case) then each warp is accessing non-contigous
# memory (a segment of a column).
reducebuf
=
self
.
_k_reduce_buf
(
'Z[blockIdx.x * sZ0]'
,
node
,
nodename
,
sub
=
{})
reduce_fct
=
self
.
_assign_reduce
(
node
,
nodename
,
"myresult"
,
"A[i0 * sA0 + i1 * sA1 + blockIdx.x * sA2]"
,
{})
reduce_init
=
self
.
_assign_init
(
"A[blockIdx.x * sA2]"
)
print
>>
sio
,
"""
static __global__ void kernel_reduce_110_
%(nodename)
s(
const int d0,
const int d1,
const int d2,
const float *A, const int sA0,
const int sA1, const int sA2,
float * Z, const int sZ0)
{
const int threadCount = blockDim.x * blockDim.y;
const int threadNum = threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ float buf[];
float myresult =
%(reduce_init)
s;
if (warpSize != 32)
{
//TODO: set error code
Z[blockIdx.x * sZ0] = -666;
return;
}
for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y)
{
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
{
%(reduce_fct)
s;
}
}
%(reducebuf)
s
}
"""
%
locals
()
if
self
.
reduce_mask
==
(
1
,
0
,
0
):
reducebuf
=
self
.
_k_reduce_buf
(
'Z[i1 * sZ0 + i2 * sZ1]'
,
node
,
nodename
,
sub
=
{})
decl
=
self
.
_k_decl
(
node
,
nodename
)
init
=
self
.
_k_init
(
node
,
nodename
)
reduce_fct
=
self
.
_assign_reduce
(
node
,
nodename
,
"myresult"
,
"A[i0 * sA0 + i1 * sA1 + i2 * sA2]"
,
{})
reduce_init
=
self
.
_assign_init
(
"A[i1 * sA1 + i2 * sA2]"
)
print
>>
sio
,
"""
%(decl)
s
{
%(init)
s
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
{
for (int i1 = blockIdx.x; i1 < d1; i1 += gridDim.x)
{
myresult =
%(reduce_init)
s;
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
{
%(reduce_fct)
s
}
%(reducebuf)
s
}
}
}
"""
%
locals
()
if
self
.
reduce_mask
==
(
1
,
1
,
1
):
reducebuf
=
self
.
_k_reduce_buf
(
'Z[0]'
,
node
,
nodename
,
sub
=
{})
decl
=
self
.
_k_decl
(
node
,
nodename
)
init
=
self
.
_k_init
(
node
,
nodename
)
reduce_fct
=
self
.
_assign_reduce
(
node
,
nodename
,
"myresult"
,
"A[i0 * sA0 + i1 * sA1 + i2 * sA2]"
,
{})
reduce_init
=
self
.
_assign_init
(
"A[0]"
)
print
>>
sio
,
"""
%(decl)
s
{
%(init)
s
myresult =
%(reduce_init)
s;
for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z)
{
for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)
{
for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)
{
%(reduce_fct)
s;
}
}
}
%(reducebuf)
s
}
"""
%
locals
()
if
self
.
reduce_mask
==
(
0
,
0
,
1
):
# this kernel uses one block for each row,
# threads per block for each element per row.
reducebuf
=
self
.
_k_reduce_buf
(
'Z[i0 * sZ0 + i1 * sZ1]'
,
node
,
nodename
,
sub
=
{})
reduce_fct
=
self
.
_assign_reduce
(
node
,
nodename
,
"myresult"
,
"A[i0 * sA0 + i1 * sA1 + i2 * sA2]"
,
{})
reduce_init
=
self
.
_assign_init
(
"A[i0 * sA0 + i1 * sA1]"
)
print
>>
sio
,
"""
static __global__ void kernel_reduce_001_
%(nodename)
s(
const int d0,
const int d1,
const int d2,
const float *A, const int sA0,
const int sA1, const int sA2,
float * Z, const int sZ0, const int sZ1)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ float buf[];
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
{
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
{
float myresult =
%(reduce_init)
s;
for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)
{
%(reduce_fct)
s;
}
%(reducebuf)
s
}
}
}
"""
%
locals
()
if
self
.
reduce_mask
==
(
0
,
0
,
1
,
1
):
# this kernel uses one block for each row,
# threads per block for each element per row.
reducebuf
=
self
.
_k_reduce_buf
(
'Z[i0 * sZ0 + i1 * sZ1]'
,
node
,
nodename
,
sub
=
{})
decl
=
self
.
_k_decl
(
node
,
nodename
)
init
=
self
.
_k_init
(
node
,
nodename
)
reduce_fct
=
self
.
_assign_reduce
(
node
,
nodename
,
"myresult"
,
"A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3]"
,
{})
reduce_init
=
self
.
_assign_init
(
"A[i0 * sA0 + i1 * sA1]"
)
print
>>
sio
,
"""
%(decl)
s
{
%(init)
s
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
{
for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
{
float myresult =
%(reduce_init)
s;
for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
{
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
{
%(reduce_fct)
s;
}
}
%(reducebuf)
s
}
}
}
"""
%
locals
()
if
self
.
reduce_mask
==
(
0
,
1
,
0
,
1
):
# this kernel uses one block for each row,
# threads per block for each element per row.
reducebuf
=
self
.
_k_reduce_buf
(
'Z[i0 * sZ0 + i2 * sZ1]'
,
node
,
nodename
,
sub
=
{})
decl
=
self
.
_k_decl
(
node
,
nodename
)
init
=
self
.
_k_init
(
node
,
nodename
)
reduce_fct
=
self
.
_assign_reduce
(
node
,
nodename
,
"myresult"
,
"A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3]"
,
{})
reduce_init
=
self
.
_assign_init
(
"A[i0 * sA0 + i2 * sA2]"
)
print
>>
sio
,
"""
%(decl)
s
{
%(init)
s
for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x)
{
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
{
float myresult =
%(reduce_init)
s;
for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)
{
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
{
%(reduce_fct)
s;
}
}
%(reducebuf)
s
}
}
}
"""
%
locals
()
if
self
.
reduce_mask
==
(
1
,
1
,
1
,
1
):
reducebuf
=
self
.
_k_reduce_buf
(
'Z[0]'
,
node
,
nodename
,
sub
=
{})
decl
=
self
.
_k_decl
(
node
,
nodename
)
init
=
self
.
_k_init
(
node
,
nodename
)
reduce_fct
=
self
.
_assign_reduce
(
node
,
nodename
,
"myresult"
,
"A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3]"
,
{})
reduce_init
=
self
.
_assign_init
(
"A[0]"
)
print
>>
sio
,
"""
%(decl)
s
{
%(init)
s
myresult =
%(reduce_init)
s;
for (int i0 = 0; i0 < d0; i0++)
for (int i1 = threadIdx.z; i1 < d1; i1 += blockDim.z)
{
for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
{
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
{
%(reduce_fct)
s;
}
}
}
%(reducebuf)
s
}
"""
%
locals
()
if
self
.
reduce_mask
==
(
1
,
0
,
1
,
1
):
reducebuf
=
self
.
_k_reduce_buf
(
'Z[blockIdx.x*sZ0]'
,
node
,
nodename
,
sub
=
{})
reduce_fct
=
self
.
_assign_reduce
(
node
,
nodename
,
"myresult"
,
"A[i0 * sA0 + blockIdx.x * sA1 + i2 * sA2 + i3 * sA3]"
,
{})
reduce_init
=
self
.
_assign_init
(
"A[blockIdx.x * sA1]"
)
print
>>
sio
,
"""
static __global__ void kernel_reduce_1011_
%(nodename)
s(
const unsigned int d0,
const unsigned int d1,
const unsigned int d2,
const unsigned int d3,
const float *A, const int sA0, const int sA1,
const int sA2, const int sA3,
float * Z, const int sZ0)
{
const int threadCount = blockDim.x * blockDim.y * blockDim.z;
const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
extern __shared__ float buf[];
float myresult =
%(reduce_init)
s;
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z)
{
for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
{
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
{
%(reduce_fct)
s;
}
}
}
%(reducebuf)
s
}
"""
%
locals
()
print
>>
sio
,
"""
template <typename T>
static T ceil_intdiv(T a, T b)
{
return (a/b) + ((a
%
b) ? 1: 0);
}
"""
return
sio
.
getvalue
()
class
GpuCAReduceCPY
(
GpuKernelBase
,
HideC
,
CAReduceDtype
):
"""CAReduce that reuse the python code from compyte.
Too slow for now as it only have a python interface.
"""
def
__init__
(
self
,
scalar_op
,
axis
=
None
,
dtype
=
None
,
acc_dtype
=
None
):
def
__init__
(
self
,
scalar_op
,
axis
=
None
,
dtype
=
None
,
acc_dtype
=
None
):
if
not
hasattr
(
scalar_op
,
'identity'
):
if
not
hasattr
(
scalar_op
,
'identity'
):
raise
ValueError
(
"No identity on scalar op"
)
raise
ValueError
(
"No identity on scalar op"
)
...
@@ -550,7 +2329,7 @@ class GpuCAReduce(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -550,7 +2329,7 @@ class GpuCAReduce(GpuKernelBase, HideC, CAReduceDtype):
def
make_thunk
(
self
,
node
,
storage_map
,
compute_map
,
no_recycling
):
def
make_thunk
(
self
,
node
,
storage_map
,
compute_map
,
no_recycling
):
# cache the kernel object
# cache the kernel object
self
.
get_kernel_cache
(
node
)
self
.
get_kernel_cache
(
node
)
return
super
(
GpuCAReduce
,
self
)
.
make_thunk
(
node
,
storage_map
,
return
super
(
GpuCAReduce
CPY
,
self
)
.
make_thunk
(
node
,
storage_map
,
compute_map
,
no_recycling
)
compute_map
,
no_recycling
)
def
get_kernel_cache
(
self
,
node
):
def
get_kernel_cache
(
self
,
node
):
...
@@ -732,7 +2511,7 @@ class GpuCAReduce(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -732,7 +2511,7 @@ class GpuCAReduce(GpuKernelBase, HideC, CAReduceDtype):
err = GpuKernel_call(&
%(k_var)
s, 0,
%(ls)
s, gs, args);
err = GpuKernel_call(&
%(k_var)
s, 0,
%(ls)
s, gs, args);
if (err != GA_NO_ERROR) {
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
PyErr_Format(PyExc_RuntimeError,
"compyte error: GpuCAReduce:
%%
s.",
"compyte error: GpuCAReduce
CPY
:
%%
s.",
GpuKernel_error(&
%(k_var)
s, err));
GpuKernel_error(&
%(k_var)
s, err));
%(fail)
s
%(fail)
s
}
}
...
@@ -741,7 +2520,7 @@ class GpuCAReduce(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -741,7 +2520,7 @@ class GpuCAReduce(GpuKernelBase, HideC, CAReduceDtype):
err = GpuArray_move(&
%(output)
s->ga, &tmp->ga);
err = GpuArray_move(&
%(output)
s->ga, &tmp->ga);
if (err != GA_NO_ERROR) {
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
PyErr_Format(PyExc_RuntimeError,
"compyte error: GpuCAReduce [cast]:
%%
s.",
"compyte error: GpuCAReduce
CPY
[cast]:
%%
s.",
GpuArray_error(&tmp->ga, err));
GpuArray_error(&tmp->ga, err));
%(fail)
s
%(fail)
s
}
}
...
@@ -788,3 +2567,5 @@ class GpuCAReduce(GpuKernelBase, HideC, CAReduceDtype):
...
@@ -788,3 +2567,5 @@ class GpuCAReduce(GpuKernelBase, HideC, CAReduceDtype):
else
:
else
:
output
[
0
]
=
pygpu
.
gpuarray
.
array
(
input
,
copy
=
True
,
output
[
0
]
=
pygpu
.
gpuarray
.
array
(
input
,
copy
=
True
,
dtype
=
node
.
outputs
[
0
]
.
type
.
dtype
)
dtype
=
node
.
outputs
[
0
]
.
type
.
dtype
)
# To allow reloading old pickled files
GpuCAReduce
=
GpuCAReduceCPY
theano/sandbox/gpuarray/opt.py
浏览文件 @
06cdb1fa
...
@@ -14,9 +14,7 @@ from theano.sandbox.gpuarray.type import GpuArrayType
...
@@ -14,9 +14,7 @@ from theano.sandbox.gpuarray.type import GpuArrayType
from
theano.sandbox.gpuarray.basic_ops
import
(
host_from_gpu
,
from
theano.sandbox.gpuarray.basic_ops
import
(
host_from_gpu
,
gpu_from_host
,
gpu_from_host
,
gpu_alloc
,
gpu_alloc
,
gpu_shape
,
GpuAlloc
,
GpuAlloc
,
GpuShape
,
GpuReshape
,
GpuReshape
,
GpuEye
)
GpuEye
)
from
theano.sandbox.gpuarray.blas
import
gpu_dot22
,
GpuGemv
,
GpuGemm
from
theano.sandbox.gpuarray.blas
import
gpu_dot22
,
GpuGemv
,
GpuGemm
...
@@ -24,7 +22,7 @@ from theano.sandbox.gpuarray.conv import GpuConv
...
@@ -24,7 +22,7 @@ from theano.sandbox.gpuarray.conv import GpuConv
from
theano.sandbox.gpuarray.nnet
import
(
GpuCrossentropySoftmaxArgmax1HotWithBias
,
from
theano.sandbox.gpuarray.nnet
import
(
GpuCrossentropySoftmaxArgmax1HotWithBias
,
GpuCrossentropySoftmax1HotWithBiasDx
)
GpuCrossentropySoftmax1HotWithBiasDx
)
from
theano.sandbox.gpuarray.elemwise
import
(
GpuElemwise
,
_is_scalar
,
from
theano.sandbox.gpuarray.elemwise
import
(
GpuElemwise
,
_is_scalar
,
GpuDimShuffle
,
GpuCAReduce
)
GpuDimShuffle
,
GpuCAReduce
Cuda
)
from
theano.sandbox.gpuarray.subtensor
import
GpuIncSubtensor
,
GpuSubtensor
from
theano.sandbox.gpuarray.subtensor
import
GpuIncSubtensor
,
GpuSubtensor
from
theano.sandbox.gpuarray.type
import
GpuArrayConstant
from
theano.sandbox.gpuarray.type
import
GpuArrayConstant
...
@@ -249,9 +247,57 @@ def local_gpua_incsubtensor(node):
...
@@ -249,9 +247,57 @@ def local_gpua_incsubtensor(node):
def
local_gpua_careduce
(
node
):
def
local_gpua_careduce
(
node
):
if
(
isinstance
(
node
.
op
.
scalar_op
,
scalar
.
basic
.
Add
)
or
if
(
isinstance
(
node
.
op
.
scalar_op
,
scalar
.
basic
.
Add
)
or
isinstance
(
node
.
op
.
scalar_op
,
scalar
.
basic
.
Mul
)):
isinstance
(
node
.
op
.
scalar_op
,
scalar
.
basic
.
Mul
)):
return
GpuCAReduce
(
node
.
op
.
scalar_op
,
axis
=
node
.
op
.
axis
,
x
,
=
node
.
inputs
dtype
=
getattr
(
node
.
op
,
'dtype'
,
None
),
greduce
=
GpuCAReduceCuda
(
node
.
op
.
scalar_op
,
axis
=
node
.
op
.
axis
)
acc_dtype
=
getattr
(
node
.
op
,
'acc_dtype'
,
None
))
if
x
.
dtype
!=
"float32"
:
return
gvar
=
greduce
(
x
)
#We need to have the make node called, otherwise the mask can
#be None
if
gvar
.
owner
.
op
.
supports_c_code
([
gpu_from_host
(
x
)]):
return
greduce
else
:
# Try to make a simpler pattern based on reshaping
# The principle is that if two adjacent dimensions have
# the same value in the reduce_mask, then we can reshape
# to make them a single dimension, do the reduction, and
# then reshape to get them back.
if
node
.
op
.
axis
is
None
:
reduce_mask
=
[
1
]
*
x
.
type
.
ndim
else
:
reduce_mask
=
[
0
]
*
x
.
type
.
ndim
for
a
in
node
.
op
.
axis
:
assert
reduce_mask
[
a
]
==
0
reduce_mask
[
a
]
=
1
shape_of
=
node
.
fgraph
.
shape_feature
.
shape_of
x_shape
=
shape_of
[
x
]
new_in_shp
=
[
x_shape
[
0
]]
new_mask
=
[
reduce_mask
[
0
]]
for
i
in
xrange
(
1
,
x
.
type
.
ndim
):
if
reduce_mask
[
i
]
==
reduce_mask
[
i
-
1
]:
new_in_shp
[
-
1
]
*=
x_shape
[
i
]
else
:
new_mask
.
append
(
reduce_mask
[
i
])
new_in_shp
.
append
(
x_shape
[
i
])
new_greduce
=
GpuCAReduceCuda
(
new_mask
,
scalar_op
)
reshaped_x
=
x
.
reshape
(
tensor
.
stack
(
*
new_in_shp
))
gpu_reshaped_x
=
gpu_from_host
(
reshaped_x
)
reshaped_gpu_inputs
=
[
gpu_reshaped_x
]
if
new_greduce
.
supports_c_code
(
reshaped_gpu_inputs
):
reduce_reshaped_x
=
host_from_gpu
(
new_greduce
(
gpu_reshaped_x
))
if
reduce_reshaped_x
.
ndim
!=
node
.
outputs
[
0
]
.
ndim
:
unreshaped_reduce
=
reduce_reshaped_x
.
reshape
(
tensor
.
stack
(
*
shape_of
[
node
.
outputs
[
0
]]))
else
:
unreshaped_reduce
=
reduce_reshaped_x
return
[
unreshaped_reduce
]
@register_opt
()
@register_opt
()
...
@@ -296,20 +342,6 @@ def local_gpua_crossentropysoftmax1hotwithbiasdx(node):
...
@@ -296,20 +342,6 @@ def local_gpua_crossentropysoftmax1hotwithbiasdx(node):
return
GpuCrossentropySoftmax1HotWithBiasDx
()
return
GpuCrossentropySoftmax1HotWithBiasDx
()
@register_opt
()
@local_optimizer
([
tensor
.
Shape
])
def
local_gpua_shape
(
node
):
"""
Can't use op_lifter as the output is on the GPU.
"""
if
isinstance
(
node
.
op
,
tensor
.
Shape
):
x
,
=
node
.
inputs
if
x
.
owner
and
x
.
owner
.
op
==
host_from_gpu
:
gpu_x
,
=
x
.
owner
.
inputs
return
[
gpu_shape
(
gpu_x
)]
return
False
@register_opt
()
@register_opt
()
@op_lifter
([
gpu_from_host
,
ConvOp
])
@op_lifter
([
gpu_from_host
,
ConvOp
])
def
local_gpu_conv
(
node
):
def
local_gpu_conv
(
node
):
...
...
theano/sandbox/gpuarray/tests/test_basic_ops.py
浏览文件 @
06cdb1fa
...
@@ -36,7 +36,7 @@ from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host,
...
@@ -36,7 +36,7 @@ from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host,
gpu_alloc
,
gpu_from_cuda
,
gpu_alloc
,
gpu_from_cuda
,
cuda_from_gpu
,
HostFromGpu
,
cuda_from_gpu
,
HostFromGpu
,
GpuFromHost
,
GpuReshape
,
GpuFromHost
,
GpuReshape
,
GpuEye
,
GpuShape
)
GpuEye
)
from
theano.tests
import
unittest_tools
as
utt
from
theano.tests
import
unittest_tools
as
utt
utt
.
seed_rng
()
utt
.
seed_rng
()
...
@@ -307,7 +307,7 @@ def test_shape():
...
@@ -307,7 +307,7 @@ def test_shape():
topo
=
f
.
maker
.
fgraph
.
toposort
()
topo
=
f
.
maker
.
fgraph
.
toposort
()
assert
numpy
.
all
(
f
(
v
)
==
(
3
,
4
,
5
))
assert
numpy
.
all
(
f
(
v
)
==
(
3
,
4
,
5
))
assert
len
(
topo
)
==
1
assert
len
(
topo
)
==
1
assert
isinstance
(
topo
[
0
]
.
op
,
Gpu
Shape
)
assert
isinstance
(
topo
[
0
]
.
op
,
T
.
Shape
)
class
G_reshape
(
T_reshape
):
class
G_reshape
(
T_reshape
):
...
...
theano/sandbox/gpuarray/tests/test_elemwise.py
浏览文件 @
06cdb1fa
...
@@ -10,7 +10,7 @@ from theano.tensor.tests.test_elemwise import (test_Broadcast, test_DimShuffle,
...
@@ -10,7 +10,7 @@ from theano.tensor.tests.test_elemwise import (test_Broadcast, test_DimShuffle,
from
theano.sandbox.gpuarray.tests.test_basic_ops
import
rand_gpuarray
from
theano.sandbox.gpuarray.tests.test_basic_ops
import
rand_gpuarray
from
theano.sandbox.gpuarray.elemwise
import
(
GpuElemwise
,
GpuDimShuffle
,
from
theano.sandbox.gpuarray.elemwise
import
(
GpuElemwise
,
GpuDimShuffle
,
GpuCAReduce
)
GpuCAReduce
Cuda
,
GpuCAReduceCPY
)
from
theano.sandbox.gpuarray.type
import
GpuArrayType
from
theano.sandbox.gpuarray.type
import
GpuArrayType
from
pygpu.array
import
gpuarray
from
pygpu.array
import
gpuarray
...
@@ -37,10 +37,11 @@ class test_gpu_Broadcast(test_Broadcast):
...
@@ -37,10 +37,11 @@ class test_gpu_Broadcast(test_Broadcast):
class
test_GpuDimShuffle
(
test_DimShuffle
):
class
test_GpuDimShuffle
(
test_DimShuffle
):
op
=
GpuDimShuffle
op
=
GpuDimShuffle
class
test_GpuCAReduce
(
test_CAReduce
):
class
test_GpuCAReduceCPY
(
test_CAReduce
):
dtypes
=
[
"float32"
]
dtypes
=
[
"float32"
]
bin_dtypes
=
[
"uint8"
,
"int8"
]
bin_dtypes
=
[
"uint8"
,
"int8"
]
op
=
GpuCAReduce
op
=
GpuCAReduce
CPY
reds
=
[
scalar
.
add
,
scalar
.
mul
]
reds
=
[
scalar
.
add
,
scalar
.
mul
]
def
test_perform
(
self
):
def
test_perform
(
self
):
...
@@ -64,3 +65,87 @@ class test_GpuCAReduce(test_CAReduce):
...
@@ -64,3 +65,87 @@ class test_GpuCAReduce(test_CAReduce):
for
op
in
self
.
reds
:
for
op
in
self
.
reds
:
self
.
with_linker
(
gof
.
CLinker
(),
op
,
dtype
=
dtype
,
self
.
with_linker
(
gof
.
CLinker
(),
op
,
dtype
=
dtype
,
test_nan
=
True
)
test_nan
=
True
)
def
test_infer_shape
(
self
):
for
dtype
in
self
.
dtypes
:
test_CAReduce
.
test_infer_shape
(
self
,
dtype
)
class
test_GpuCAReduceCuda
(
test_GpuCAReduceCPY
):
dtypes
=
[
"float32"
]
bin_dtypes
=
[
"uint8"
,
"int8"
]
bin_dtypes
=
[]
cases
=
[((
5
,
6
),
None
),
((
5
,
6
),
(
0
,
1
)),
((
5
,
6
),
(
0
,
)),
((
5
,
6
),
(
1
,
)),
((
5
,
6
),
(
-
1
,
)),
((
5
,
6
),
(
-
2
,
)),
#((5, 6), ()), #reduce on no axis(copy) isn't implemented
#((2, 3, 4, 5), (0, 1, 3)), mask 1101 isn't implemented
#((2, 3, 4, 5), (-2, -3)), mask 0110 isn't implemented
((
5
,
0
),
None
),
((
5
,
0
),
(
0
,
)),
((
5
,
0
),
(
1
,
)),
#((5, 0), ()), reduce on no axis isn't implemented
#((), None), reduce on no axis isn't implemented
#((), ()) reduce on no axis isn't implemented
#Test all GPU cases implemented
((
1
,
0
),(
1
,)),
((
0
,
1
),(
1
,)),
((
0
,
0
),(
1
,)),
((
0
,
0
,
0
),(
1
,
2
)),
((
0
,
0
,
0
,
0
),(
1
,
2
,
3
)),
((
2
,
1
),(
1
,)),
((
1
,
2
),(
1
,)),
((
100
,
3
,
1300
),[
1
]),
((
0
,),[
0
]),((
5
,),[
0
]),
((
0
,
0
),[
0
,
1
]),((
1
,
0
),[
0
,
1
]),((
5
,
4
),[
0
,
1
]),((
33
,
31
),[
0
,
1
]),((
5
,
4
),[
1
]),((
5
,
4
),[
0
]),
#need something bigger then 32 for some opt test.
((
5
,
4
,
3
),[
0
]),((
5
,
4
,
3
),[
1
]),((
5
,
4
,
3
),[
0
,
1
]),((
5
,
4
,
3
),[
2
]),((
5
,
4
,
3
),[
1
,
2
]),((
5
,
4
,
3
),[
0
,
1
,
2
]),
((
0
,
0
,
0
,
0
),[
0
,
1
,
2
,
3
]),
((
5
,
4
,
3
,
20
),[
2
,
3
]),
((
5
,
4
,
3
,
2
),[
0
,
1
,
2
,
3
]),
((
5
,
4
,
3
,
2
),[
0
,
2
,
3
]),((
5
,
4
,
3
,
2
),[
1
,
2
,
3
]),
#test shape bigger then 4096 on each dimension to make sure that we work correctly when we don't have enough thread/block in each dimensions
((
4100
,
3
),[
0
]),((
3
,
4101
),[
0
]),
#10
((
1024
,
33
),[
0
]),((
33
,
1024
),[
0
]),
#10
((
1025
,
33
),[
0
]),((
33
,
1025
),[
0
]),
#10
((
4100
,
3
),[
1
]),((
3
,
4101
),[
1
]),
#01
((
1024
,
33
),[
1
]),((
33
,
1024
),[
1
]),
#01
((
1025
,
33
),[
1
]),((
33
,
1025
),[
1
]),
#01
((
4100
,
3
),[
0
,
1
]),((
3
,
4101
),[
0
,
1
]),
#11
((
1024
,
33
),[
0
,
1
]),((
33
,
1024
),[
0
,
1
]),
#01
((
1025
,
33
),[
0
,
1
]),((
33
,
1025
),[
0
,
1
]),
#01
((
4100
,
4
,
3
),[
0
]),((
5
,
4100
,
3
),[
0
]),((
5
,
4
,
4100
),[
0
]),
((
3
,
65536
,
1
),
[
0
]),
#100
((
4100
,
4
,
3
),[
1
]),((
5
,
4100
,
3
),[
1
]),((
5
,
4
,
4100
),[
1
]),
#010
((
4100
,
4
,
3
),[
2
]),((
5
,
4100
,
3
),[
2
]),((
5
,
4
,
4100
),[
2
]),
#001
((
4100
,
4
,
3
),[
0
,
1
]),((
5
,
4100
,
3
),[
0
,
1
]),((
5
,
4
,
4100
),[
0
,
1
]),
#110
((
4100
,
4
,
3
),[
1
,
2
]),((
5
,
4100
,
3
),[
1
,
2
]),((
5
,
4
,
4100
),[
1
,
2
]),
#011
#((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
((
4100
,
4
,
3
),[
0
,
1
,
2
]),((
5
,
4100
,
3
),[
0
,
1
,
2
]),((
5
,
4
,
4100
),[
0
,
1
,
2
]),
#111
((
4100
,
4
,
3
,
2
),[
2
,
3
]),((
4
,
4100
,
3
,
2
),[
2
,
3
]),((
4
,
3
,
4100
,
2
),[
2
,
3
]),((
4
,
3
,
2
,
4100
),[
2
,
3
]),
#0011
((
4100
,
4
,
3
,
2
),[
1
,
3
]),((
4
,
4100
,
3
,
2
),[
1
,
3
]),((
4
,
3
,
4100
,
2
),[
1
,
3
]),((
4
,
3
,
2
,
4100
),[
1
,
3
]),
#0101
((
4100
,
4
,
3
,
2
),[
0
,
2
,
3
]),((
4
,
4100
,
3
,
2
),[
0
,
2
,
3
]),((
4
,
3
,
4100
,
2
),[
0
,
2
,
3
]),
#((4,3,2,4100),[0,2,3]),#1011
((
4100
,
4
,
3
,
2
),[
1
,
2
,
3
]),((
4
,
4100
,
3
,
2
),[
1
,
2
,
3
]),((
4
,
3
,
4100
,
2
),[
1
,
2
,
3
]),((
4
,
3
,
2
,
4100
),[
1
,
2
,
3
]),
#0111
((
4100
,
2
,
3
,
4
),[
0
,
1
,
2
,
3
]),((
2
,
4100
,
3
,
4
),[
0
,
1
,
2
,
3
]),((
2
,
3
,
4100
,
4
),[
0
,
1
,
2
,
3
]),((
2
,
3
,
4
,
4100
),[
0
,
1
,
2
,
3
]),((
128
,
1
,
3
,
3
),
[
0
,
1
,
2
,
3
]),
#1111
#test pattern implemented by reshape
# ((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1000
# ((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100
# ((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010
# ((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
# ((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
# ((5,4,3,10,11),[1,2]),
]
op
=
GpuCAReduceCuda
reds
=
[
scalar
.
add
,
scalar
.
mul
]
def
test_perform
(
self
):
return
def
test_perform_nan
(
self
):
return
theano/sandbox/gpuarray/tests/test_opt.py
浏览文件 @
06cdb1fa
...
@@ -3,7 +3,7 @@ import numpy
...
@@ -3,7 +3,7 @@ import numpy
import
theano
import
theano
from
theano.tests
import
unittest_tools
as
utt
from
theano.tests
import
unittest_tools
as
utt
from
theano.sandbox.gpuarray.basic_ops
import
GpuAlloc
,
GpuReshape
,
gpu_alloc
from
theano.sandbox.gpuarray.basic_ops
import
GpuAlloc
,
GpuReshape
,
gpu_alloc
from
theano.sandbox.gpuarray.elemwise
import
GpuCAReduce
from
theano.sandbox.gpuarray.elemwise
import
GpuCAReduce
Cuda
import
theano.sandbox.gpuarray
import
theano.sandbox.gpuarray
from
theano.tests.unittest_tools
import
SkipTest
from
theano.tests.unittest_tools
import
SkipTest
...
@@ -69,7 +69,7 @@ def test_sum_prod():
...
@@ -69,7 +69,7 @@ def test_sum_prod():
res
=
f
(
val
)
res
=
f
(
val
)
utt
.
assert_allclose
(
res
,
val
.
sum
())
utt
.
assert_allclose
(
res
,
val
.
sum
())
assert
res
.
shape
==
()
assert
res
.
shape
==
()
assert
GpuCAReduce
in
[
type
(
node
.
op
)
assert
GpuCAReduce
Cuda
in
[
type
(
node
.
op
)
for
node
in
f
.
maker
.
fgraph
.
toposort
()]
for
node
in
f
.
maker
.
fgraph
.
toposort
()]
...
...
theano/sandbox/gpuarray/type.py
浏览文件 @
06cdb1fa
...
@@ -298,6 +298,23 @@ theano.compile.register_view_op_c_code(GpuArrayType, """
...
@@ -298,6 +298,23 @@ theano.compile.register_view_op_c_code(GpuArrayType, """
Py_XINCREF(
%(oname)
s);
Py_XINCREF(
%(oname)
s);
"""
,
version
=
(
0
,))
"""
,
version
=
(
0
,))
# Register GpuArrayType C code for Shape Op.
theano
.
compile
.
register_shape_c_code
(
GpuArrayType
,
"""
npy_intp shape[] = {
%(iname)
s->ga.nd};
if(
%(oname)
s == NULL || (PyArray_DIMS(
%(oname)
s)[0] != shape[0]))
{
Py_XDECREF(
%(oname)
s);
%(oname)
s = (PyArrayObject*) PyArray_SimpleNew(1, shape, NPY_INT64);
}
for(int i=0;i<shape[0];i++)
{
((npy_int64*)PyArray_GETPTR1(
%(oname)
s, i))[0] =
%(iname)
s->ga.dimensions[i];
}
"""
,
version
=
1
)
theano
.
compile
.
register_shape_i_c_code
(
GpuArrayType
,
"""
theano
.
compile
.
register_shape_i_c_code
(
GpuArrayType
,
"""
if(!
%(oname)
s)
if(!
%(oname)
s)
%(oname)
s=(PyArrayObject*)PyArray_ZEROS(0, NULL, NPY_INT64, 0);
%(oname)
s=(PyArrayObject*)PyArray_ZEROS(0, NULL, NPY_INT64, 0);
...
...
theano/tensor/basic.py
浏览文件 @
06cdb1fa
...
@@ -25,6 +25,8 @@ from theano.gof.python25 import partial, any, all
...
@@ -25,6 +25,8 @@ from theano.gof.python25 import partial, any, all
from
theano.gof.utils
import
hashtype
from
theano.gof.utils
import
hashtype
from
theano
import
compile
,
printing
from
theano
import
compile
,
printing
from
theano.printing
import
pprint
,
min_informative_str
from
theano.printing
import
pprint
,
min_informative_str
from
theano.compile
import
Shape
,
shape
#For history
# We use these exceptions as well.
# We use these exceptions as well.
import
theano.scalar.sharedvar
import
theano.scalar.sharedvar
...
@@ -1125,83 +1127,6 @@ def cast(x, dtype):
...
@@ -1125,83 +1127,6 @@ def cast(x, dtype):
##########################
##########################
class
Shape
(
Op
):
"""
L{Op} to return the shape of a matrix.
@note: Non-differentiable.
"""
def
__hash__
(
self
):
return
hash
(
type
(
self
))
def
__eq__
(
self
,
other
):
return
type
(
self
)
==
type
(
other
)
def
__str__
(
self
):
return
self
.
__class__
.
__name__
def
make_node
(
self
,
x
):
# Must work for all type that have a shape attribute.
# This will fail at execution time.
x
=
as_tensor_variable
(
x
)
# Each type variable should implement their .shape attribute
# and have the fct infer_shape() implemented in the op that convert
# the type to TensorVariable to have the optimization working
# correctly.
return
Apply
(
self
,
[
x
],
[
lvector
()])
def
perform
(
self
,
node
,
inp
,
out_
):
x
,
=
inp
out
,
=
out_
out
[
0
]
=
theano
.
_asarray
(
x
.
shape
,
dtype
=
'int64'
)
def
infer_shape
(
self
,
node
,
in_shapes
):
return
[[
len
(
in_shapes
[
0
])]]
def
connection_pattern
(
self
,
node
):
# the grad returns the gradient with respect to the
# elements of a tensor variable
# the elements of the tensor variable do not participate
# in the computation of the shape, so they are not really
# part of the graph
return
[[
False
]]
def
grad
(
self
,
inp
,
grads
):
# the grad returns the gradient with respect to the
# elements of a tensor variable
# the elements of the tensor variable do not participate
# in the computation of the shape, so they are not really
# part of the graph
return
[
DisconnectedType
()()]
def
R_op
(
self
,
inputs
,
eval_points
):
return
[
None
]
def
c_code
(
self
,
node
,
nodename
,
inp
,
out
,
sub
):
x
,
=
inp
z
,
=
out
if
isinstance
(
node
.
inputs
[
0
]
.
type
,
TensorType
):
return
"""
npy_intp shape[] = {PyArray_NDIM(
%(x)
s)};
if(
%(z)
s == NULL || (PyArray_DIMS(
%(z)
s)[0] != shape[0]))
{
Py_XDECREF(
%(z)
s);
%(z)
s = (PyArrayObject*) PyArray_SimpleNew(1, shape, NPY_INT64);
}
for(int i=0;i<shape[0];i++)
{
((npy_int64*)PyArray_GETPTR1(
%(z)
s, i))[0] = PyArray_DIMS(
%(x)
s)[i];
}
"""
%
locals
()
else
:
#TODO: if your type is not listed here, make a damn registry of
# shape_i ops for various types of variables.
# Do not continue this madness.
return
super
(
Shape
,
self
)
.
c_code
(
node
,
nodename
,
(
x
,),
(
out
,),
sub
)
def
c_code_cache_version
(
self
):
return
(
1
,)
@constructor
@constructor
def
old_shape
(
a
):
def
old_shape
(
a
):
"""
"""
...
@@ -1223,10 +1148,6 @@ def old_shape(a):
...
@@ -1223,10 +1148,6 @@ def old_shape(a):
# a tuple directly. This tuple is like the numpy.ndarray.shape tuple.
# a tuple directly. This tuple is like the numpy.ndarray.shape tuple.
return
va
.
type
.
shape
return
va
.
type
.
shape
shape
=
Shape
()
_shape
=
shape
# was used in the past, now use shape directly.
pprint
.
assign
(
_shape
,
printing
.
MemberPrinter
(
'shape'
))
class
SpecifyShape
(
Op
):
class
SpecifyShape
(
Op
):
"""
"""
...
...
theano/tensor/nnet/nnet.py
浏览文件 @
06cdb1fa
...
@@ -1398,7 +1398,7 @@ def _check_rows_is_arange_len_labels(rows, labels):
...
@@ -1398,7 +1398,7 @@ def _check_rows_is_arange_len_labels(rows, labels):
shape_subtensor
=
stop
.
owner
shape_subtensor
=
stop
.
owner
if
list
(
shape_subtensor
.
op
.
idx_list
)
==
[
0
]:
if
list
(
shape_subtensor
.
op
.
idx_list
)
==
[
0
]:
shape_var
,
=
shape_subtensor
.
inputs
shape_var
,
=
shape_subtensor
.
inputs
if
shape_var
.
owner
and
shape_var
.
owner
.
op
==
tensor
.
_
shape
:
if
shape_var
.
owner
and
shape_var
.
owner
.
op
==
tensor
.
shape
:
return
shape_var
.
owner
.
inputs
[
0
]
is
labels
return
shape_var
.
owner
.
inputs
[
0
]
is
labels
else
:
else
:
shape_of
=
stop
.
owner
.
fgraph
.
shape_feature
.
shape_of
shape_of
=
stop
.
owner
.
fgraph
.
shape_feature
.
shape_of
...
...
theano/tensor/tests/test_elemwise.py
浏览文件 @
06cdb1fa
...
@@ -284,10 +284,7 @@ class test_Broadcast(unittest.TestCase):
...
@@ -284,10 +284,7 @@ class test_Broadcast(unittest.TestCase):
class
test_CAReduce
(
unittest_tools
.
InferShapeTester
):
class
test_CAReduce
(
unittest_tools
.
InferShapeTester
):
op
=
CAReduce
op
=
CAReduce
cases
=
[((
5
,
6
),
None
),
def
with_linker
(
self
,
linker
,
scalar_op
=
scalar
.
add
,
dtype
=
"floatX"
,
test_nan
=
False
,
tensor_op
=
None
):
for
xsh
,
tosum
in
[((
5
,
6
),
None
),
((
5
,
6
),
(
0
,
1
)),
((
5
,
6
),
(
0
,
1
)),
((
5
,
6
),
(
0
,
)),
((
5
,
6
),
(
0
,
)),
((
5
,
6
),
(
1
,
)),
((
5
,
6
),
(
1
,
)),
...
@@ -301,7 +298,12 @@ class test_CAReduce(unittest_tools.InferShapeTester):
...
@@ -301,7 +298,12 @@ class test_CAReduce(unittest_tools.InferShapeTester):
((
5
,
0
),
(
1
,
)),
((
5
,
0
),
(
1
,
)),
((
5
,
0
),
()),
((
5
,
0
),
()),
((),
None
),
((),
None
),
((),
())]:
((),
())
]
def
with_linker
(
self
,
linker
,
scalar_op
=
scalar
.
add
,
dtype
=
"floatX"
,
test_nan
=
False
,
tensor_op
=
None
):
for
xsh
,
tosum
in
self
.
cases
:
if
dtype
==
"floatX"
:
if
dtype
==
"floatX"
:
dtype
=
theano
.
config
.
floatX
dtype
=
theano
.
config
.
floatX
x
=
TensorType
(
dtype
,
[(
entry
==
1
)
for
entry
in
xsh
])(
'x'
)
x
=
TensorType
(
dtype
,
[(
entry
==
1
)
for
entry
in
xsh
])(
'x'
)
...
@@ -400,17 +402,22 @@ class test_CAReduce(unittest_tools.InferShapeTester):
...
@@ -400,17 +402,22 @@ class test_CAReduce(unittest_tools.InferShapeTester):
if
scalar_op
in
[
scalar
.
and_
,
scalar
.
or_
]:
if
scalar_op
in
[
scalar
.
and_
,
scalar
.
or_
]:
zv
=
numpy
.
asarray
(
zv
,
dtype
=
'int8'
)
zv
=
numpy
.
asarray
(
zv
,
dtype
=
'int8'
)
if
test_nan
:
if
test_nan
:
self
.
assertTrue
(
theano
.
tensor
.
TensorType
.
values_eq
(
f
(
xv
),
try
:
zv
),
self
.
assertTrue
(
theano
.
tensor
.
TensorType
.
values_eq
(
f
(
xv
),
zv
),
(
f
(
xv
),
zv
))
(
f
(
xv
),
zv
))
except
NotImplementedError
:
# GpuCAReduce don't implement all cases when size is 0
assert
xv
.
size
==
0
else
:
else
:
try
:
f_xv
=
f
(
xv
)
f_xv
=
f
(
xv
)
self
.
assertTrue
((
f_xv
.
shape
==
zv
.
shape
),
(
f_xv
,
zv
))
self
.
assertTrue
((
f_xv
.
shape
==
zv
.
shape
),
(
f_xv
,
zv
))
self
.
assertTrue
(
numpy
.
allclose
(
f_xv
,
zv
),
(
f_xv
,
zv
))
self
.
assertTrue
(
numpy
.
allclose
(
f_xv
,
zv
),
(
f_xv
,
zv
))
except
NotImplementedError
:
# GpuCAReduce don't implement all cases when size is 0
assert
xv
.
size
==
0
#test CAReduce.infer_shape
#the Shape op don't implement c_code!
if
isinstance
(
linker
,
gof
.
PerformLinker
):
x
=
TensorType
(
dtype
,
[(
entry
==
1
)
for
entry
in
xsh
])(
'x'
)
x
=
TensorType
(
dtype
,
[(
entry
==
1
)
for
entry
in
xsh
])(
'x'
)
if
tensor_op
is
None
:
if
tensor_op
is
None
:
e
=
self
.
op
(
scalar_op
,
axis
=
tosum
)(
x
)
e
=
self
.
op
(
scalar_op
,
axis
=
tosum
)(
x
)
...
@@ -422,7 +429,11 @@ class test_CAReduce(unittest_tools.InferShapeTester):
...
@@ -422,7 +429,11 @@ class test_CAReduce(unittest_tools.InferShapeTester):
[
e
.
shape
]))
.
make_function
()
[
e
.
shape
]))
.
make_function
()
if
not
(
scalar_op
in
[
scalar
.
maximum
,
scalar
.
minimum
]
and
if
not
(
scalar_op
in
[
scalar
.
maximum
,
scalar
.
minimum
]
and
((
xsh
==
()
or
numpy
.
prod
(
xsh
)
==
0
))):
((
xsh
==
()
or
numpy
.
prod
(
xsh
)
==
0
))):
try
:
assert
all
(
f
(
xv
)
==
zv
.
shape
)
assert
all
(
f
(
xv
)
==
zv
.
shape
)
except
NotImplementedError
:
# GpuCAReduce don't implement all cases when size is 0
assert
xv
.
size
==
0
def
test_perform
(
self
):
def
test_perform
(
self
):
for
dtype
in
[
"floatX"
,
"complex64"
,
"complex128"
,
"int8"
,
"uint8"
]:
for
dtype
in
[
"floatX"
,
"complex64"
,
"complex128"
,
"int8"
,
"uint8"
]:
...
@@ -487,30 +498,19 @@ class test_CAReduce(unittest_tools.InferShapeTester):
...
@@ -487,30 +498,19 @@ class test_CAReduce(unittest_tools.InferShapeTester):
self
.
with_linker
(
gof
.
CLinker
(),
scalar
.
maximum
,
dtype
=
dtype
,
self
.
with_linker
(
gof
.
CLinker
(),
scalar
.
maximum
,
dtype
=
dtype
,
test_nan
=
True
)
test_nan
=
True
)
def
test_infer_shape
(
self
):
def
test_infer_shape
(
self
,
dtype
=
None
):
for
xsh
,
tosum
in
[((
5
,
6
),
None
),
if
dtype
is
None
:
((
5
,
6
),
(
0
,
1
)),
((
5
,
6
),
(
0
,
)),
((
5
,
6
),
(
1
,
)),
((
5
,
6
),
(
-
1
,
)),
((
5
,
6
),
(
-
2
,
)),
((
2
,
3
,
4
,
5
),
(
0
,
1
,
3
)),
((
2
,
3
,
4
,
5
),
(
-
2
,
-
3
)),
((
5
,
0
),
None
),
((
5
,
0
),
(
0
,
)),
((
5
,
0
),
(
1
,
)),
((
5
,
6
),
()),
((
5
,
0
),
()),
((),
None
),
((),
())]:
dtype
=
theano
.
config
.
floatX
dtype
=
theano
.
config
.
floatX
for
xsh
,
tosum
in
self
.
cases
:
x
=
TensorType
(
dtype
,
[(
entry
==
1
)
for
entry
in
xsh
])(
'x'
)
x
=
TensorType
(
dtype
,
[(
entry
==
1
)
for
entry
in
xsh
])(
'x'
)
if
tosum
is
None
:
if
tosum
is
None
:
tosum
=
range
(
len
(
xsh
))
tosum
=
range
(
len
(
xsh
))
xv
=
numpy
.
asarray
(
numpy
.
random
.
rand
(
*
xsh
),
dtype
=
dtype
)
xv
=
numpy
.
asarray
(
numpy
.
random
.
rand
(
*
xsh
),
dtype
=
dtype
)
self
.
_compile_and_check
([
x
],
self
.
_compile_and_check
([
x
],
[
self
.
op
(
scalar
.
add
,
axis
=
tosum
)(
x
)],
[
self
.
op
(
scalar
.
add
,
axis
=
tosum
)(
x
)],
[
xv
],
self
.
op
,
[
"local_cut_useless_reduce"
])
[
xv
],
self
.
op
,
[
"local_cut_useless_reduce"
],
warn
=
0
not
in
xsh
)
class
test_Prod
(
unittest
.
TestCase
):
class
test_Prod
(
unittest
.
TestCase
):
...
...
theano/tensor/type.py
浏览文件 @
06cdb1fa
...
@@ -611,6 +611,25 @@ theano.compile.register_view_op_c_code(
...
@@ -611,6 +611,25 @@ theano.compile.register_view_op_c_code(
"""
,
"""
,
version
=
1
)
version
=
1
)
# Register TensorType C code for Shape Op.
theano
.
compile
.
register_shape_c_code
(
TensorType
,
"""
npy_intp shape[] = {PyArray_NDIM(
%(iname)
s)};
if(
%(oname)
s == NULL || (PyArray_DIMS(
%(oname)
s)[0] != shape[0]))
{
Py_XDECREF(
%(oname)
s);
%(oname)
s = (PyArrayObject*) PyArray_SimpleNew(1, shape, NPY_INT64);
}
for(int i=0;i<shape[0];i++)
{
((npy_int64*)PyArray_GETPTR1(
%(oname)
s, i))[0] = PyArray_DIMS(
%(iname)
s)[i];
}
"""
,
version
=
1
)
# Register TensorType C code for ViewOp.
# Register TensorType C code for ViewOp.
theano
.
compile
.
register_shape_i_c_code
(
theano
.
compile
.
register_shape_i_c_code
(
TensorType
,
TensorType
,
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论