Merge pull request #1710 from nouiz/gpu_reduce

new Gpu reduce based on the version in Theano that is much faster.

Merge pull request #1710 from nouiz/gpu_reduce
06cdb1fa · abergeron · 262f59a2 · 1121fe45 · 06cdb1fa · 06cdb1fa
--- a/doc/extending/ctype.txt
+++ b/doc/extending/ctype.txt
@@ -553,3 +553,20 @@ respectively. See an example for the type ``CudaNdarrayType`` (GPU
 array) in the file `theano/sandbox/cuda/type.py`. The version
 parameter is what is returned by ViewOp.c_code_cache_version(). By
 default, it will recompile the c code for each process.
+Shape and Shape_i
+=================
+We have 2 generic Ops, Shape and Shape_i. Shape returns the shape of any
+Theano Variable that has a shape attribute, and Shape_i returns only one
+of the elements of the shape.
+.. code-block:: python
+   theano.compile.ops.register_shape_c_code(YOUR_TYPE_CLASS, THE_C_CODE, version=())
+   theano.compile.ops.register_shape_i_c_code(YOUR_TYPE_CLASS, THE_C_CODE, version=())
+The C code works the same way as for the ViewOp. Shape_i has the additional
+i parameter that you can use with %(i)s.
--- a/theano/compile/__init__.py
+++ b/theano/compile/__init__.py
 from theano.compile.ops import (
        DeepCopyOp, deep_copy_op, register_deep_copy_op_c_code,
+        Shape, shape, register_shape_c_code,
        Shape_i, register_shape_i_c_code,
        ViewOp, view_op, register_view_op_c_code)

--- a/theano/compile/ops.py
+++ b/theano/compile/ops.py
@@ -181,6 +181,94 @@ class DeepCopyOp(gof.Op):
 deep_copy_op = DeepCopyOp()
+def register_shape_c_code(type, code, version=()):
+    """ Tell Shape Op how to generate C code for a Theano Type
+    :param type: A Theano type. It must be the Theano class itself and not an
+                 instance of the class.
+    :param code: C code that returns the shape of a variable of the Theano
+                 type 'type'. Use %(iname)s and %(oname)s for the input and
+                 output C variable names respectively.
+    :param version: A number indicating the version of the code, for cache.
+    """
+    Shape.c_code_and_version[type] = (code, version)
+class Shape(gof.Op):
+    """
+    L{Op} to return the shape of a matrix.
+    @note: Non-differentiable.
+    """
+    # Mapping from Type to C code (and version) to use.
+    # In the C code, the name of the input variable is %(iname)s,
+    # the output variable is %(oname)s.
+    c_code_and_version = {}
+    def __hash__(self):
+        return hash(type(self))
+    def __eq__(self, other):
+        return type(self) == type(other)
+    def __str__(self):
+        return self.__class__.__name__
+    def make_node(self, x):
+        # Must work for all types that have a shape attribute.
+        # For other types, this will fail at execution time.
+        if not isinstance(x, theano.Variable):
+            x = theano.tensor.as_tensor_variable(x)
+        return gof.Apply(self, [x], [theano.tensor.lvector()])
+    def perform(self, node, inp, out_):
+        x, = inp
+        out, = out_
+        out[0] = theano._asarray(x.shape, dtype='int64')
+    def infer_shape(self, node, in_shapes):
+        return [[len(in_shapes[0])]]
+    def connection_pattern(self, node):
+        # the grad returns the gradient with respect to the
+        # elements of a tensor variable
+        # the elements of the tensor variable do not participate
+        # in the computation of the shape, so they are not really
+        # part of the graph
+        return [[False]]
+    def grad(self, inp, grads):
+        # the grad returns the gradient with respect to the
+        # elements of a tensor variable
+        # the elements of the tensor variable do not participate
+        # in the computation of the shape, so they are not really
+        # part of the graph
+        return [DisconnectedType()()]
+    def R_op(self, inputs, eval_points):
+        return [None]
+    def c_code(self, node, name, inames, onames, sub):
+        iname, = inames
+        oname, = onames
+        fail = sub['fail']
+        itype = node.inputs[0].type.__class__
+        if itype in self.c_code_and_version:
+            code, version = self.c_code_and_version[itype]
+            return code % locals()
+        # Else, no C code
+        return super(Shape, self).c_code(node, name, inames, onames, sub)
+    def c_code_cache_version(self):
+        return (1,)
+shape = Shape()
+_shape = shape  # was used in the past, now use shape directly.
+#pprint.assign(_shape, printing.MemberPrinter('shape'))
 class Shape_i(gof.Op):
    """
    L{Op} to return the shape of a matrix.

--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -523,21 +523,21 @@ class GpuConv(GpuOp):
            imshp=None,
            max_threads_dim0=None):
        """
-        :param version: each version of c_code implement many kernel for the
+        :param version: each version of c_code implements many kernels for the
                        convolution. By default we try to guess the best one.
                        You can force one version with this parameter. This
                        parameter is used by the tests.
        :param verbose: for value of 1,2 and 3. Print more information during
                        the execution of the convolution. Mostly used for
                        optimization or debugging.
-        :param kshp:    The size of the kernel. If provided, can genera
+        :param kshp:    The size of the kernel. If provided, can generate
                        faster code. If the GpuConv op is automatically
                        inserted,
                        we take its value automatically from the Conv op.
        :param imshp:   The size of the image. Not used for code generation but
-                        allow to select an experimental new version in another
+                        allows selecting an experimental new version in another
                        repo.
-        :param max_threads_dim0: The maximum number of thread for the
+        :param max_threads_dim0: The maximum number of threads for the
                        block size dimensions 0 (blockDim.x) used by the
                        GPU function.

--- a/theano/sandbox/cuda/conv.cu
+++ b/theano/sandbox/cuda/conv.cu
-// REMEMBER TO RAISE c_code_cache_version when changing this file
+// REMEMBER TO INCREASE c_code_cache_version when changing this file
 //
 enum { ConvMode_FULL, ConvMode_VALID };
 PyObject * CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray * out, const int mode, const int subsample_rows, const int subsample_cols, const int version, const int verbose);

--- a/theano/sandbox/cuda/conv_kernel.cu
+++ b/theano/sandbox/cuda/conv_kernel.cu
-// REMEMBER TO RAISE c_code_cache_version when changing this file
+// REMEMBER TO INCREASE c_code_cache_version when changing this file
 //
 //implement the valid convolution only

--- a/theano/sandbox/gpuarray/basic_ops.py
+++ b/theano/sandbox/gpuarray/basic_ops.py
@@ -654,15 +654,6 @@ class GpuAlloc(HideC, Alloc):
 gpu_alloc = GpuAlloc()
-class GpuShape(HideC, tensor.Shape):
-    """
-    Implement Shape on the gpu.
-    """
-    def make_node(self, x):
-        return Apply(self, [x], [tensor.lvector()])
-gpu_shape = GpuShape()
 class GpuReshape(HideC, tensor.Reshape):
    """
    Implement Reshape on the gpu.

--- a/theano/sandbox/gpuarray/elemwise.py
+++ b/theano/sandbox/gpuarray/elemwise.py
--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
@@ -14,9 +14,7 @@ from theano.sandbox.gpuarray.type import GpuArrayType
 from theano.sandbox.gpuarray.basic_ops import (host_from_gpu,
                                               gpu_from_host,
                                               gpu_alloc,
-                                               gpu_shape,
                                               GpuAlloc,
-                                               GpuShape,
                                               GpuReshape,
                                               GpuEye)
 from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm
@@ -24,7 +22,7 @@ from theano.sandbox.gpuarray.conv import GpuConv
 from theano.sandbox.gpuarray.nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
                                          GpuCrossentropySoftmax1HotWithBiasDx)
 from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
-                                              GpuDimShuffle, GpuCAReduce)
+                                              GpuDimShuffle, GpuCAReduceCuda)
 from theano.sandbox.gpuarray.subtensor import GpuIncSubtensor, GpuSubtensor
 from theano.sandbox.gpuarray.type import GpuArrayConstant
@@ -249,9 +247,57 @@ def local_gpua_incsubtensor(node):
 def local_gpua_careduce(node):
    if (isinstance(node.op.scalar_op, scalar.basic.Add) or
        isinstance(node.op.scalar_op, scalar.basic.Mul)):
-        return GpuCAReduce(node.op.scalar_op, axis=node.op.axis,
+        x, = node.inputs
-                           dtype=getattr(node.op, 'dtype', None),
+        greduce = GpuCAReduceCuda(node.op.scalar_op, axis=node.op.axis)
-                           acc_dtype=getattr(node.op, 'acc_dtype', None))
+        if x.dtype != "float32":
+            return
+        gvar = greduce(x)
+        # make_node must have been called first, otherwise the mask can
+        # be None
+        if gvar.owner.op.supports_c_code([gpu_from_host(x)]):
+            return greduce
+        else:
+            # Try to make a simpler pattern based on reshaping
+            # The principle is that if two adjacent dimensions have
+            # the same value in the reduce_mask, then we can reshape
+            # to make them a single dimension, do the reduction, and
+            # then reshape to get them back.
+            if node.op.axis is None:
+                reduce_mask = [1] * x.type.ndim
+            else:
+                reduce_mask = [0] * x.type.ndim
+                for a in node.op.axis:
+                    assert reduce_mask[a] == 0
+                    reduce_mask[a] = 1
+            shape_of = node.fgraph.shape_feature.shape_of
+            x_shape = shape_of[x]
+            new_in_shp = [x_shape[0]]
+            new_mask = [reduce_mask[0]]
+            for i in xrange(1, x.type.ndim):
+                if reduce_mask[i] == reduce_mask[i - 1]:
+                    new_in_shp[-1] *= x_shape[i]
+                else:
+                    new_mask.append(reduce_mask[i])
+                    new_in_shp.append(x_shape[i])
+            new_greduce = GpuCAReduceCuda(new_mask, scalar_op)
+            reshaped_x = x.reshape(tensor.stack(*new_in_shp))
+            gpu_reshaped_x = gpu_from_host(reshaped_x)
+            reshaped_gpu_inputs = [gpu_reshaped_x]
+            if new_greduce.supports_c_code(reshaped_gpu_inputs):
+                reduce_reshaped_x = host_from_gpu(
+                    new_greduce(gpu_reshaped_x))
+                if reduce_reshaped_x.ndim != node.outputs[0].ndim:
+                    unreshaped_reduce = reduce_reshaped_x.reshape(
+                        tensor.stack(*shape_of[node.outputs[0]]))
+                else:
+                    unreshaped_reduce = reduce_reshaped_x
+                return [unreshaped_reduce]
 @register_opt()
@@ -296,20 +342,6 @@ def local_gpua_crossentropysoftmax1hotwithbiasdx(node):
    return GpuCrossentropySoftmax1HotWithBiasDx()
-@register_opt()
-@local_optimizer([tensor.Shape])
-def local_gpua_shape(node):
-    """
-    Can't use op_lifter as the output is on the GPU.
-    """
-    if isinstance(node.op, tensor.Shape):
-        x, = node.inputs
-        if x.owner and x.owner.op == host_from_gpu:
-            gpu_x, = x.owner.inputs
-            return [gpu_shape(gpu_x)]
-    return False
 @register_opt()
 @op_lifter([gpu_from_host, ConvOp])
 def local_gpu_conv(node):

--- a/theano/sandbox/gpuarray/tests/test_basic_ops.py
+++ b/theano/sandbox/gpuarray/tests/test_basic_ops.py
@@ -36,7 +36,7 @@ from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host,
                                               gpu_alloc, gpu_from_cuda,
                                               cuda_from_gpu, HostFromGpu,
                                               GpuFromHost, GpuReshape,
-                                               GpuEye, GpuShape)
+                                               GpuEye)
 from theano.tests import unittest_tools as utt
 utt.seed_rng()
@@ -307,7 +307,7 @@ def test_shape():
    topo = f.maker.fgraph.toposort()
    assert numpy.all(f(v) == (3, 4, 5))
    assert len(topo) == 1
-    assert isinstance(topo[0].op, GpuShape)
+    assert isinstance(topo[0].op, T.Shape)
 class G_reshape(T_reshape):

--- a/theano/sandbox/gpuarray/tests/test_elemwise.py
+++ b/theano/sandbox/gpuarray/tests/test_elemwise.py
@@ -10,7 +10,7 @@ from theano.tensor.tests.test_elemwise import (test_Broadcast, test_DimShuffle,
 from theano.sandbox.gpuarray.tests.test_basic_ops import rand_gpuarray
 from theano.sandbox.gpuarray.elemwise import (GpuElemwise, GpuDimShuffle,
-                                              GpuCAReduce)
+                                              GpuCAReduceCuda, GpuCAReduceCPY)
 from theano.sandbox.gpuarray.type import GpuArrayType
 from pygpu.array import gpuarray
@@ -37,10 +37,11 @@ class test_gpu_Broadcast(test_Broadcast):
 class test_GpuDimShuffle(test_DimShuffle):
    op = GpuDimShuffle
-class test_GpuCAReduce(test_CAReduce):
+class test_GpuCAReduceCPY(test_CAReduce):
    dtypes = ["float32"]
    bin_dtypes = ["uint8", "int8"]
-    op = GpuCAReduce
+    op = GpuCAReduceCPY
    reds = [scalar.add, scalar.mul]
    def test_perform(self):
@@ -64,3 +65,87 @@ class test_GpuCAReduce(test_CAReduce):
            for op in self.reds:
                self.with_linker(gof.CLinker(), op, dtype=dtype,
                                 test_nan=True)
+    def test_infer_shape(self):
+        for dtype in self.dtypes:
+            test_CAReduce.test_infer_shape(self, dtype)
+class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
+    dtypes = ["float32"]
+    bin_dtypes = ["uint8", "int8"]
+    bin_dtypes = []
+    cases = [((5, 6), None),
+             ((5, 6), (0, 1)),
+             ((5, 6), (0, )),
+             ((5, 6), (1, )),
+             ((5, 6), (-1, )),
+             ((5, 6), (-2, )),
+             #((5, 6), ()),  #reduce on no axis(copy) isn't implemented
+             #((2, 3, 4, 5), (0, 1, 3)), mask 1101 isn't implemented
+             #((2, 3, 4, 5), (-2, -3)), mask 0110 isn't implemented
+             ((5, 0), None),
+             ((5, 0), (0, )),
+             ((5, 0), (1, )),
+             #((5, 0), ()), reduce on no axis isn't implemented
+             #((), None), reduce on no axis isn't implemented
+             #((), ()) reduce on no axis isn't implemented
+             #Test all GPU cases implemented
+             ((1,0),(1,)),
+             ((0,1),(1,)),
+             ((0,0),(1,)),
+             ((0,0,0),(1,2)),
+             ((0,0,0,0),(1,2,3)),
+             ((2,1),(1,)),
+             ((1,2),(1,)),
+             ((100,3,1300),[1]),
+             ((0,),[0]),((5,),[0]),
+             ((0,0),[0,1]),((1,0),[0,1]),((5,4),[0,1]),((33,31),[0,1]),((5,4),[1]),((5,4),[0]),#need something bigger then 32 for some opt test.
+             ((5,4,3),[0]),((5,4,3),[1]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[1,2]),((5,4,3),[0,1,2]),
+             ((0,0,0,0),[0,1,2,3]),
+             ((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
+                               #test shapes bigger than 4096 on each dimension to make sure that we work correctly when we don't have enough threads/blocks in each dimension
+             ((4100,3),[0]),((3,4101),[0]),#10
+             ((1024,33),[0]),((33,1024),[0]),#10
+             ((1025,33),[0]),((33,1025),[0]),#10
+             ((4100,3),[1]),((3,4101),[1]),#01
+             ((1024,33),[1]),((33,1024),[1]),#01
+             ((1025,33),[1]),((33,1025),[1]),#01
+             ((4100,3),[0,1]),((3,4101),[0,1]),#11
+             ((1024,33),[0,1]),((33,1024),[0,1]),#01
+             ((1025,33),[0,1]),((33,1025),[0,1]),#01
+             ((4100,4,3),[0]),((5,4100,3),[0]),((5,4,4100),[0]), ((3,65536,1), [0]),#100
+             ((4100,4,3),[1]),((5,4100,3),[1]),((5,4,4100),[1]),#010
+             ((4100,4,3),[2]),((5,4100,3),[2]),((5,4,4100),[2]),#001
+             ((4100,4,3),[0,1]),((5,4100,3),[0,1]),((5,4,4100),[0,1]),#110
+             ((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
+             #((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
+             ((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
+             ((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
+             ((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101
+             ((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
+             ((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
+             ((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),((128,1,3,3), [0,1,2,3]),#1111
+             #test pattern implemented by reshape
+#             ((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1000
+#             ((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100
+#             ((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010
+#             ((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
+#             ((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111
+#             ((5,4,3,10,11),[1,2]),
+        ]
+    op = GpuCAReduceCuda
+    reds = [scalar.add, scalar.mul]
+    def test_perform(self):
+        return
+    def test_perform_nan(self):
+        return
--- a/theano/sandbox/gpuarray/tests/test_opt.py
+++ b/theano/sandbox/gpuarray/tests/test_opt.py
@@ -3,7 +3,7 @@ import numpy
 import theano
 from theano.tests import unittest_tools as utt
 from theano.sandbox.gpuarray.basic_ops import GpuAlloc, GpuReshape, gpu_alloc
-from theano.sandbox.gpuarray.elemwise import GpuCAReduce
+from theano.sandbox.gpuarray.elemwise import GpuCAReduceCuda
 import theano.sandbox.gpuarray
 from theano.tests.unittest_tools import SkipTest
@@ -69,8 +69,8 @@ def test_sum_prod():
        res = f(val)
        utt.assert_allclose(res, val.sum())
        assert res.shape == ()
-        assert GpuCAReduce in [type(node.op)
+        assert GpuCAReduceCuda in [type(node.op)
-                               for node in f.maker.fgraph.toposort()]
+                                   for node in f.maker.fgraph.toposort()]
 def test_local_gpualloc_memset_0():

--- a/theano/sandbox/gpuarray/type.py
+++ b/theano/sandbox/gpuarray/type.py
@@ -298,6 +298,23 @@ theano.compile.register_view_op_c_code(GpuArrayType, """
    Py_XINCREF(%(oname)s);
 """, version=(0,))
+# Register GpuArrayType C code for Shape Op.
+theano.compile.register_shape_c_code(
+    GpuArrayType,
+    """
+    npy_intp shape[] = {%(iname)s->ga.nd};
+    if(%(oname)s == NULL || (PyArray_DIMS(%(oname)s)[0] != shape[0]))
+    {
+        Py_XDECREF(%(oname)s);
+        %(oname)s = (PyArrayObject*) PyArray_SimpleNew(1, shape, NPY_INT64);
+    }
+    for(int i=0;i<shape[0];i++)
+    {
+        ((npy_int64*)PyArray_GETPTR1(%(oname)s, i))[0] = %(iname)s->ga.dimensions[i];
+    }
+    """,
+    version=1)
 theano.compile.register_shape_i_c_code(GpuArrayType, """
    if(!%(oname)s)
        %(oname)s=(PyArrayObject*)PyArray_ZEROS(0, NULL, NPY_INT64, 0);

--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -25,6 +25,8 @@ from theano.gof.python25 import partial, any, all
 from theano.gof.utils import hashtype
 from theano import compile, printing
 from theano.printing import pprint, min_informative_str
+from theano.compile import Shape, shape  #For history
 # We use these exceptions as well.
 import theano.scalar.sharedvar
@@ -1125,83 +1127,6 @@ def cast(x, dtype):
 ##########################
-class Shape(Op):
-    """
-    L{Op} to return the shape of a matrix.
-    @note: Non-differentiable.
-    """
-    def __hash__(self):
-        return hash(type(self))
-    def __eq__(self, other):
-        return type(self) == type(other)
-    def __str__(self):
-        return self.__class__.__name__
-    def make_node(self, x):
-        # Must work for all type that have a shape attribute.
-        # This will fail at execution time.
-        x = as_tensor_variable(x)
-        # Each type variable should implement their .shape attribute
-        # and have the fct infer_shape() implemented in the op that convert
-        # the type to TensorVariable to have the optimization working
-        # correctly.
-        return Apply(self, [x], [lvector()])
-    def perform(self, node, inp, out_):
-        x, = inp
-        out, = out_
-        out[0] = theano._asarray(x.shape, dtype='int64')
-    def infer_shape(self, node, in_shapes):
-        return [[len(in_shapes[0])]]
-    def connection_pattern(self, node):
-        # the grad returns the gradient with respect to the
-        # elements of a tensor variable
-        # the elements of the tensor variable do not participate
-        # in the computation of the shape, so they are not really
-        # part of the graph
-        return [[False]]
-    def grad(self, inp, grads):
-        # the grad returns the gradient with respect to the
-        # elements of a tensor variable
-        # the elements of the tensor variable do not participate
-        # in the computation of the shape, so they are not really
-        # part of the graph
-        return [DisconnectedType()()]
-    def R_op(self, inputs, eval_points):
-        return [None]
-    def c_code(self, node, nodename, inp, out, sub):
-        x, = inp
-        z, = out
-        if isinstance(node.inputs[0].type, TensorType):
-            return """
-            npy_intp shape[] = {PyArray_NDIM(%(x)s)};
-            if(%(z)s == NULL || (PyArray_DIMS(%(z)s)[0] != shape[0]))
-            {
-                Py_XDECREF(%(z)s);
-                %(z)s = (PyArrayObject*) PyArray_SimpleNew(1, shape, NPY_INT64);
-            }
-            for(int i=0;i<shape[0];i++)
-            {
-                ((npy_int64*)PyArray_GETPTR1(%(z)s, i))[0] = PyArray_DIMS(%(x)s)[i];
-            }
-            """ % locals()
-        else:
-            #TODO: if your type is not listed here, make a damn registry of
-            #      shape_i ops for various types of variables.
-            #      Do not continue this madness.
-            return super(Shape, self).c_code(node, nodename, (x,), (out,), sub)
-    def c_code_cache_version(self):
-        return (1,)
 @constructor
 def old_shape(a):
    """
@@ -1223,10 +1148,6 @@ def old_shape(a):
        # a tuple directly.  This tuple is like the numpy.ndarray.shape tuple.
        return va.type.shape
-shape = Shape()
-_shape = shape  # was used in the past, now use shape directly.
-pprint.assign(_shape, printing.MemberPrinter('shape'))
 class SpecifyShape(Op):
    """

--- a/theano/tensor/nnet/nnet.py
+++ b/theano/tensor/nnet/nnet.py
@@ -1398,7 +1398,7 @@ def _check_rows_is_arange_len_labels(rows, labels):
            shape_subtensor = stop.owner
            if list(shape_subtensor.op.idx_list) == [0]:
                shape_var, = shape_subtensor.inputs
-                if shape_var.owner and shape_var.owner.op == tensor._shape:
+                if shape_var.owner and shape_var.owner.op == tensor.shape:
                    return shape_var.owner.inputs[0] is labels
        else:
            shape_of = stop.owner.fgraph.shape_feature.shape_of

--- a/theano/tensor/tests/test_elemwise.py
+++ b/theano/tensor/tests/test_elemwise.py
@@ -284,24 +284,26 @@ class test_Broadcast(unittest.TestCase):
 class test_CAReduce(unittest_tools.InferShapeTester):
    op = CAReduce
+    cases = [((5, 6), None),
+             ((5, 6), (0, 1)),
+             ((5, 6), (0, )),
+             ((5, 6), (1, )),
+             ((5, 6), (-1, )),
+             ((5, 6), (-2, )),
+             ((5, 6), ()),
+             ((2, 3, 4, 5), (0, 1, 3)),
+             ((2, 3, 4, 5), (-2, -3)),
+             ((5, 0), None),
+             ((5, 0), (0, )),
+             ((5, 0), (1, )),
+             ((5, 0), ()),
+             ((), None),
+             ((), ())
+         ]
    def with_linker(self, linker, scalar_op=scalar.add, dtype="floatX",
                    test_nan=False, tensor_op=None):
-        for xsh, tosum in [((5, 6), None),
+        for xsh, tosum in self.cases:
-                           ((5, 6), (0, 1)),
-                           ((5, 6), (0, )),
-                           ((5, 6), (1, )),
-                           ((5, 6), (-1, )),
-                           ((5, 6), (-2, )),
-                           ((5, 6), ()),
-                           ((2, 3, 4, 5), (0, 1, 3)),
-                           ((2, 3, 4, 5), (-2, -3)),
-                           ((5, 0), None),
-                           ((5, 0), (0, )),
-                           ((5, 0), (1, )),
-                           ((5, 0), ()),
-                           ((), None),
-                           ((), ())]:
            if dtype == "floatX":
                dtype = theano.config.floatX
            x = TensorType(dtype, [(entry == 1) for entry in xsh])('x')
@@ -400,29 +402,38 @@ class test_CAReduce(unittest_tools.InferShapeTester):
                if scalar_op in [scalar.and_, scalar.or_]:
                    zv = numpy.asarray(zv, dtype='int8')
                if test_nan:
-                    self.assertTrue(theano.tensor.TensorType.values_eq(f(xv),
+                    try:
-                                                                       zv),
+                        self.assertTrue(
-                                    (f(xv), zv))
+                            theano.tensor.TensorType.values_eq(f(xv), zv),
+                            (f(xv), zv))
+                    except NotImplementedError:
+                        # GpuCAReduce doesn't implement all cases when size is 0
+                        assert xv.size == 0
                else:
-                    f_xv = f(xv)
+                    try:
-                    self.assertTrue((f_xv.shape == zv.shape), (f_xv, zv))
+                        f_xv = f(xv)
-                    self.assertTrue(numpy.allclose(f_xv, zv), (f_xv, zv))
+                        self.assertTrue((f_xv.shape == zv.shape), (f_xv, zv))
+                        self.assertTrue(numpy.allclose(f_xv, zv), (f_xv, zv))
+                    except NotImplementedError:
+                        # GpuCAReduce doesn't implement all cases when size is 0
+                        assert xv.size == 0
-            #test CAReduce.infer_shape
+            x = TensorType(dtype, [(entry == 1) for entry in xsh])('x')
-            #the Shape op don't implement c_code!
+            if tensor_op is None:
-            if isinstance(linker, gof.PerformLinker):
+                e = self.op(scalar_op, axis=tosum)(x)
-                x = TensorType(dtype, [(entry == 1) for entry in xsh])('x')
+            else:
-                if tensor_op is None:
+                e = tensor_op(x, axis=tosum)
-                    e = self.op(scalar_op, axis=tosum)(x)
+            if tosum is None:
-                else:
+                tosum = range(len(xsh))
-                    e = tensor_op(x, axis=tosum)
+            f = copy(linker).accept(FunctionGraph([x],
-                if tosum is None:
+                                                  [e.shape])).make_function()
-                    tosum = range(len(xsh))
+            if not(scalar_op in [scalar.maximum, scalar.minimum] and
-                f = copy(linker).accept(FunctionGraph([x],
+                   ((xsh == () or numpy.prod(xsh) == 0))):
-                     [e.shape])).make_function()
+                try:
-                if not(scalar_op in [scalar.maximum, scalar.minimum] and
-                       ((xsh == () or numpy.prod(xsh) == 0))):
                    assert all(f(xv) == zv.shape)
+                except NotImplementedError:
+                    # GpuCAReduce doesn't implement all cases when size is 0
+                    assert xv.size == 0
    def test_perform(self):
        for dtype in ["floatX", "complex64", "complex128", "int8", "uint8"]:
@@ -487,30 +498,19 @@ class test_CAReduce(unittest_tools.InferShapeTester):
            self.with_linker(gof.CLinker(), scalar.maximum, dtype=dtype,
                             test_nan=True)
-    def test_infer_shape(self):
+    def test_infer_shape(self, dtype=None):
-        for xsh, tosum in [((5, 6), None),
+        if dtype is None:
-                           ((5, 6), (0, 1)),
-                           ((5, 6), (0, )),
-                           ((5, 6), (1, )),
-                           ((5, 6), (-1, )),
-                           ((5, 6), (-2, )),
-                           ((2, 3, 4, 5), (0, 1, 3)),
-                           ((2, 3, 4, 5), (-2, -3)),
-                           ((5, 0), None),
-                           ((5, 0), (0, )),
-                           ((5, 0), (1, )),
-                           ((5, 6), ()),
-                           ((5, 0), ()),
-                           ((), None),
-                           ((), ())]:
            dtype = theano.config.floatX
+        for xsh, tosum in self.cases:
            x = TensorType(dtype, [(entry == 1) for entry in xsh])('x')
            if tosum is None:
                tosum = range(len(xsh))
            xv = numpy.asarray(numpy.random.rand(*xsh), dtype=dtype)
            self._compile_and_check([x],
-                            [self.op(scalar.add, axis=tosum)(x)],
+                                    [self.op(scalar.add, axis=tosum)(x)],
-                            [xv], self.op, ["local_cut_useless_reduce"])
+                                    [xv], self.op,
+                                    ["local_cut_useless_reduce"],
+                                    warn=0 not in xsh)
 class test_Prod(unittest.TestCase):

--- a/theano/tensor/type.py
+++ b/theano/tensor/type.py
@@ -611,6 +611,25 @@ theano.compile.register_view_op_c_code(
        """,
        version=1)
+# Register TensorType C code for Shape Op.
+theano.compile.register_shape_c_code(
+    TensorType,
+    """
+    npy_intp shape[] = {PyArray_NDIM(%(iname)s)};
+    if(%(oname)s == NULL || (PyArray_DIMS(%(oname)s)[0] != shape[0]))
+    {
+        Py_XDECREF(%(oname)s);
+        %(oname)s = (PyArrayObject*) PyArray_SimpleNew(1, shape, NPY_INT64);
+    }
+    for(int i=0;i<shape[0];i++)
+    {
+        ((npy_int64*)PyArray_GETPTR1(%(oname)s, i))[0] = PyArray_DIMS(%(iname)s)[i];
+    }
+    """,
+    version=1)
 # Register TensorType C code for ViewOp.
 theano.compile.register_shape_i_c_code(
        TensorType,