Merge pull request #1835 from nouiz/gpureduce

Gpureduce: support multiple dtypes, prod, max and min

Merge pull request #1835 from nouiz/gpureduce
9ad79667 · abergeron · 43a86c9e · 254dd8b7 · 9ad79667 · 9ad79667
--- a/theano/compile/pfunc.py
+++ b/theano/compile/pfunc.py
@@ -364,8 +364,7 @@ def pfunc(params, outputs=None, mode=None, updates=None, givens=None,
    that are neither in "updates" nor in "no_default_updates".

    :type name: None or string
-    :param name: attaches a name to the Profiling result of this function when
-    using ProfileMode (will be deprecated).
+    :param name: attaches a name to the profiling result of this function.

    :type allow_input_downcast: Boolean
    :param allow_input_downcast: True means that the values passed as

--- a/theano/gof/link.py
+++ b/theano/gof/link.py
@@ -258,7 +258,7 @@ class Container(object):
        """WRITEME

        :Parameters:
-         `r`: a variable
+         `r`: a Variable or a Type
         `storage`: a list of length 1, whose element is the value for `r`
         `readonly`: True indicates that this should not be setable by Function[r] = val
         `strict`: if True, we don't allow type casting.

--- a/theano/misc/check_blas.py
+++ b/theano/misc/check_blas.py
@@ -215,7 +215,7 @@ if __name__ == "__main__":
        C1060                                                0.46s

        GTX Titan(D15U-50)0.06s  0.06s  don't work
-        GTX 680                  0.12s  0.154s               0.218s
+        GTX 680           0.11s  0.12s  0.154s               0.218s
        GTX 580           0.16s  0.16s  0.164s               0.203s
        GTX 480           0.19s  0.19s  0.192s               0.237s 0.27s
        GTX 470           0.23s  0.23s  0.238s               0.297s 0.34s

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -442,7 +442,7 @@ def local_gpu_lazy_ifelse(node):


 @register_opt()
-@local_optimizer([gpu_from_host, tensor.blas._dot22])
+@local_optimizer([gpu_from_host, tensor.blas.Dot22])
 def local_gpu_dot22(node):
    """
    gpu_from_host(dot22) -> gpudot(gpu_from_host)
@@ -465,7 +465,7 @@ def local_gpu_dot22(node):


 @register_opt()
-@local_optimizer([gpu_from_host, tensor.blas._dot22scalar])
+@local_optimizer([gpu_from_host, tensor.blas.Dot22Scalar])
 def local_gpu_dot22scalar(node):
    """
    gpu_from_host(dot22scalar) -> gpudot(gpu_from_host)
@@ -571,7 +571,7 @@ def local_gpu_ger(node):


 @register_opt()
-@local_optimizer([tensor.blas.gemm_no_inplace, gpu_from_host])
+@local_optimizer([tensor.blas.Gemm, gpu_from_host])
 def local_gpu_gemm(node):
    """
    gpu_from_host(gemm) -> gpu_gemm(gpu_from_host)

--- a/theano/sandbox/gpuarray/elemwise.py
+++ b/theano/sandbox/gpuarray/elemwise.py
@@ -3,11 +3,13 @@ from itertools import izip
 from StringIO import StringIO

 import numpy
-from theano import Op, Apply, scalar, config
+
+import theano
+from theano import Apply, scalar, config
 from theano import scalar as scal
 from theano.scalar import Scalar
 from theano.tensor.elemwise import (Elemwise, DimShuffle,
-                                    CAReduce, CAReduceDtype)
+                                    CAReduceDtype)
 from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler

 try:
@@ -74,12 +76,8 @@ class GpuElemwise(HideC, Elemwise):

        # Try to generate the kernel to catch SupportCodeErrors
        try:
-            inps = [make_argument(i, 'i%d' % (n,)) for n, i in
-                    enumerate(node.inputs)]
            scal_ins = [scalar.get_scalar_type(i.dtype) for i in node.inputs]

-            outs = [make_argument(o, 'o%d' % (n,)) for n, o in
-                    enumerate(node.outputs) if not n in self.inplace_pattern]
            scal_out = [scalar.get_scalar_type(o.dtype) for o in node.outputs]

            fake_node = Apply(self.scalar_op, [i() for i in scal_ins],
@@ -402,7 +400,7 @@ class GpuElemwise(HideC, Elemwise):
                param.append("PyGpuArray_DIMS(%(name)s)[%(i)d] == 1 ? 0 : PyGpuArray_STRIDES(%(name)s)[%(i)d]" % locals())
        code += ',\n'.join(param) + ");\n"
        if config.gpuarray.sync:
-            code += "GpuArray_sync(&%(zz)s->ga);\n" % dict(zz=zz)
+            code += "GpuArray_sync(&%(z)s->ga);\n" % dict(z=z)
        return str(code)

    def perform(self, node, inputs, output_storage):
@@ -540,7 +538,7 @@ class GpuDimShuffle(HideC, DimShuffle):
        return (4,)


-class GpuCAReduceCuda(HideC, CAReduce):
+class GpuCAReduceCuda(HideC, CAReduceDtype):
    """GpuCAReduceCuda is a Reduction along some dimensions by a scalar op.

    The dimensions along which to reduce is specified by the
@@ -575,7 +573,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
    """

    def __init__(self, scalar_op, axis=None,
-                 reduce_mask=None):
+                 reduce_mask=None, dtype=None, acc_dtype=None):
        if reduce_mask is not None:
            reduce_mask = tuple(reduce_mask)
        self.reduce_mask = reduce_mask
@@ -583,20 +581,23 @@ class GpuCAReduceCuda(HideC, CAReduce):
        # used to make sure that calls to scalar op
        # have unique name arguments
        self._n_scalar_op_calls = 0
-        if not hasattr(scalar_op, 'identity'):
-            raise ValueError("No identity on scalar op")
-        CAReduce.__init__(self, scalar_op, axis=axis)
+        CAReduceDtype.__init__(self, scalar_op, axis=axis,
+                               dtype=dtype, acc_dtype=acc_dtype)

    def __eq__(self, other):
        return (type(self) == type(other) and
                self.axis == other.axis and
                self.reduce_mask == other.reduce_mask and
+                self.dtype == other.dtype and
+                self.acc_dtype == other.acc_dtype and
                self.scalar_op == other.scalar_op)

    def __hash__(self):
        return (hash(type(self)) ^
                hash(self.axis) ^
                hash(self.reduce_mask) ^
+                hash(self.dtype) ^
+                hash(self.acc_dtype) ^
                hash(type(self.scalar_op)))

    def __str__(self):
@@ -607,7 +608,6 @@ class GpuCAReduceCuda(HideC, CAReduce):

    def make_node(self, x):
        x = as_gpuarray_variable(x)
-        assert x.dtype == "float32"
        ret = super(GpuCAReduceCuda, self).make_node(x)
        self = copy.copy(self)
        self.axis = ret.op.axis
@@ -623,7 +623,7 @@ class GpuCAReduceCuda(HideC, CAReduce):

        if (x.type.ndim != len(self.reduce_mask)):
            raise TypeError("x must have rank %i" % len(self.reduce_mask))
-        return Apply(self, [x], [GpuArrayType(x.dtype,
+        return Apply(self, [x], [GpuArrayType(ret.outputs[0].dtype,
                                              ret.outputs[0].type.broadcastable)()])

    """
@@ -693,7 +693,8 @@ class GpuCAReduceCuda(HideC, CAReduce):

        nd_in = node.inputs[0].type.ndim
        nd_out = node.outputs[0].type.ndim
-
+        in_dtype = "npy_" + node.inputs[0].dtype
+        out_dtype = "npy_" + node.outputs[0].dtype
        assert nd_in - nd_out == sum(self.reduce_mask)

        sio = StringIO()
@@ -757,7 +758,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
            if not self.reduce_mask[i]:
                print >> sio, 'new_dims[%(j)s] = PyGpuArray_DIMS(%(x)s)[%(i)s];' % locals()
                j += 1
-        out_typecode = dtype_to_typecode(node.outputs[0].dtype)
+        out_typecode = dtype_to_typecode(out_dtype[4:])
        print >> sio, """
            Py_XDECREF(%(z)s);
            %(z)s = pygpu_empty(%(nd_out)s, new_dims,
@@ -775,7 +776,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
        # \begin bracket the reduction in a check that there is
        # actually work to do
        if getattr(self.scalar_op, 'identity', None) == 0:
-            zero_shp = "cudaMemset((float *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), 0, PyGpuArray_SIZE(%(z)s) * sizeof(float))" % locals()
+            zero_shp = "cudaMemset((%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), 0, PyGpuArray_SIZE(%(z)s) * sizeof(%(out_dtype)s))" % locals()
        #TODO: elif getattr(self.scalar_op, 'identity', None) == 1:
        else:
            scalar_op = self.scalar_op
@@ -827,20 +828,20 @@ class GpuCAReduceCuda(HideC, CAReduce):

                if (verbose)
                    printf("running kernel_reduce_10_%(name)s\\n");
-                int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
+                int n_shared = sizeof(%(acc_dtype)s) * n_threads.x * n_threads.y * n_threads.z;
                kernel_reduce_10_%(name)s<<<n_blocks, n_threads,
                                                n_shared>>>(
                        PyGpuArray_DIMS(%(x)s)[0],
                        PyGpuArray_DIMS(%(x)s)[1],
-                        (float *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
-                        PyGpuArray_STRIDES(%(x)s)[0]/4,
-                        PyGpuArray_STRIDES(%(x)s)[1]/4,
-                        (float *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
-                        PyGpuArray_STRIDES(%(z)s)[0]/4
+                        (%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
+                        PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s),
+                        PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s),
+                        (%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
+                        PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s)
                        );
                [
        if config.gpuarray.sync:
-            code += "GpuArray_sync(&%(zz)s->ga);\n" % dict(zz=zz)
+            code += "GpuArray_sync(&%(z)s->ga);\n" % dict(z=z)
                ]
                if (cudaSuccess != cudaGetLastError())
                {
@@ -848,6 +849,9 @@ class GpuCAReduceCuda(HideC, CAReduce):
                    %(fail)s;
                }
        """
+        in_dtype = "npy_" + node.inputs[0].dtype
+        out_dtype = "npy_" + node.outputs[0].dtype
+        acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
        sio = StringIO()
        if pattern is None:
            pattern = ''.join(str(c) for c in self.reduce_mask)
@@ -860,7 +864,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
        print >> sio, """
            if (verbose)
                printf("running kernel_reduce_%(pattern)s_%(name)s\\n");
-            int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
+            int n_shared = sizeof(%(acc_dtype)s) * n_threads.x * n_threads.y * n_threads.z;
            if (verbose>1)
                printf("n_threads.x=%%d, n_threads.y=%%d, n_threads.z=%%d,"
                       " nb_threads=%%d, n_blocks.x=%%d, n_blocks.y=%%d,"
@@ -876,18 +880,18 @@ class GpuCAReduceCuda(HideC, CAReduce):
                    PyGpuArray_DIMS(%(x)s)[%(i)s],
            """ % locals()
        print >> sio, """
-                    (float *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset)
+                    (%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset)
            """ % locals()
        for i in xrange(ndim):
            print >> sio, """
-                    ,PyGpuArray_STRIDES(%(x)s)[%(i)s]/4
+                    ,PyGpuArray_STRIDES(%(x)s)[%(i)s]/sizeof(%(in_dtype)s)
            """ % locals()
        print >> sio, """
-                    ,(float *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset)
+                    ,(%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset)
            """ % locals()
        for i in xrange(nd_out):
            print >> sio, """
-                    ,PyGpuArray_STRIDES(%(z)s)[%(i)s]/4
+                    ,PyGpuArray_STRIDES(%(z)s)[%(i)s]/sizeof(%(out_dtype)s)
            """ % locals()
        sync = ""
        if config.gpuarray.sync:
@@ -927,17 +931,19 @@ class GpuCAReduceCuda(HideC, CAReduce):
                    const int d0,
                    const int d1,
                    const int d2,
-                    const float *A,
+                    const %(in_dtype)s *A,
                    const int sA0,
                    const int sA1,
                    const int sA2,
-                    float * Z,
+                    %(out_dtype)s * Z,
                    const int sZ0)

            Since the nodename is unique, we don't need to put the name
            of the scalar_op in here.

        """
+        in_dtype = "npy_" + node.inputs[0].dtype
+        out_dtype = "npy_" + node.outputs[0].dtype
        if reduce_mask is None:
            reduce_mask = self.reduce_mask
        if ndim is None:
@@ -954,14 +960,14 @@ class GpuCAReduceCuda(HideC, CAReduce):
                    const int d%(i)s,
        """ % locals()
        print >> sio, """
-                    const float *A,
+                    const %(in_dtype)s *A,
        """ % locals()
        for i in xrange(ndim):
            print >> sio, """
                    const int sA%(i)s,
        """ % locals()
        print >> sio, """
-                    float * Z
+                    %(out_dtype)s * Z
        """ % locals()
        for i in xrange(ndim - sum(reduce_mask)):
            print >> sio, """
@@ -970,13 +976,15 @@ class GpuCAReduceCuda(HideC, CAReduce):
        print >> sio, ")"
        return sio.getvalue()

-    def _k_init(self, *args):
+    def _k_init(self, node, nodename):
+        acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
+
        return """
                const int threadCount = blockDim.x * blockDim.y * blockDim.z;
                const int threadNum = threadIdx.z * blockDim.x * blockDim.y
                + threadIdx.y * blockDim.x + threadIdx.x;
-                extern __shared__ float buf[];
-                float myresult = 0.0f;
+                extern __shared__ %(acc_dtype)s buf[];
+                %(acc_dtype)s myresult = 0;

                //This is caught in cuda/init.py when we init the gpu. I keep
                //it here to ease finding code that rely on this.
@@ -986,7 +994,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
                    return;
                }

-        """
+        """ % locals()

    def _assign_init(self, first_item):
        """
@@ -1016,11 +1024,11 @@ class GpuCAReduceCuda(HideC, CAReduce):
            result to left."""

        x, = node.inputs
+        in_dtype = x.dtype
+        out_dtype = node.outputs[0].dtype

-        dtype = x.dtype
-
-        dummy_left = Scalar(dtype=dtype)()
-        dummy_right = Scalar(dtype=dtype)()
+        dummy_left = Scalar(dtype=out_dtype)()
+        dummy_right = Scalar(dtype=in_dtype)()

        dummy_node = self.scalar_op.make_node(dummy_left, dummy_right)

@@ -1037,6 +1045,9 @@ class GpuCAReduceCuda(HideC, CAReduce):
        node, name, sub: these should be passed through from the original
        call to c_code
        """
+        in_dtype = "npy_" + node.inputs[0].dtype
+        out_dtype = "npy_" + node.outputs[0].dtype
+        acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)

        # This code (the code in new_version) is currently ignored.
        # Code produced later in this function is returned instead.
@@ -1052,7 +1063,8 @@ class GpuCAReduceCuda(HideC, CAReduce):
        {
            int idx = threadNum - (threadCount >> 1) * 2;"""

-        new_version += self._assign_reduce(node, name, 'buf[idx]','buf[threadNum]', sub)
+        new_version += self._assign_reduce(node, name, 'buf[idx]',
+                                           'buf[threadNum]', sub)

        new_version += """
        }
@@ -1068,7 +1080,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
            if (threadNum < halfPoint)
            {
              // Get the shared value stored by another thread
-              float temp = buf[threadNum + halfPoint];
+              %(acc_dtype)s temp = buf[threadNum + halfPoint];
              """

        new_version += self._assign_reduce(node, name,
@@ -1116,6 +1128,8 @@ class GpuCAReduceCuda(HideC, CAReduce):
                                                   'buf[threadNum]',
                                                   'buf[threadNum+%d]' % num,
                                                   sub)
+            current_version += """
+            """
        current_version += """
                if (threadNum == 0)
                {
@@ -1134,6 +1148,8 @@ class GpuCAReduceCuda(HideC, CAReduce):
                                    'buf[threadNum]','buf[threadNum+%d]' % num,
                                    sub)
            current_version += this_if
+            current_version += """
+            """
        current_version += """
                if (threadNum == 0)
                {
@@ -1175,8 +1191,10 @@ class GpuCAReduceCuda(HideC, CAReduce):
        is for the case where we are reducing on all axes and x is
        C contiguous.
        """
+        in_dtype = "npy_" + node.inputs[0].dtype
+        out_dtype = "npy_" + node.outputs[0].dtype
        if getattr(self.scalar_op, 'identity', None) == 0:
-            zero_shp = "cudaMemset((float *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), 0, PyGpuArray_SIZE(%(z)s) * sizeof(float))" % locals()
+            zero_shp = "cudaMemset((%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset), 0, PyGpuArray_SIZE(%(z)s) * sizeof(%(out_dtype)s))" % locals()
        #TODO: elif getattr(self.scalar_op, 'identity', None) == 1:
        else:
            zero_shp = """
@@ -1185,6 +1203,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
            %(fail)s;
            """ % locals()

+        acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
        sync = ""
        if config.gpuarray.sync:
            sync = """GpuArray_sync(&%(z)s->ga);""" % locals()
@@ -1202,11 +1221,11 @@ class GpuCAReduceCuda(HideC, CAReduce):
                                " n_threads.x=%%d, size=%%d, ndim=%%d\\n",
                                n_threads.x,PyGpuArray_SIZE(%(x)s),
                                PyGpuArray_NDIM(%(x)s));
-            int n_shared = sizeof(float) * n_threads.x;
+            int n_shared = sizeof(%(acc_dtype)s) * n_threads.x;
            kernel_reduce_ccontig_%(name)s<<<n_blocks, n_threads, n_shared>>>(
                    PyGpuArray_SIZE(%(x)s),
-                    (float *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
-                    (float *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset));
+                    (%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
+                    (%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset));
            %(sync)s
            cudaError_t sts = cudaGetLastError();
            if (cudaSuccess != sts)
@@ -1265,12 +1284,14 @@ class GpuCAReduceCuda(HideC, CAReduce):
        """

        assert N in [1, 2, 3]
+        in_dtype = "npy_" + node.inputs[0].dtype
+        out_dtype = "npy_" + node.outputs[0].dtype
        makecall = self._makecall(node, name, x, z, fail)
        N_pattern = ''.join(['1'] * N)
        param_dim = ",".join(["PyGpuArray_DIMS(%s)[%d]" % (x, i)
                              for i in xrange(N + 1)])
-        strides_dim = ",".join(["PyGpuArray_STRIDES(%s)[%d]/4"
-                                % (x, i) for i in xrange(N + 1)])
+        strides_dim = ",".join(["PyGpuArray_STRIDES(%s)[%d]/sizeof(%s)"
+                                % (x, i, in_dtype) for i in xrange(N + 1)])

        threads_y = """
            //get as many y threads as we can fit
@@ -1326,6 +1347,9 @@ class GpuCAReduceCuda(HideC, CAReduce):
        self.c_code_reduce_01X(sio, node, name, x, z, fail, 3)

    def c_code_reduce_10(self, sio, node, name, x, z, fail):
+        in_dtype = "npy_" + node.inputs[0].dtype
+        out_dtype = "npy_" + node.outputs[0].dtype
+        acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
        sync = ""
        if config.gpuarray.sync:
            sync = """GpuArray_sync(&%(z)s->ga);""" % locals()
@@ -1345,18 +1369,18 @@ class GpuCAReduceCuda(HideC, CAReduce):
                n_blocks.y);
            }
            assert( PyGpuArray_DIMS(%(x)s)[1] == PyGpuArray_DIMS(%(z)s)[0]);
-            int n_shared = sizeof(float) * n_threads.x;
+            int n_shared = sizeof(%(acc_dtype)s) * n_threads.x;
            kernel_reduce_010_%(name)s<<<n_blocks, n_threads, n_shared>>>(
                    1,
                    PyGpuArray_DIMS(%(x)s)[0],
                    PyGpuArray_DIMS(%(x)s)[1],
-                    (float *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
+                    (%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
                    1,
-                    PyGpuArray_STRIDES(%(x)s)[0]/4,
-                    PyGpuArray_STRIDES(%(x)s)[1]/4,
-                    (float *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
+                    PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s),
+                    PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s),
+                    (%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
                    1,
-                    PyGpuArray_STRIDES(%(z)s)[0]/4
+                    PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s)
                    );
            %(sync)s
            cudaError_t sts = cudaGetLastError();
@@ -1382,6 +1406,8 @@ class GpuCAReduceCuda(HideC, CAReduce):
        makecall_inner = self._makecall(node, name, x, z, fail,
                                        pattern="010_inner")
        pattern = ''.join(str(i) for i in self.reduce_mask)
+        in_dtype = "npy_" + node.inputs[0].dtype
+        out_dtype = "npy_" + node.outputs[0].dtype
        sync = ""
        if config.gpuarray.sync:
            sync = """GpuArray_sync(&%(z)s->ga);""" % locals()
@@ -1421,13 +1447,13 @@ class GpuCAReduceCuda(HideC, CAReduce):
                int n_shared = 0;
                kernel_reduce_010_AD_%(name)s<<<n_blocks, n_threads, n_shared>>>(
                A,B,C,D,
-                        (float *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
-                        PyGpuArray_STRIDES(%(x)s)[0]/4,
-                        PyGpuArray_STRIDES(%(x)s)[1]/4,
-                        PyGpuArray_STRIDES(%(x)s)[2]/4,
-                        (float *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
-                        PyGpuArray_STRIDES(%(z)s)[0]/4,
-                        PyGpuArray_STRIDES(%(z)s)[1]/4
+                        (%(in_dtype)s *)(((char *)cuda_get_ptr(%(x)s->ga.data))+%(x)s->ga.offset),
+                        PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s),
+                        PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s),
+                        PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s),
+                        (%(out_dtype)s *)(((char *)cuda_get_ptr(%(z)s->ga.data))+%(z)s->ga.offset),
+                        PyGpuArray_STRIDES(%(z)s)[0]/sizeof(%(out_dtype)s),
+                        PyGpuArray_STRIDES(%(z)s)[1]/sizeof(%(out_dtype)s)
                        );
                %(sync)s
                cudaError_t sts = cudaGetLastError();
@@ -1464,10 +1490,10 @@ class GpuCAReduceCuda(HideC, CAReduce):
                                  (size_t)n_threads.x),
                      (size_t)(4096 / n_blocks.x)
                      );
-                if(std::min(std::min(PyGpuArray_STRIDES(%(x)s)[0]/4,
-                                     PyGpuArray_STRIDES(%(x)s)[1]/4),
-                            PyGpuArray_STRIDES(%(x)s)[2]/4)
-                   ==PyGpuArray_STRIDES(%(x)s)[2]/4
+                if(std::min(std::min(PyGpuArray_STRIDES(%(x)s)[0]/sizeof(%(in_dtype)s),
+                                     PyGpuArray_STRIDES(%(x)s)[1]/sizeof(%(in_dtype)s)),
+                            PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s))
+                   ==PyGpuArray_STRIDES(%(x)s)[2]/sizeof(%(in_dtype)s)
                  && n_blocks.y==ceil_intdiv(PyGpuArray_DIMS(%(x)s)[2],
                                             (size_t)n_threads.x)){
                  if(verbose>1)
@@ -1623,6 +1649,9 @@ class GpuCAReduceCuda(HideC, CAReduce):

    def c_code_reduce_0011(self, sio, node, name, x, z, fail):
        makecall = self._makecall(node, name, x, z, fail)
+        in_dtype = "npy_" + node.inputs[0].dtype
+        out_dtype = "npy_" + node.outputs[0].dtype
+        acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
        print >> sio, """
        {
            int verbose = 0;
@@ -1642,7 +1671,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
                             (size_t) 256));
            while (n_threads.x * n_threads.y <= 256
                   && n_threads.y < PyGpuArray_DIMS(%(x)s)[2]
-                   && n_threads.x * n_threads.y * sizeof(float) <=(15*1024-200))
+                   && n_threads.x * n_threads.y * sizeof(%(acc_dtype)s) <=(15*1024-200))
            {
                n_threads.y += 1;
            }
@@ -1711,7 +1740,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
        """ % locals()

    def c_code_cache_version_apply(self, node):
-        version = [9]  # the version corresponding to the c code in this Op
+        version = [11]  # the version corresponding to the c code in this Op

        # now we insert versions for the ops on which we depend...
        scalar_node = Apply(self.scalar_op,
@@ -1728,6 +1757,10 @@ class GpuCAReduceCuda(HideC, CAReduce):
    def c_support_code_apply(self, node, nodename):
        sio = StringIO()
        nd_in = len(self.reduce_mask)
+        in_dtype = "npy_" + node.inputs[0].dtype
+        out_dtype = "npy_" + node.outputs[0].dtype
+        acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
+
        if all(i == 1 for i in self.reduce_mask):
            #this kernel is ok for up to a few thousand elements, but
            # it only runs on ONE multiprocessor
@@ -1739,13 +1772,13 @@ class GpuCAReduceCuda(HideC, CAReduce):
            print >> sio, """
            static __global__ void kernel_reduce_ccontig_%(nodename)s(
                    const unsigned int d0,
-                    const float *A,
-                    float * Z)
+                    const %(in_dtype)s *A,
+                    %(out_dtype)s * Z)
            {
                const int threadCount = blockDim.x;
                const int threadNum = threadIdx.x;
-                extern __shared__ float buf[];
-                float myresult = %(reduce_init)s;
+                extern __shared__ %(acc_dtype)s buf[];
+                %(acc_dtype)s myresult = %(reduce_init)s;

                if (warpSize != 32)
                {
@@ -1770,13 +1803,13 @@ class GpuCAReduceCuda(HideC, CAReduce):
            print >> sio, """
            static __global__ void kernel_reduce_1_%(nodename)s(
                    const unsigned int d0,
-                    const float *A, const int sA0,
-                    float * Z)
+                    const %(in_dtype)s *A, const int sA0,
+                    %(out_dtype)s * Z)
            {
                const int threadCount = blockDim.x;
                const int threadNum = threadIdx.x;
-                extern __shared__ float buf[];
-                float myresult = %(reduce_init)s;
+                extern __shared__ %(acc_dtype)s buf[];
+                %(acc_dtype)s myresult = %(reduce_init)s;

                if (warpSize != 32)
                {
@@ -1803,13 +1836,13 @@ class GpuCAReduceCuda(HideC, CAReduce):
            static __global__ void kernel_reduce_11_%(nodename)s(
                    const int d0,
                    const int d1,
-                    const float *A, const int sA0, const int sA1,
-                    float * Z)
+                    const %(in_dtype)s *A, const int sA0, const int sA1,
+                    %(out_dtype)s * Z)
            {
                const int threadCount = blockDim.x * blockDim.y;
                const int threadNum = threadIdx.y*blockDim.x + threadIdx.x;
-                extern __shared__ float buf[];
-                float myresult = %(reduce_init)s;
+                extern __shared__ %(acc_dtype)s buf[];
+                %(acc_dtype)s myresult = %(reduce_init)s;

                if (warpSize != 32)
                {
@@ -1915,13 +1948,13 @@ class GpuCAReduceCuda(HideC, CAReduce):
                    const int d0,
                    const int d1,
                    const int d2,
-                    const float *A, const int sA0,
+                    const %(in_dtype)s *A, const int sA0,
                    const int sA1, const int sA2,
-                    float * Z, const int sZ0, const int sZ1)
+                    %(out_dtype)s * Z, const int sZ0, const int sZ1)
            {
                const int threadCount = blockDim.x;
                const int threadNum = threadIdx.x;
-                extern __shared__ float buf[];
+                extern __shared__ %(acc_dtype)s buf[];

                if (warpSize != 32)
                {
@@ -1933,7 +1966,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
                {
                    for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
                    {
-                        float myresult = %(reduce_init)s;
+                        %(acc_dtype)s myresult = %(reduce_init)s;
                        for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
                        {
                            %(reduce_fct)s;
@@ -1956,13 +1989,13 @@ class GpuCAReduceCuda(HideC, CAReduce):
                    const int C,
                    const int D,
                    //const int E, // THIS is 32
-                    const float *X, const int sX0,
+                    const %(in_dtype)s *X, const int sX0,
                    const int sX1, const int sX2,
-                    float * Z, const int sZ0, const int sZ1)
+                    %(out_dtype)s * Z, const int sZ0, const int sZ1)
            {
                const int threadCount = blockDim.x;
                const int threadNum = threadIdx.x;
-                float myresult = 0.0f;
+                %(acc_dtype)s myresult = 0;

                if (warpSize != 32)
                {
@@ -2050,14 +2083,14 @@ class GpuCAReduceCuda(HideC, CAReduce):
                    const int d0,
                    const int d1,
                    const int d2,
-                    const float *A, const int sA0,
+                    const %(in_dtype)s *A, const int sA0,
                    const int sA1, const int sA2,
-                    float * Z, const int sZ0)
+                    %(out_dtype)s * Z, const int sZ0)
            {
                const int threadCount = blockDim.x * blockDim.y;
                const int threadNum = threadIdx.y * blockDim.x + threadIdx.x;
-                extern __shared__ float buf[];
-                float myresult = %(reduce_init)s;
+                extern __shared__ %(acc_dtype)s buf[];
+                %(acc_dtype)s myresult = %(reduce_init)s;

                if (warpSize != 32)
                {
@@ -2145,13 +2178,13 @@ class GpuCAReduceCuda(HideC, CAReduce):
                    const int d0,
                    const int d1,
                    const int d2,
-                    const float *A, const int sA0,
+                    const %(in_dtype)s *A, const int sA0,
                    const int sA1, const int sA2,
-                    float * Z, const int sZ0, const int sZ1)
+                    %(out_dtype)s * Z, const int sZ0, const int sZ1)
            {
                const int threadCount = blockDim.x;
                const int threadNum = threadIdx.x;
-                extern __shared__ float buf[];
+                extern __shared__ %(acc_dtype)s buf[];

                if (warpSize != 32)
                {
@@ -2162,7 +2195,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
                {
                    for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
                    {
-                        float myresult = %(reduce_init)s;
+                        %(acc_dtype)s myresult = %(reduce_init)s;
                        for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x)
                        {
                            %(reduce_fct)s;
@@ -2192,7 +2225,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
                {
                    for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y)
                    {
-                        float myresult = %(reduce_init)s;
+                        %(acc_dtype)s myresult = %(reduce_init)s;
                    for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
                    {
                        for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
@@ -2225,7 +2258,7 @@ class GpuCAReduceCuda(HideC, CAReduce):
                {
                    for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
                    {
-                        float myresult = %(reduce_init)s;
+                        %(acc_dtype)s myresult = %(reduce_init)s;
                    for (int i1 = threadIdx.y; i1 < d1; i1 += blockDim.y)
                    {
                        for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
@@ -2279,14 +2312,14 @@ class GpuCAReduceCuda(HideC, CAReduce):
                    const unsigned int d1,
                    const unsigned int d2,
                    const unsigned int d3,
-                    const float *A, const int sA0, const int sA1,
+                    const %(in_dtype)s *A, const int sA0, const int sA1,
                    const int sA2, const int sA3,
-                    float * Z, const int sZ0)
+                    %(out_dtype)s * Z, const int sZ0)
            {
                const int threadCount = blockDim.x * blockDim.y * blockDim.z;
                const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
-                extern __shared__ float buf[];
-                float myresult = %(reduce_init)s;
+                extern __shared__ %(acc_dtype)s buf[];
+                %(acc_dtype)s myresult = %(reduce_init)s;

                if (warpSize != 32)
                {

--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
@@ -344,14 +344,15 @@ def local_gpua_advanced_incsubtensor(node):


 @register_opt()
-@op_lifter([tensor.CAReduce, tensor.Sum])
+@op_lifter([tensor.CAReduce, tensor.Sum, tensor.elemwise.Prod])
 def local_gpua_careduce(node):
-    if (isinstance(node.op.scalar_op, scalar.basic.Add) or
-        isinstance(node.op.scalar_op, scalar.basic.Mul)):
+    if isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul,
+                                      scalar.Maximum, scalar.Minimum)):
        x, = node.inputs
-        greduce = GpuCAReduceCuda(node.op.scalar_op, axis=node.op.axis)
-        if x.dtype != "float32":
-            return
+        greduce = GpuCAReduceCuda(
+            node.op.scalar_op, axis=node.op.axis,
+            dtype=getattr(node.op, 'dtype', None),
+            acc_dtype=getattr(node.op, 'acc_dtype', None))
        gvar = greduce(x)
        #We need to have the make node called, otherwise the mask can
        #be None
@@ -384,10 +385,21 @@ def local_gpua_careduce(node):
                else:
                    new_mask.append(reduce_mask[i])
                    new_in_shp.append(x_shape[i])
+            new_axis = []
+            for idx, m in enumerate(new_mask):
+                if m == 1:
+                    new_axis.append(idx)
+            new_greduce = GpuCAReduceCuda(
+                node.op.scalar_op,
+                axis=new_axis, reduce_mask=new_mask,
+                dtype=getattr(node.op, 'dtype', None),
+                acc_dtype=getattr(node.op, 'acc_dtype', None))

-            new_greduce = GpuCAReduceCuda(new_mask, scalar_op)
            reshaped_x = x.reshape(tensor.stack(*new_in_shp))
            gpu_reshaped_x = gpu_from_host(reshaped_x)
+            gvar = greduce(gpu_reshaped_x)
+            #We need to have the make node called, otherwise the mask can
+            #be None
            reshaped_gpu_inputs = [gpu_reshaped_x]
            if new_greduce.supports_c_code(reshaped_gpu_inputs):
                reduce_reshaped_x = host_from_gpu(

--- a/theano/sandbox/gpuarray/tests/test_elemwise.py
+++ b/theano/sandbox/gpuarray/tests/test_elemwise.py
@@ -2,9 +2,10 @@ from theano import scalar, gof
 from theano.gof.python25 import all, any

 from theano.tensor.tests.test_elemwise import (test_Broadcast, test_DimShuffle,
-                                               test_CAReduce)
+                                               test_CAReduce, T_reduce_dtype)

-from theano.sandbox.gpuarray.tests.test_basic_ops import rand_gpuarray
+from theano.sandbox.gpuarray.tests.test_basic_ops import (mode_with_gpu,
+                                                          rand_gpuarray)
 from theano.sandbox.gpuarray.elemwise import (GpuElemwise, GpuDimShuffle,
                                              GpuCAReduceCuda, GpuCAReduceCPY)
 from theano.sandbox.gpuarray.type import GpuArrayType
@@ -47,6 +48,8 @@ class test_GpuCAReduceCPY(test_CAReduce):

    def test_perform_nan(self):
        for dtype in self.dtypes:
+            if not dtype.startswith('float'):
+                continue
            for op in self.reds:
                self.with_linker(gof.PerformLinker(), op, dtype=dtype,
                                 test_nan=True)
@@ -58,6 +61,8 @@ class test_GpuCAReduceCPY(test_CAReduce):

    def test_c_nan(self):
        for dtype in self.dtypes:
+            if not dtype.startswith('float'):
+                continue
            for op in self.reds:
                self.with_linker(gof.CLinker(), op, dtype=dtype,
                                 test_nan=True)
@@ -68,9 +73,9 @@ class test_GpuCAReduceCPY(test_CAReduce):


 class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
-    dtypes = ["float32"]
+    dtypes = ["float32", "int64"]
    bin_dtypes = ["uint8", "int8"]
-    bin_dtypes = []
+
    cases = [((5, 6), None),
             ((5, 6), (0, 1)),
             ((5, 6), (0, )),
@@ -129,9 +134,10 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
             ((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
             ((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
             ((65,4,3,2),[1,2,3]),((4,65,3,2),[1,2,3]),((4,3,65,2),[1,2,3]),((4,3,2,65),[1,2,3]),#0111
-             ((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),((128,1,3,3), [0,1,2,3]),#1111
+             ((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),((128,1,2,3), [0,1,2,3]),#1111

             #test pattern implemented by reshape
+             #Skip them as this tests the op directly, not the optimization with reshape
 #             ((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1000
 #             ((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100
 #             ((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010
@@ -140,10 +146,18 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
 #             ((5,4,3,10,11),[1,2]),
        ]
    op = GpuCAReduceCuda
-    reds = [scalar.add, scalar.mul]
+    reds = [scalar.add, scalar.mul,
+            scalar.maximum, scalar.minimum]

    def test_perform(self):
        return

    def test_perform_nan(self):
        return
+
+
+class T_gpureduce_dtype(T_reduce_dtype):
+    mode = mode_with_gpu.excluding('local_cut_useless_reduce')
+    op = GpuCAReduceCuda
+    #Currently we don't support reduction over an empty axis list (axis=[]); 1 replaces it below
+    axes = [None, 0, 1, 1, [0], [1], [0, 1]]
--- a/theano/sandbox/gpuarray/tests/test_opt.py
+++ b/theano/sandbox/gpuarray/tests/test_opt.py
@@ -46,16 +46,18 @@ def test_flatten():
                          for node in f.maker.fgraph.toposort()]


-def test_sum_prod():
-    for method in ['sum']:
+def test_reduce():
+    for method in ['sum', 'prod', 'max', 'min']:
        m = theano.tensor.fmatrix()
-        f = theano.function([m], getattr(m, method)(), mode=mode_with_gpu)
+        f = theano.function([m], getattr(m, method)(axis=0),
+                            mode=mode_with_gpu)
        val = numpy.random.rand(10, 11).astype("float32")
        res = f(val)
-        utt.assert_allclose(res, val.sum())
-        assert res.shape == ()
+        utt.assert_allclose(res, getattr(val, method)(axis=0))
+        assert res.shape == (11,)
+        topo = f.maker.fgraph.toposort()
        assert GpuCAReduceCuda in [type(node.op)
-                                   for node in f.maker.fgraph.toposort()]
+                                   for node in topo], topo


 def test_local_gpualloc_memset_0():

--- a/theano/scalar/basic.py
+++ b/theano/scalar/basic.py
@@ -2335,7 +2335,10 @@ class Expm1(UnaryScalarOp):
    def c_code(self, node, name, (x, ), (z, ), sub):
        if node.inputs[0].type in complex_types:
            raise NotImplementedError('type not supported', type)
-        return "%(z)s = exp(%(x)s) - 1;" % locals()
+        return "%(z)s = expm1(%(x)s);" % locals()
+
+    def c_code_cache_version(self):
+        return (5,)
 expm1 = Expm1(upgrade_to_float, name='expm1')



--- a/theano/tensor/tests/test_elemwise.py
+++ b/theano/tensor/tests/test_elemwise.py
@@ -716,39 +716,47 @@ class test_IsInf_IsNan(unittest.TestCase):
        return self.run_isfunc('isnan')


-class T_sum_dtype(unittest.TestCase):
-    def test_sum_default_dtype(self):
+class T_reduce_dtype(unittest.TestCase):
+    mode = theano.compile.get_default_mode().excluding(
+        'local_cut_useless_reduce')
+    op = CAReduce
+    axes = [None, 0, 1, [], [0], [1], [0, 1]]
+    methods = ['sum', 'prod']
+
+    def test_reduce_default_dtype(self):
        """
-        Test the default dtype of a sum().
+        Test the default dtype of each reduction method in self.methods.
        """
        # We try multiple axis combinations even though axis should not matter.
-        axes = [None, 0, 1, [], [0], [1], [0, 1]]
-        for idx, dtype in enumerate(imap(str, theano.scalar.all_types)):
-            axis = axes[idx % len(axes)]
-            x = tensor.matrix(dtype=dtype)
-            s = x.sum(axis=axis)
-            assert s.dtype == dict(
+        for method in self.methods:
+            for idx, dtype in enumerate(imap(str, theano.scalar.all_types)):
+                axis = self.axes[idx % len(self.axes)]
+                x = tensor.matrix(dtype=dtype)
+                s = getattr(x, method)(axis=axis)
+                assert s.dtype == dict(
                    int8='int64',
                    int16='int64',
                    int32='int64',
                    uint8='uint64',
                    uint16='uint64',
                    uint32='uint64',
-                    ).get(dtype, dtype)
-            f = theano.function([x], s)
-            data = numpy.random.rand(3, 4) * 10
-            data = data.astype(dtype)
-            f(data)
+                ).get(dtype, dtype)
+                f = theano.function([x], s, mode=self.mode)
+                topo = f.maker.fgraph.toposort()
+                assert [n for n in topo if isinstance(n.op, self.op)], (topo, dtype)
+                data = numpy.random.rand(3, 4) * 10
+                data = data.astype(dtype)
+                f(data)

-    def test_sum_default_acc_dtype(self):
-        ##Test the default acc_dtype of a sum().
+    def test_reduce_default_acc_dtype(self):
+        ##Test the default acc_dtype of a reduce().
        # We try multiple axis combinations even though axis should not matter.
-        axes = [None, 0, 1, [], [0], [1], [0, 1]]
-        for idx, dtype in enumerate(imap(str, theano.scalar.all_types)):
-            axis = axes[idx % len(axes)]
-            x = tensor.matrix(dtype=dtype)
-            s = x.sum(axis=axis)
-            assert s.owner.op.acc_dtype == dict(
+        for method in self.methods:
+            for idx, dtype in enumerate(imap(str, theano.scalar.all_types)):
+                axis = self.axes[idx % len(self.axes)]
+                x = tensor.matrix(dtype=dtype)
+                s = getattr(x, method)(axis=axis)
+                assert s.owner.op.acc_dtype == dict(
                    int8='int64',
                    int16='int64',
                    int32='int64',
@@ -757,91 +765,102 @@ class T_sum_dtype(unittest.TestCase):
                    uint32='uint64',
                    float32='float64',
                    complex64='complex128',
-                    ).get(dtype, dtype)
-            f = theano.function([x], s)
-            data = numpy.random.rand(3, 4) * 10
-            data = data.astype(dtype)
-            f(data)
+                ).get(dtype, dtype)
+                f = theano.function([x], s, mode=self.mode)
+                topo = f.maker.fgraph.toposort()
+                assert [n for n in topo if isinstance(n.op, self.op)], (topo, dtype)
+                data = numpy.random.rand(3, 4) * 10
+                data = data.astype(dtype)
+                f(data)

    @attr('slow')
-    def test_sum_custom_dtype(self):
+    def test_reduce_custom_dtype(self):
        """
-        Test the ability to provide your own output dtype for a sum.
+        Test the ability to provide your own output dtype for a reduce.
        """
        # We try multiple axis combinations even though axis should not matter.
-        axes = [None, 0, 1, [], [0], [1], [0, 1]]
        idx = 0
-        for input_dtype in imap(str, theano.scalar.all_types):
-            x = tensor.matrix(dtype=input_dtype)
-            for output_dtype in imap(str, theano.scalar.all_types):
-                # If the output is a complex, the gradient of the sum will
+        for method in self.methods:
+            for input_dtype in imap(str, theano.scalar.all_types):
+                x = tensor.matrix(dtype=input_dtype)
+                for output_dtype in imap(str, theano.scalar.all_types):
+                # If the output is a complex, the gradient of the reduce will
                # cast the complex to the input dtype. We can't call the normal
                # cast on a complex to a not complex as this is ambiguous.
-                if (not input_dtype.startswith('complex') and
-                    output_dtype.startswith('complex')):
-                    continue
+                    if (not input_dtype.startswith('complex') and
+                        output_dtype.startswith('complex')):
+                        continue

-                axis = axes[idx % len(axes)]
-                sum_var = x.sum(dtype=output_dtype, axis=axis)
-                assert sum_var.dtype == output_dtype
+                    axis = self.axes[idx % len(self.axes)]
+                    var = getattr(x, method)(dtype=output_dtype, axis=axis)
+                    assert var.dtype == output_dtype

-                f = theano.function([x], sum_var)
-                data = numpy.random.rand(3, 4) * 10
-                data = data.astype(input_dtype)
-                f(data)
-                if "complex" in input_dtype:
-                    continue
-                # Check that we can take the gradient
-                tensor.grad(sum_var.sum(), x,
-                            disconnected_inputs='ignore')
-                idx += 1
+                    f = theano.function([x], var, mode=self.mode)
+                    topo = f.maker.fgraph.toposort()
+                    assert [n for n in topo if isinstance(n.op, self.op)], (topo, dtype)
+                    data = numpy.random.rand(3, 4) * 10
+                    data = data.astype(input_dtype)
+                    f(data)
+                    if "complex" in input_dtype:
+                        continue
+                    # Check that we can take the gradient
+                    tensor.grad(var.sum(), x,
+                                disconnected_inputs='ignore')
+                    idx += 1

-    def test_sum_custom_acc_dtype(self):
+    def test_reduce_custom_acc_dtype(self):
        """
-        Test the ability to provide your own accumulator dtype for a sum.
+        Test the ability to provide your own accumulator dtype for a reduce.
        """
        # We try multiple axis combinations even though axis should not matter.
-        axes = [None, 0, 1, [], [0], [1], [0, 1]]
        idx = 0
-        for input_dtype in imap(str, theano.scalar.all_types):
-            x = tensor.matrix(dtype=input_dtype)
-            for acc_dtype in imap(str, theano.scalar.all_types):
-                # If the accumulator is a complex, the gradient of the sum will
+        for method in self.methods:
+            for input_dtype in imap(str, theano.scalar.all_types):
+                x = tensor.matrix(dtype=input_dtype)
+                for acc_dtype in imap(str, theano.scalar.all_types):
+                # If the accumulator is a complex, the gradient of the reduce will
                # cast the complex to the input dtype. We can't call the normal
                # cast on a complex to a not complex as this is ambiguous.
-                if (not input_dtype.startswith('complex') and
-                    acc_dtype.startswith('complex')):
-                    continue
+                    if (not input_dtype.startswith('complex') and
+                        acc_dtype.startswith('complex')):
+                        continue

-                axis = axes[idx % len(axes)]
+                    axis = self.axes[idx % len(self.axes)]
                # If output_dtype would force a downcast, we expect a TypeError
                # We always allow int/uint inputs with float/complex outputs.
-                upcasted_dtype = scalar.upcast(input_dtype, acc_dtype)
-                if (acc_dtype == upcasted_dtype or
+                    upcasted_dtype = scalar.upcast(input_dtype, acc_dtype)
+                    if (acc_dtype == upcasted_dtype or
                        (input_dtype in tensor.discrete_dtypes and
                            acc_dtype in tensor.continuous_dtypes)
                        ):
-                    sum_var = x.sum(acc_dtype=acc_dtype, axis=axis)
-                    assert sum_var.owner.op.acc_dtype == acc_dtype
+                        var = getattr(x, method)(acc_dtype=acc_dtype, axis=axis)
+                        assert var.owner.op.acc_dtype == acc_dtype

-                    if "complex" in input_dtype:
-                        continue
+                        if "complex" in input_dtype:
+                            continue
                    # Check that we can take the gradient
-                    tensor.grad(sum_var.sum(), x,
-                                disconnected_inputs='ignore')
-                else:
-                    self.assertRaises(TypeError,
-                            x.sum, acc_dtype=acc_dtype, axis=axis)
+                        tensor.grad(var.sum(), x,
+                                    disconnected_inputs='ignore')
+                    else:
+                        self.assertRaises(TypeError,
+                                          getattr(x, method),
+                                          acc_dtype=acc_dtype, axis=axis)

-                idx += 1
+                    idx += 1

-    def test_sum_precision(self):
+    def test_reduce_precision(self):
         # Check that the default accumulator precision is sufficient
-        x = theano.shared(numpy.asarray([1e8, 1, -1e8], dtype='float32'))
-        s = x.sum()
-        f = theano.function([], s)
-        s_val = f()
-        assert numpy.allclose(s_val, 1)
+        for method in self.methods:
+            x = theano.shared(numpy.asarray([1e8, 1, -1e8],
+                                            dtype='float32'))
+            s = getattr(x, method)()
+            f = theano.function([], s, mode=self.mode)
+            topo = f.maker.fgraph.toposort()
+            assert [n for n in topo if isinstance(n.op, self.op)], (topo, method)
+            s_val = f()
+            # Use float64 in NumPy to compute the reference answer.
+            ret = getattr(numpy.asarray([1e8, 1, -1e8], dtype='float64'), method)()
+            assert numpy.allclose(s_val, ret), (s_val, ret)


 class T_mean_dtype(unittest.TestCase):
@@ -923,129 +942,6 @@ class T_mean_dtype(unittest.TestCase):
        assert numpy.allclose(m_val, 1. / 3)


-class T_prod_dtype(unittest.TestCase):
-    def test_prod_default_dtype(self):
-        """
-        Test the default dtype of a prod().
-        """
-        # We try multiple axis combinations even though axis should not matter.
-        axes = [None, 0, 1, [], [0], [1], [0, 1]]
-        for idx, dtype in enumerate(imap(str, theano.scalar.all_types)):
-            axis = axes[idx % len(axes)]
-            x = tensor.matrix(dtype=dtype)
-            p = x.prod(axis=axis)
-            assert p.dtype == dict(
-                    int8='int64',
-                    int16='int64',
-                    int32='int64',
-                    uint8='uint64',
-                    uint16='uint64',
-                    uint32='uint64',
-                    ).get(dtype, dtype)
-            f = theano.function([x], p)
-            data = numpy.random.rand(3, 4) * 10
-            data = data.astype(dtype)
-            f(data)
-
-    def test_prod_default_acc_dtype(self):
-        """
-        Test the default acc_dtype of a prod().
-        """
-        # We try multiple axis combinations even though axis should not matter.
-        axes = [None, 0, 1, [], [0], [1], [0, 1]]
-        for idx, dtype in enumerate(imap(str, theano.scalar.all_types)):
-            axis = axes[idx % len(axes)]
-            x = tensor.matrix(dtype=dtype)
-            p = x.prod(axis=axis)
-            assert p.owner.op.acc_dtype == dict(
-                    int8='int64',
-                    int16='int64',
-                    int32='int64',
-                    uint8='uint64',
-                    uint16='uint64',
-                    uint32='uint64',
-                    float32='float64',
-                    complex64='complex128',
-                    ).get(dtype, dtype)
-            f = theano.function([x], p)
-            data = numpy.random.rand(3, 4) * 10
-            data = data.astype(dtype)
-            f(data)
-
-    @attr('slow')
-    def test_prod_custom_dtype(self):
-        """
-        Test the ability to provide your own output dtype for a prod.
-        """
-        # We try multiple axis combinations even though axis should not matter.
-        axes = [None, 0, 1, [], [0], [1], [0, 1]]
-        idx = 0
-        for input_dtype in imap(str, theano.scalar.all_types):
-            x = tensor.matrix(dtype=input_dtype)
-            for output_dtype in imap(str, theano.scalar.all_types):
-                axis = axes[idx % len(axes)]
-                idx += 1
-                prod_var = x.prod(dtype=output_dtype, axis=axis)
-                assert prod_var.dtype == output_dtype
-
-                if (('complex' in output_dtype or
-                    'complex' in input_dtype) and
-                    input_dtype != output_dtype):
-                    continue
-
-                f = theano.function([x], prod_var)
-                data = numpy.random.rand(3, 4) * 10
-                data = data.astype(input_dtype)
-                f(data)
-
-                if "complex" in output_dtype or "complex" in input_dtype:
-                    continue
-                # Check that we can take the gradient
-                tensor.grad(prod_var.sum(), x,
-                            disconnected_inputs='ignore')
-
-    @attr('slow')
-    def test_prod_custom_acc_dtype(self):
-        """
-        Test the ability to provide your own acc_dtype for a prod.
-        """
-        # We try multiple axis combinations even though axis should not matter.
-        axes = [None, 0, 1, [], [0], [1], [0, 1]]
-        idx = 0
-        for input_dtype in imap(str, theano.scalar.all_types):
-            x = tensor.matrix(dtype=input_dtype)
-            for acc_dtype in imap(str, theano.scalar.all_types):
-                axis = axes[idx % len(axes)]
-                # If acc_dtype would force a downcast, we expect a TypeError
-                # We always allow int/uint inputs with float/complex outputs.
-                upcasted_dtype = scalar.upcast(input_dtype, acc_dtype)
-                if (acc_dtype == upcasted_dtype or
-                        (input_dtype in tensor.discrete_dtypes and
-                            acc_dtype in tensor.continuous_dtypes)
-                        ):
-                    prod_var = x.prod(acc_dtype=acc_dtype, axis=axis)
-                    assert prod_var.owner.op.acc_dtype == acc_dtype
-
-                    if (acc_dtype.startswith('complex') and
-                        input_dtype != acc_dtype):
-                        continue
-                    f = theano.function([x], prod_var)
-                    data = numpy.random.rand(3, 4) * 10
-                    data = data.astype(input_dtype)
-                    f(data)
-
-                    if "complex" in acc_dtype:
-                        continue
-                    # Check that we can take the gradient
-                    tensor.grad(prod_var.sum(), x,
-                                disconnected_inputs='ignore')
-                else:
-                    self.assertRaises(TypeError,
-                            x.prod, acc_dtype=acc_dtype, axis=axis)
-
-                idx += 1
-
-
 class T_prod_without_zeros_dtype(unittest.TestCase):
    def test_prod_without_zeros_default_dtype(self):
        """