Commit 3cb9ac35 authored by nouiz

Merge pull request #1226 from lamblin/stable_reduce

More stable reduce operations by default
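For context, a minimal sketch of the instability this merge addresses (hypothetical session, assuming a build that includes this branch): a float32 sum now accumulates in float64 by default, so cancellation no longer drops small terms, while the result dtype stays float32. This mirrors the test_sum_precision test added below.

    import numpy
    import theano

    x = theano.shared(numpy.asarray([1e8, 1, -1e8], dtype='float32'))
    s = x.sum()                # the output dtype is still float32...
    f = theano.function([], s)
    print(f())                 # ...but accumulation is float64: prints 1.0, not 0.0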
@@ -715,13 +715,34 @@ Reductions
    if axis=None, Theano 0.5rc1 or later: argmin over the flattened tensor (like numpy)
    older: then axis is assumed to be ndim(x)-1

.. function:: sum(x, axis=None, dtype=None, keepdims=False, acc_dtype=None)

    :Parameter: *x* - symbolic Tensor (or compatible)
    :Parameter: *axis* - axis or axes along which to compute the sum
    :Parameter: *dtype* - The dtype of the returned tensor.
        If None, then we use the default dtype, which is the same as
        the input tensor's dtype, except when:
        - the input dtype is a signed integer of precision < 64 bit, in
          which case we use int64
        - the input dtype is an unsigned integer of precision < 64 bit,
          in which case we use uint64
        This default dtype does *not* depend on the value of "acc_dtype".
    :Parameter: *keepdims* - (boolean) If this is set to True, the axes which are reduced are
        left in the result as dimensions with size one. With this option, the result
        will broadcast correctly against the original tensor.
    :Parameter: *acc_dtype* - The dtype of the internal accumulator.
        If None (default), we use the dtype in the list below,
        or the input dtype if its precision is higher:
        - for int dtypes, we use at least int64;
        - for uint dtypes, we use at least uint64;
        - for float dtypes, we use at least float64;
        - for complex dtypes, we use at least complex128.
    :Returns: sum of *x* along *axis*

    axis can be:

@@ -729,13 +750,34 @@ Reductions
    * an *int* - computed along this axis
    * a *list of ints* - computed along these axes
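A short usage sketch of the two new parameters (illustrative dtypes only, assuming this branch):

    import theano.tensor as tensor

    x = tensor.matrix(dtype='int8')
    s = x.sum(axis=0)                    # output dtype defaults to int64
    s32 = x.sum(axis=0, dtype='int32')   # force the output dtype only
    # Accumulate in int64, then cast the result down to int32:
    s_mix = x.sum(axis=0, dtype='int32', acc_dtype='int64')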
.. function:: prod(x, axis=None, dtype=None, keepdims=False, acc_dtype=None)

    :Parameter: *x* - symbolic Tensor (or compatible)
    :Parameter: *axis* - axis or axes along which to compute the product
    :Parameter: *dtype* - The dtype of the returned tensor.
        If None, then we use the default dtype, which is the same as
        the input tensor's dtype, except when:
        - the input dtype is a signed integer of precision < 64 bit, in
          which case we use int64
        - the input dtype is an unsigned integer of precision < 64 bit,
          in which case we use uint64
        This default dtype does *not* depend on the value of "acc_dtype".
    :Parameter: *keepdims* - (boolean) If this is set to True, the axes which are reduced are
        left in the result as dimensions with size one. With this option, the result
        will broadcast correctly against the original tensor.
    :Parameter: *acc_dtype* - The dtype of the internal accumulator.
        If None (default), we use the dtype in the list below,
        or the input dtype if its precision is higher:
        - for int dtypes, we use at least int64;
        - for uint dtypes, we use at least uint64;
        - for float dtypes, we use at least float64;
        - for complex dtypes, we use at least complex128.
    :Returns: product of every term in *x* along *axis*

    axis can be:

@@ -743,13 +785,22 @@ Reductions
    * an *int* - computed along this axis
    * a *list of ints* - computed along these axes
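The corresponding sketch for prod (same assumptions as above):

    import theano.tensor as tensor

    x = tensor.matrix(dtype='float32')
    p = x.prod(axis=1)                     # float32 result, float64 accumulator
    p64 = x.prod(axis=1, dtype='float64')  # keep the full-precision result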
.. function:: mean(x, axis=None, dtype=None, keepdims=False, acc_dtype=None)

    :Parameter: *x* - symbolic Tensor (or compatible)
    :Parameter: *axis* - axis or axes along which to compute the mean
    :Parameter: *dtype* - The dtype to cast the result of the inner summation into.
        For instance, by default, a sum of a float32 tensor will be
        done in float64 (acc_dtype would be float64 by default),
        but that result will be cast back to float32.
    :Parameter: *keepdims* - (boolean) If this is set to True, the axes which are reduced are
        left in the result as dimensions with size one. With this option, the result
        will broadcast correctly against the original tensor.
    :Parameter: *acc_dtype* - The dtype of the internal accumulator of the
        inner summation. This will not necessarily be the dtype of the
        output (in particular, if it is a discrete (int/uint) dtype, the
        output will be in a float type). If None, then we use the same
        rules as :ref:`sum()`.
    :Returns: mean value of *x* along *axis*

    axis can be:
...
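For mean, dtype names the dtype the inner sum is cast into, and acc_dtype names the accumulator of that inner sum; a sketch (assuming this branch):

    import theano.tensor as tensor

    x = tensor.matrix(dtype='float32')
    m = x.mean(axis=0)                     # sums in float64, casts back to float32
    m64 = x.mean(axis=0, dtype='float64')  # keep the float64 mean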
@@ -458,7 +458,8 @@ def grad(cost, wrt, consider_constant=None,
            g_cost = g_cost.astype(cost.type.dtype)
        # DO NOT enforce g_cost to be 0 if cost is an integer.
        # This is to be enforced by the Op.grad method for the Op that outputs cost.
        if hasattr(g_cost.type, 'dtype'):
            assert g_cost.type.dtype not in tensor.discrete_dtypes
        grad_dict[cost] = g_cost
...
@@ -1826,13 +1826,15 @@ class _tensor_py_operators:
    dot = __dot__

    def sum(self, axis=None, dtype=None, keepdims=False, acc_dtype=None):
        """See `theano.tensor.sum`"""
        return sum(self, axis=axis, dtype=dtype, keepdims=keepdims,
                   acc_dtype=acc_dtype)

    def prod(self, axis=None, dtype=None, keepdims=False, acc_dtype=None):
        """See `theano.tensor.prod`"""
        return prod(self, axis=axis, dtype=dtype, keepdims=keepdims,
                    acc_dtype=acc_dtype)

    def norm(self, L, axis=None):
        if L == 0:
@@ -1842,9 +1844,10 @@ class _tensor_py_operators:
        # optimizations will/should catch cases like L=1, L=2
        return pow(pow(abs_(self), L).sum(axis=axis), 1.0 / L)

    def mean(self, axis=None, dtype=None, keepdims=False, acc_dtype=None):
        """See `theano.tensor.mean`"""
        return mean(self, axis=axis, dtype=dtype, keepdims=keepdims,
                    acc_dtype=acc_dtype)
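A quick check that the method forms forward the new keyword (a sketch, assuming this branch; the tests below use the same owner.op.acc_dtype introspection):

    import theano.tensor as tensor

    x = tensor.matrix(dtype='float32')
    assert x.sum(acc_dtype='float64').owner.op.acc_dtype == 'float64'
    assert x.prod(acc_dtype='float64').owner.op.acc_dtype == 'float64'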
    def var(self, axis=None, keepdims=False):
        """See `theano.tensor.var`"""
@@ -3777,7 +3780,7 @@ pprint.assign(tensor_copy, printing.IgnorePrinter())
@constructor
def sum(input, axis=None, dtype=None, keepdims=False, acc_dtype=None):
    """
    Computes the sum along the given axis(es) of a tensor `input`
@@ -3790,10 +3793,10 @@ def sum(input, axis=None, dtype=None, keepdims=False):
    For full documentation see ``tensor.elemwise.Sum``.
    In particular please pay attention to the important warning when using
    a custom acc_dtype.
    """
    out = elemwise.Sum(axis=axis, dtype=dtype, acc_dtype=acc_dtype)(input)
    if keepdims:
        out = makeKeepDims(input, out, axis)
@constructor
def prod(input, axis=None, dtype=None, keepdims=False, acc_dtype=None):
    """
    Computes the product along the given axis(es) of a tensor `input`
@@ -3817,7 +3820,7 @@ def prod(input, axis=None, dtype=None, keepdims=False):
    For full documentation see ``tensor.elemwise.Prod``.
    """
    out = elemwise.Prod(axis, dtype=dtype, acc_dtype=acc_dtype)(input)
    if keepdims:
        out = makeKeepDims(input, out, axis)
@@ -3868,7 +3871,8 @@ class Mean(elemwise.CAReduce):
@constructor
def mean(input, axis=None, dtype=None, op=False, keepdims=False,
         acc_dtype=None):
    """
    Computes the mean value along the given axis(es) of a tensor `input`
@@ -3876,17 +3880,23 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False):
    None means all axes (like numpy).
    :type axis: None or int or (list of int) (see `Sum`)

    :param dtype: dtype to cast the result of the inner summation into.
        For instance, by default, a sum of a float32 tensor will be
        done in float64 (acc_dtype would be float64 by default),
        but that result will be cast back to float32.
        If None, then we use the same rules as `sum()`.
    :type dtype: None or string

    :param keepdims: If this is set to True, the axes which are reduced are
        left in the result as dimensions with size one. With this option,
        the result will broadcast correctly against the original tensor.

    :param acc_dtype: dtype to use for the inner summation. This will not
        necessarily be the dtype of the output (in particular,
        if it is a discrete (int/uint) dtype, the output will
        be in a float type).
        If None, then we use the same rules as `sum()`.
    :type acc_dtype: None or string

    :note: for gpu, if you specify dtype=float32, everything will be done
        on the gpu.
    """
@@ -3898,6 +3908,12 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False):
                'and will always use float64. If you want to specify '
                'the dtype, call tensor.mean(..., op=False).',
                dtype)
        if acc_dtype not in (None, 'float64'):
            raise NotImplementedError(
                'The Mean op does not support the acc_dtype argument, '
                'and will always use float64. If you want to specify '
                'acc_dtype, call tensor.mean(..., op=False).',
                acc_dtype)
        out = Mean(axis)(input)
        if keepdims:
            out = makeKeepDims(input, out, axis)
@@ -3911,7 +3927,8 @@ def mean(input, axis=None, dtype=None, op=False, keepdims=False):
    # Let sum() infer the appropriate dtype.
    sum_dtype = None

    s = sum(input, axis=axis, dtype=sum_dtype, keepdims=keepdims,
            acc_dtype=acc_dtype)
    shp = shape(input)
    # Cast shp into a float type
...
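Illustrating the restriction added above (a sketch, assuming this branch): the Mean op path only supports the default float64 accumulator, so a custom acc_dtype requires the op=False lowering to sum() and shape():

    import theano.tensor as tensor

    x = tensor.matrix(dtype='float32')
    m = tensor.mean(x, acc_dtype='float64', op=False)  # fine: lowered to sum()/shape()
    # tensor.mean(x, acc_dtype='float32', op=True)     # raises NotImplementedError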
@@ -19,6 +19,10 @@ from theano.tensor import elemwise_cgen as cgen
config = theano.config

# We cannot import discrete_dtypes from tensor.basic yet,
# so we redefine them here
discrete_dtypes = map(str, scalar.discrete_types)

# tensor depends on elemwise to provide definitions for several ops
# but elemwise needs to make TensorType instances, so we have these as
@@ -1279,6 +1283,12 @@ class CAReduce(Op):
            axis = range(input.ndim)
        variable = input
        to_reduce = reversed(sorted(axis))

        if hasattr(self, 'acc_dtype'):
            acc_dtype = self.acc_dtype
        else:
            acc_dtype = node.outputs[0].type.dtype

        if to_reduce:
            for dimension in to_reduce:
                # If it's a zero-size array, use scalar_op.identity
@@ -1289,7 +1299,7 @@ class CAReduce(Op):
                    v_shape = list(variable.shape)
                    del v_shape[dimension]
                    variable = numpy.empty(tuple(v_shape),
                                           dtype=acc_dtype)
                    variable.fill(self.scalar_op.identity)
                else:
                    raise ValueError((
@@ -1307,7 +1317,8 @@ class CAReduce(Op):
                    variable = self.ufunc.reduce(variable, dimension,
                                                 dtype='object')
                else:
                    variable = self.ufunc.reduce(variable, dimension,
                                                 dtype=acc_dtype)

        variable = numpy.asarray(variable)
        if numpy.may_share_memory(variable, input):
@@ -1317,7 +1328,9 @@ class CAReduce(Op):
            output[0] = theano._asarray(variable,
                                        dtype=node.outputs[0].type.dtype)
        else:
            # Force a copy
            output[0] = numpy.array(variable, copy=True,
                                    dtype=node.outputs[0].type.dtype)
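The Python-mode change above leans on numpy's ufunc.reduce accepting an accumulator dtype; a numpy-only sketch of the effect:

    import numpy

    x = numpy.array([1e8, 1, -1e8], dtype=numpy.float32)
    print(numpy.add.reduce(x))                       # 0.0: accumulated in float32
    print(numpy.add.reduce(x, dtype=numpy.float64))  # 1.0: accumulated in float64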
    def infer_shape(self, node, shapes):
        ishape, = shapes
@@ -1339,6 +1352,14 @@ class CAReduce(Op):
        idtype = input.type.dtype_specs()[1]
        odtype = output.type.dtype_specs()[1]

        if hasattr(self, 'acc_dtype'):
            acc_type = TensorType(
                broadcastable=node.outputs[0].broadcastable,
                dtype=self.acc_dtype)
            adtype = acc_type.dtype_specs()[1]
        else:
            adtype = odtype

        axis = self.axis
        if axis is None:
            axis = range(len(input.type.broadcastable))
@@ -1356,13 +1377,25 @@ class CAReduce(Op):
        for i, (input, iname) in enumerate(izip(node.inputs, inames)):
            sub['lv%i' % i] = iname

        decl = ""
        if adtype != odtype:
            # Create an accumulator variable different from the output
            aname = "acc"
            decl = acc_type.c_declare(aname, sub)
            decl += acc_type.c_init(aname, sub)
        else:
            # the output is the accumulator variable
            aname = oname

        decl += cgen.make_declare([order], [idtype], sub)
        checks = cgen.make_checks([order], [idtype], sub)

        alloc = ""
        i += 1
        sub['lv%i' % i] = oname
        sub['olv'] = oname

        # Allocate output buffer
        alloc += cgen.make_declare(
            [range(nnested) + ['x'] * len(axis)],
            [odtype], dict(sub, lv0=oname))
@@ -1371,6 +1404,19 @@ class CAReduce(Op):
            [range(nnested) + ['x'] * len(axis)],
            [odtype], dict(sub, lv0=oname))

        if adtype != odtype:
            # Allocate accumulation buffer
            sub['lv%i' % i] = aname
            sub['olv'] = aname

            alloc += cgen.make_declare(
                [range(nnested) + ['x'] * len(axis)],
                [adtype], dict(sub, lv0=aname))
            alloc += cgen.make_alloc([order1], adtype, sub)
            alloc += cgen.make_checks(
                [range(nnested) + ['x'] * len(axis)],
                [adtype], dict(sub, lv0=aname))

        if hasattr(self.scalar_op, 'identity'):
            identity = self.scalar_op.identity
        elif self.scalar_op in [scalar.maximum, scalar.minimum]:
@@ -1414,7 +1460,7 @@ for(int i=0;i<PyArray_NDIM(%(iname)s);i++){
        task0_decl = (
            "%(dtype)s& %(name)s_i = *%(name)s_iter;\n"
            "%(name)s_i = %(identity)s;"
            % dict(dtype=adtype, name=aname, identity=identity))

        task1_decl = ("%(dtype)s& %(name)s_i = *%(name)s_iter;\n"
                      % dict(dtype=idtype, name=inames[0]))
@@ -1427,8 +1473,8 @@ for(int i=0;i<PyArray_NDIM(%(iname)s);i++){
            [Scalar(dtype=output.type.dtype)()
             for input in node.outputs]),
            None,
            ["%s_i" % aname, "%s_i" % inames[0]],
            ["%s_i" % aname],
            sub)

        code1 = """
        {
@@ -1450,8 +1496,16 @@ for(int i=0;i<PyArray_NDIM(%(iname)s);i++){
        all_code = [task0_decl + code1]
        loop = cgen.make_loop(
            [order, range(nnested) + ['x'] * len(axis)],
            [idtype, adtype], all_code, sub)

        end = ""
        if adtype != odtype:
            end = """
            PyArray_CopyInto(%(oname)s, %(aname)s);
            """ % dict(oname=oname, aname=aname)
            end += acc_type.c_cleanup(aname, sub)

        return decl, checks, alloc, loop, end
    def c_code(self, node, name, inames, onames, sub):
        code = "\n".join(self._c_all(node, name, inames, onames, sub))
@@ -1462,7 +1516,7 @@ for(int i=0;i<PyArray_NDIM(%(iname)s);i++){
        return ['<vector>', '<algorithm>']

    def c_code_cache_version_apply(self, node):
        version = [5]  # the version corresponding to the c code in this Op
        # now we insert versions for the ops on which we depend...
        scalar_node = Apply(self.scalar_op,
@@ -1532,13 +1586,19 @@ class CAReduceDtype(CAReduce):
    Reduces a scalar operation along the specified axis(es).

    This subclass of CAReduce accepts an additional "dtype" parameter
    that specifies which dtype the output should be.
    It also accepts an optional "acc_dtype", which specifies the dtype
    that will be used for the accumulation.

    So the accumulation will be done into a tensor of dtype "acc_dtype",
    then cast into "dtype" and returned.

    If no dtype is provided, one will be inferred so as not to lose
    too much precision.
    """
    def __init__(self, scalar_op, axis=None, dtype=None, acc_dtype=None):
        """
        Usage: CAReduceDtype(scalar_op, axis=None, dtype=None, acc_dtype=None)
@@ -1549,26 +1609,38 @@ class CAReduceDtype(CAReduce):
        - list of dimensions that we want to reduce
        - if None, all dimensions are reduced

        :param dtype: The dtype of the returned tensor.
            If None, then we use the default dtype, which is the same
            as the input tensor's dtype, except when:
            - the input dtype is a signed integer of precision < 64 bit, in
              which case we use int64
            - the input dtype is an unsigned integer of precision < 64 bit,
              in which case we use uint64
            This default dtype does *not* depend on the value of "acc_dtype".
            This behavior is similar in spirit to that of numpy (except numpy
            uses the default machine integer while we always use 64 bit
            integers to avoid platform-dependent behavior).

        :param acc_dtype: The dtype of the internal accumulator.
            If None (default), we use the dtype in the list below,
            or the input dtype if its precision is higher:
            - for int dtypes, we use at least int64;
            - for uint dtypes, we use at least uint64;
            - for float dtypes, we use at least float64;
            - for complex dtypes, we use at least complex128.
        """
        CAReduce.__init__(self, scalar_op, axis=axis)
        self.dtype = dtype
        self.acc_dtype = acc_dtype
    def __eq__(self, other):
        return (CAReduce.__eq__(self, other)
                and self.dtype == other.dtype
                and self.acc_dtype == other.acc_dtype)

    def __hash__(self):
        return CAReduce.__hash__(self) ^ hash((self.dtype, self.acc_dtype))

    def __setstate__(self, d):
        self.__dict__.update(d)
@@ -1578,6 +1650,11 @@ class CAReduceDtype(CAReduce):
            # could be in an apply node with a specified dtype.
            self.dtype = "OLD"

        if not hasattr(self, "acc_dtype"):
            # acc_dtype is not used by any external Op, so we do not
            # need to keep the previous behaviour here.
            self.acc_dtype = None
    def _output_dtype(self, idtype):
        dtype = self.dtype
        if dtype == "OLD":
@@ -1599,25 +1676,46 @@ class CAReduceDtype(CAReduce):
                uint16='uint64',
                uint32='uint64',
            ).get(idtype, idtype)
        else:
            # What matters is that the accumulator dtype does not lose
            # precision. The result can then be downcast.
            return dtype
    def _acc_dtype(self, idtype):
        acc_dtype = self.acc_dtype
        if acc_dtype is None:
            return dict(
                int8='int64',
                int16='int64',
                int32='int64',
                uint8='uint64',
                uint16='uint64',
                uint32='uint64',
                float32='float64',
                complex64='complex128',
            ).get(idtype, idtype)
        elif (acc_dtype in theano.tensor.continuous_dtypes and
              idtype in theano.tensor.discrete_dtypes):
            # Specifying a continuous accumulator for discrete input is OK
            return acc_dtype
        else:
            # The conversion has to be considered an upcast.
            upcasted_dtype = scalar.upcast(idtype, acc_dtype)
            if acc_dtype != upcasted_dtype:
                raise TypeError(
                    'Cannot build %s node with input dtype %s '
                    'and acc_dtype %s, as precision would be lost. '
                    'To correct this error, you can:\n'
                    '  - not specify acc_dtype, or\n'
                    '  - use an acc_dtype at least as precise as %s, or\n'
                    '  - specify "dtype" instead of "acc_dtype", so '
                    'the reduction will be precise, but the result will '
                    'be cast into "dtype" at the end.\n'
                    'If you are expecting the precision loss, you can '
                    'use tensor.cast(..., dtype="%s") on your input.'
                    % (self, idtype, acc_dtype, upcasted_dtype, acc_dtype))
            return acc_dtype
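The rule in action (a sketch, assuming this branch): a wider accumulator is accepted, a continuous accumulator for a discrete input is accepted, and anything that would downcast raises TypeError:

    import theano.tensor as tensor

    x = tensor.matrix(dtype='int32')
    ok = x.sum(acc_dtype='int64')     # upcast: accepted
    f = x.sum(acc_dtype='float64')    # continuous acc for discrete input: accepted
    # x.sum(acc_dtype='int16')        # would lose precision: raises TypeError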
    def make_node(self, input):
        # We need to redefine make_node so that, if self.dtype is None,
@@ -1625,12 +1723,17 @@ class CAReduceDtype(CAReduce):
        # of the appropriate dtype.
        input = as_tensor_variable(input)
        dtype = self._output_dtype(input.dtype)
        acc_dtype = self._acc_dtype(input.dtype)

        assert dtype is not None
        assert acc_dtype is not None

        if dtype == self.dtype and acc_dtype == self.acc_dtype:
            # Don't build another instance
            op = self
        else:
            op = self.__class__(axis=self.axis,
                                dtype=dtype, acc_dtype=acc_dtype)

        assert op.acc_dtype is not None
        return CAReduce.make_node(op, input)
@@ -1643,7 +1746,7 @@ class Sum(CAReduceDtype):
    tensor input.
    """
    def __init__(self, axis=None, dtype=None, acc_dtype=None):
        """
        Constructor.
@@ -1658,8 +1761,18 @@ class Sum(CAReduceDtype):
              which case we use int64
            - the input dtype is an unsigned integer of precision < 64 bit, in
              which case we use uint64
            This value does not depend on the value of "acc_dtype".

        :param acc_dtype: The dtype of the internal accumulator.
            If None (default), we use the dtype in the list below,
            or the input dtype if its precision is higher:
            - for int dtypes, we use at least int64;
            - for uint dtypes, we use at least uint64;
            - for float dtypes, we use at least float64;
            - for complex dtypes, we use at least complex128.
        """
        CAReduceDtype.__init__(self, scalar.add, axis=axis,
                               dtype=dtype, acc_dtype=acc_dtype)
    def grad(self, inp, grads):
        x, = inp
@@ -1667,7 +1780,7 @@ class Sum(CAReduceDtype):
        out = self(*inp)

        if out.dtype.find('int') != -1:
            return [theano.tensor.zeros_like(x, dtype=theano.config.floatX)]

        gz, = grads
        gz = as_tensor_variable(gz)
@@ -1710,8 +1823,10 @@ class Prod(CAReduceDtype):
    difference that this defines the gradient of prod wrt its tensor
    input.
    """
    def __init__(self, axis=None, dtype=None, acc_dtype=None,
                 no_zeros_in_input=False):
        CAReduceDtype.__init__(self, scalar.mul, axis=axis,
                               dtype=dtype, acc_dtype=acc_dtype)
        self.no_zeros_in_input = no_zeros_in_input

    def __setstate__(self, dct):
@@ -1779,8 +1894,11 @@ class Prod(CAReduceDtype):
        out = self(*inp)

        if (out.dtype in discrete_dtypes or
                self.acc_dtype in discrete_dtypes):
            # There is an int conversion in the way
            return [theano.tensor.zeros_like(prod_in,
                                             dtype=theano.config.floatX)]

        # Prepare the broadcasting that is used everywhere to broadcast
        # over the original groups (ie. broadcast over the elements of a given
@@ -1893,8 +2011,9 @@ mul_without_zeros = MulWithoutZeros(scalar.upcast_out,
class ProdWithoutZeros(CAReduceDtype):
    def __init__(self, axis=None, dtype=None, acc_dtype=None):
        CAReduceDtype.__init__(self, mul_without_zeros, axis=axis,
                               dtype=dtype, acc_dtype=acc_dtype)

    def __str__(self):
        if self.axis is None:
...
@@ -632,6 +632,24 @@ class T_sum_dtype(unittest.TestCase):
                uint32='uint64',
            ).get(dtype, dtype)

    def test_sum_default_acc_dtype(self):
        """
        Test the default acc_dtype of a sum().
        """
        # We try multiple axis combinations even though axis should not matter.
        axes = [None, 0, 1, [0], [1], [0, 1]]
        for idx, dtype in enumerate(imap(str, theano.scalar.all_types)):
            axis = axes[idx % len(axes)]
            x = tensor.matrix(dtype=dtype).sum(axis=axis)
            assert x.owner.op.acc_dtype == dict(
                int8='int64',
                int16='int64',
                int32='int64',
                uint8='uint64',
                uint16='uint64',
                uint32='uint64',
                float32='float64',
                complex64='complex128',
            ).get(dtype, dtype)
    def test_sum_custom_dtype(self):
        """
        Test the ability to provide your own output dtype for a sum.
@@ -649,16 +667,44 @@ class T_sum_dtype(unittest.TestCase):
                        output_dtype.startswith('complex')):
                    continue

                axis = axes[idx % len(axes)]
                sum_var = x.sum(dtype=output_dtype, axis=axis)
                assert sum_var.dtype == output_dtype

                if "complex" in input_dtype:
                    continue
                # Check that we can take the gradient
                grad_var = tensor.grad(sum_var.sum(), x,
                                       disconnected_inputs='ignore')
                idx += 1
    def test_sum_custom_acc_dtype(self):
        """
        Test the ability to provide your own accumulator dtype for a sum.
        """
        # We try multiple axis combinations even though axis should not matter.
        axes = [None, 0, 1, [0], [1], [0, 1]]
        idx = 0
        for input_dtype in imap(str, theano.scalar.all_types):
            x = tensor.matrix(dtype=input_dtype)
            for acc_dtype in imap(str, theano.scalar.all_types):
                # If the accumulator is complex, the gradient of the sum
                # will cast the complex down to the input dtype. We cannot
                # use the normal cast to go from a complex to a non-complex
                # dtype, as that is ambiguous.
                if (not input_dtype.startswith('complex') and
                        acc_dtype.startswith('complex')):
                    continue

                axis = axes[idx % len(axes)]
                # If acc_dtype would force a downcast, we expect a TypeError.
                # We always allow int/uint inputs with float/complex accumulators.
                upcasted_dtype = scalar.upcast(input_dtype, acc_dtype)
                if (acc_dtype == upcasted_dtype or
                        (input_dtype in tensor.discrete_dtypes and
                         acc_dtype in tensor.continuous_dtypes)
                        ):
                    sum_var = x.sum(acc_dtype=acc_dtype, axis=axis)
                    assert sum_var.owner.op.acc_dtype == acc_dtype

                    if "complex" in input_dtype:
                        continue
@@ -667,10 +713,18 @@ class T_sum_dtype(unittest.TestCase):
                                           disconnected_inputs='ignore')
                else:
                    self.assertRaises(TypeError,
                                      x.sum, acc_dtype=acc_dtype, axis=axis)

                idx += 1
    def test_sum_precision(self):
        # Check that the default accumulator precision is sufficient
        x = theano.shared(numpy.asarray([1e8, 1, -1e8], dtype='float32'))
        s = x.sum()
        f = theano.function([], s)
        s_val = f()
        assert numpy.allclose(s_val, 1)
class T_mean_dtype(unittest.TestCase):
    def test_mean_default_dtype(self):
@@ -688,7 +742,6 @@ class T_mean_dtype(unittest.TestCase):
            assert x.dtype == dtype, (x, x.dtype, dtype)

    def test_mean_custom_dtype(self):
        """
        Test the ability to provide your own output dtype for a mean.
        """
@@ -727,6 +780,14 @@ class T_mean_dtype(unittest.TestCase):
                idx += 1
    def test_mean_precision(self):
        # Check that the default accumulator precision is sufficient
        x = theano.shared(numpy.asarray([1e8, 1, -1e8], dtype='float32'))
        m = x.mean()
        f = theano.function([], m)
        m_val = f()
        assert numpy.allclose(m_val, 1. / 3)
class T_prod_dtype(unittest.TestCase):
    def test_prod_default_dtype(self):
@@ -747,6 +808,26 @@ class T_prod_dtype(unittest.TestCase):
                uint32='uint64',
            ).get(dtype, dtype)

    def test_prod_default_acc_dtype(self):
        """
        Test the default acc_dtype of a prod().
        """
        # We try multiple axis combinations even though axis should not matter.
        axes = [None, 0, 1, [0], [1], [0, 1]]
        for idx, dtype in enumerate(imap(str, theano.scalar.all_types)):
            axis = axes[idx % len(axes)]
            x = tensor.matrix(dtype=dtype).prod(axis=axis)
            assert x.owner.op.acc_dtype == dict(
                int8='int64',
                int16='int64',
                int32='int64',
                uint8='uint64',
                uint16='uint64',
                uint32='uint64',
                float32='float64',
                complex64='complex128',
            ).get(dtype, dtype)
    def test_prod_custom_dtype(self):
        """
        Test the ability to provide your own output dtype for a prod.
@@ -758,24 +839,45 @@ class T_prod_dtype(unittest.TestCase):
            x = tensor.matrix(dtype=input_dtype)
            for output_dtype in imap(str, theano.scalar.all_types):
                axis = axes[idx % len(axes)]
                prod_var = x.prod(dtype=output_dtype, axis=axis)
                assert prod_var.dtype == output_dtype

                if "complex" in output_dtype or "complex" in input_dtype:
                    continue
                # Check that we can take the gradient
                grad_var = tensor.grad(prod_var.sum(), x,
                                       disconnected_inputs='ignore')
                idx += 1
    def test_prod_custom_acc_dtype(self):
        """
        Test the ability to provide your own acc_dtype for a prod.
        """
        # We try multiple axis combinations even though axis should not matter.
        axes = [None, 0, 1, [0], [1], [0, 1]]
        idx = 0
        for input_dtype in imap(str, theano.scalar.all_types):
            x = tensor.matrix(dtype=input_dtype)
            for acc_dtype in imap(str, theano.scalar.all_types):
                axis = axes[idx % len(axes)]
                # If acc_dtype would force a downcast, we expect a TypeError.
                # We always allow int/uint inputs with float/complex accumulators.
                upcasted_dtype = scalar.upcast(input_dtype, acc_dtype)
                if (acc_dtype == upcasted_dtype or
                        (input_dtype in tensor.discrete_dtypes and
                         acc_dtype in tensor.continuous_dtypes)
                        ):
                    prod_var = x.prod(acc_dtype=acc_dtype, axis=axis)
                    assert prod_var.owner.op.acc_dtype == acc_dtype

                    if "complex" in acc_dtype:
                        continue
                    # Check that we can take the gradient
                    grad_var = tensor.grad(prod_var.sum(), x,
                                           disconnected_inputs='ignore')
                else:
                    self.assertRaises(TypeError,
                                      x.prod, acc_dtype=acc_dtype, axis=axis)

                idx += 1
@@ -799,6 +901,26 @@ class T_prod_without_zeros_dtype(unittest.TestCase):
                uint32='uint64',
            ).get(dtype, dtype)

    def test_prod_without_zeros_default_acc_dtype(self):
        """
        Test the default acc_dtype of a ProdWithoutZeros().
        """
        # We try multiple axis combinations even though axis should not matter.
        axes = [None, 0, 1, [0], [1], [0, 1]]
        for idx, dtype in enumerate(imap(str, theano.scalar.all_types)):
            axis = axes[idx % len(axes)]
            x = ProdWithoutZeros(axis=axis)(tensor.matrix(dtype=dtype))
            assert x.owner.op.acc_dtype == dict(
                int8='int64',
                int16='int64',
                int32='int64',
                uint8='uint64',
                uint16='uint64',
                uint32='uint64',
                float32='float64',
                complex64='complex128'
            ).get(dtype, dtype)
    def test_prod_without_zeros_custom_dtype(self):
        """
        Test the ability to provide your own output dtype for a ProdWithoutZeros().
@@ -810,19 +932,35 @@ class T_prod_without_zeros_dtype(unittest.TestCase):
            x = tensor.matrix(dtype=input_dtype)
            for output_dtype in imap(str, theano.scalar.all_types):
                axis = axes[idx % len(axes)]
                prod_woz_var = ProdWithoutZeros(
                    axis=axis, dtype=output_dtype)(x)
                assert prod_woz_var.dtype == output_dtype
                idx += 1

    def test_prod_without_zeros_custom_acc_dtype(self):
        """
        Test the ability to provide your own acc_dtype for a ProdWithoutZeros().
        """
        # We try multiple axis combinations even though axis should not matter.
        axes = [None, 0, 1, [0], [1], [0, 1]]
        idx = 0
        for input_dtype in imap(str, theano.scalar.all_types):
            x = tensor.matrix(dtype=input_dtype)
            for acc_dtype in imap(str, theano.scalar.all_types):
                axis = axes[idx % len(axes)]
                # If acc_dtype would force a downcast, we expect a TypeError.
                # We always allow int/uint inputs with float/complex accumulators.
                upcasted_dtype = scalar.upcast(input_dtype, acc_dtype)
                if (acc_dtype == upcasted_dtype or
                        (input_dtype in tensor.discrete_dtypes and
                         acc_dtype in tensor.continuous_dtypes)
                        ):
                    prod_woz_var = ProdWithoutZeros(
                        axis=axis, acc_dtype=acc_dtype)(x)
                    assert prod_woz_var.owner.op.acc_dtype == acc_dtype
                else:
                    self.assertRaises(TypeError,
                                      ProdWithoutZeros(axis=axis, acc_dtype=acc_dtype),
                                      x)
                idx += 1
...