提交 72623e6c authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #6069 from abergeron/dnn_redux

Add reduction based on cudnn.
...@@ -243,10 +243,8 @@ def test_badoptimization(): ...@@ -243,10 +243,8 @@ def test_badoptimization():
def test_badoptimization_opt_err(): def test_badoptimization_opt_err():
"""This variant of test_badoptimization() replace the working code # This variant of test_badoptimization() replace the working code
with a new apply node that will raise an error. # with a new apply node that will raise an error.
"""
@gof.local_optimizer([theano.tensor.add]) @gof.local_optimizer([theano.tensor.add])
def insert_bigger_b_add(node): def insert_bigger_b_add(node):
if node.op == theano.tensor.add: if node.op == theano.tensor.add:
......
...@@ -634,7 +634,7 @@ class _make_cdata(Op): ...@@ -634,7 +634,7 @@ class _make_cdata(Op):
""" % dict(ctype=self.rtype.ctype, out=outputs[0], inp=inputs[0]) """ % dict(ctype=self.rtype.ctype, out=outputs[0], inp=inputs[0])
def c_code_cache_version(self): def c_code_cache_version(self):
return (0,) return (0, self.rtype.version)
class CDataType(Type): class CDataType(Type):
......
...@@ -34,9 +34,6 @@ class CuDNNV51(object): ...@@ -34,9 +34,6 @@ class CuDNNV51(object):
cudnnDataType_t = CEnumType(('CUDNN_DATA_FLOAT', 'float32'), cudnnDataType_t = CEnumType(('CUDNN_DATA_FLOAT', 'float32'),
('CUDNN_DATA_DOUBLE', 'float64'), ('CUDNN_DATA_DOUBLE', 'float64'),
('CUDNN_DATA_HALF', 'float16'), ('CUDNN_DATA_HALF', 'float16'),
# CUDNN_DATA_INT8 # new in v6
# CUDNN_DATA_INT32 # new in v6
# CUDNN_DATA_INT8x4 # new in v6
ctype='cudnnDataType_t') ctype='cudnnDataType_t')
cudnnConvolutionFwdAlgo_t = CEnumType(('CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM', 'none'), cudnnConvolutionFwdAlgo_t = CEnumType(('CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM', 'none'),
...@@ -91,11 +88,24 @@ class CuDNNV51(object): ...@@ -91,11 +88,24 @@ class CuDNNV51(object):
cudnnBatchNormMode_t = CEnumType(('CUDNN_BATCHNORM_PER_ACTIVATION', 'per-activation'), cudnnBatchNormMode_t = CEnumType(('CUDNN_BATCHNORM_PER_ACTIVATION', 'per-activation'),
('CUDNN_BATCHNORM_SPATIAL', 'spatial'), ('CUDNN_BATCHNORM_SPATIAL', 'spatial'),
ctype='cudnnBatchNormMode_t') ctype='cudnnBatchNormMode_t')
# It was introduced in cudnnv6, but we need to define it with an
# empty list of enum to don't crash with cudnn 5
cudnnReduceTensorOp_t = CEnumType()
class CuDNNV6(CuDNNV51): class CuDNNV6(CuDNNV51):
version = 6 version = 6
cudnnDataType_t = CEnumType(('CUDNN_DATA_FLOAT', 'float32'),
('CUDNN_DATA_DOUBLE', 'float64'),
('CUDNN_DATA_HALF', 'float16'),
# new in v6
('CUDNN_DATA_INT8', 'int8'),
('CUDNN_DATA_INT32', 'int32'),
# Also in v6, but restrictions make this fail
# CUDNN_DATA_INT8x4
ctype='cudnnDataType_t')
cudnnPoolingMode_t = CEnumType(('CUDNN_POOLING_MAX', 'max'), cudnnPoolingMode_t = CEnumType(('CUDNN_POOLING_MAX', 'max'),
('CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING', 'average_inc_pad'), ('CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING', 'average_inc_pad'),
('CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING', 'average_exc_pad'), ('CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING', 'average_exc_pad'),
...@@ -115,6 +125,16 @@ class CuDNNV6(CuDNNV51): ...@@ -115,6 +125,16 @@ class CuDNNV6(CuDNNV51):
('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING', 'fft_tiling'), ('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING', 'fft_tiling'),
ctype='cudnnConvolutionBwdFilterAlgo_t') ctype='cudnnConvolutionBwdFilterAlgo_t')
cudnnReduceTensorOp_t = CEnumType(('CUDNN_REDUCE_TENSOR_ADD', 'add'),
('CUDNN_REDUCE_TENSOR_MUL', 'mul'),
('CUDNN_REDUCE_TENSOR_MIN', 'minimum'),
('CUDNN_REDUCE_TENSOR_MAX', 'maximum'),
('CUDNN_REDUCE_TENSOR_AMAX', 'absmax'),
('CUDNN_REDUCE_TENSOR_AVG', 'avg'),
('CUDNN_REDUCE_TENSOR_NORM1', 'norm1'),
('CUDNN_REDUCE_TENSOR_NORM2', 'norm2'),
ctype='cudnnReduceTensorOp_t')
def get_definitions(cudnn_version=None): def get_definitions(cudnn_version=None):
""" """
......
...@@ -6,10 +6,12 @@ import warnings ...@@ -6,10 +6,12 @@ import warnings
import numpy as np import numpy as np
from six import integer_types from six import integer_types
from six.moves import reduce
import theano import theano
from theano import Op, Apply, tensor, config, Variable from theano import Op, Apply, tensor, config, Variable
from theano.scalar import as_scalar, constant, Log, get_scalar_type, int32 as int_t, bool as bool_t from theano.scalar import (as_scalar, constant, Log, get_scalar_type,
int32 as int_t, bool as bool_t, uint32 as uint32_t)
from theano.tensor import as_tensor_variable from theano.tensor import as_tensor_variable
from theano.gradient import DisconnectedType, grad_not_implemented from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp, ParamsType, EnumList from theano.gof import Optimizer, local_optimizer, COp, ParamsType, EnumList
...@@ -34,7 +36,7 @@ from .type import (get_context, gpu_context_type, list_contexts, ...@@ -34,7 +36,7 @@ from .type import (get_context, gpu_context_type, list_contexts,
from .basic_ops import (as_gpuarray_variable, infer_context_name, from .basic_ops import (as_gpuarray_variable, infer_context_name,
gpu_contiguous, GpuAllocEmpty, gpu_contiguous, GpuAllocEmpty,
empty_like, GpuArrayType, HostFromGpu) empty_like, GpuArrayType, HostFromGpu)
from .elemwise import GpuElemwise from .elemwise import GpuElemwise, GpuCAReduceCuda
# These don't exist in gpuarray # These don't exist in gpuarray
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad # GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
...@@ -433,7 +435,8 @@ class GpuDnnConvDesc(COp): ...@@ -433,7 +435,8 @@ class GpuDnnConvDesc(COp):
node = Apply(self, [kern_shape], node = Apply(self, [kern_shape],
[CDataType("cudnnConvolutionDescriptor_t", [CDataType("cudnnConvolutionDescriptor_t",
freefunc="cudnnDestroyConvolutionDescriptor")()]) freefunc="cudnnDestroyConvolutionDescriptor",
version=version(raises=False))()])
# DebugMode cannot compare the values of CDataType variables, so by # DebugMode cannot compare the values of CDataType variables, so by
# default it returns False all the time. To prevent DebugMode from # default it returns False all the time. To prevent DebugMode from
# complaining because of the MergeOptimizer, we make this variable # complaining because of the MergeOptimizer, we make this variable
...@@ -1216,7 +1219,8 @@ class GpuDnnPoolDesc(Op): ...@@ -1216,7 +1219,8 @@ class GpuDnnPoolDesc(Op):
def make_node(self): def make_node(self):
node = Apply(self, [], node = Apply(self, [],
[CDataType("cudnnPoolingDescriptor_t", [CDataType("cudnnPoolingDescriptor_t",
freefunc="cudnnDestroyPoolingDescriptor")()]) freefunc="cudnnDestroyPoolingDescriptor",
version=version(raises=False))()])
# DebugMode cannot compare the values of CDataType variables, so by # DebugMode cannot compare the values of CDataType variables, so by
# default it returns False all the time. To prevent DebugMode from # default it returns False all the time. To prevent DebugMode from
# complaining because of the MergeOptimizer, we make this variable # complaining because of the MergeOptimizer, we make this variable
...@@ -1557,6 +1561,75 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase): ...@@ -1557,6 +1561,75 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
return Apply(self, [dy, sm], [sm.type()]) return Apply(self, [dy, sm], [sm.type()])
class GpuDnnReduction(DnnBase):
    """cuDNN-based tensor reduction (cudnnReduceTensor, cuDNN v6+).

    Reduces the input over the axes given at construction time and,
    optionally, also returns the flattened indices of the selected
    elements for 'maximum'/'minimum' reductions.
    """
    check_input = False
    _f16_ok = True
    # The C implementation always declares two outputs (values, indices),
    # even when indices are not requested.
    _cop_num_outputs = 2

    __props__ = ('red_op', 'axis', 'acc_dtype', 'dtype', 'return_indices')

    params_type = ParamsType(red_op=cudnn.cudnnReduceTensorOp_t,
                             acc_dtype=cudnn.cudnnDataType_t,
                             c_axis=uint32_t,
                             handle=handle_type)

    def __init__(self, red_op, axis, acc_dtype, dtype, return_indices):
        """
        Parameters
        ----------
        red_op : str
            One of the cudnnReduceTensorOp_t aliases ('add', 'mul',
            'minimum', 'maximum', 'absmax', 'avg', 'norm1', 'norm2').
        axis : iterable of int or None
            Axes to reduce over; None means reduce over all axes.
        acc_dtype : str
            Accumulator dtype ('float16', 'float32' or 'float64').
        dtype : str
            Output dtype ('float16', 'float32' or 'float64').
        return_indices : bool
            If True, also return the flattened indices of the reduced
            elements; only valid for 'maximum' and 'minimum'.
        """
        DnnBase.__init__(self, ['dnn_redux.c'], 'APPLY_SPECIFIC(dnn_redux)')
        assert cudnn.cudnnReduceTensorOp_t.has_alias(red_op)
        self.red_op = red_op
        assert acc_dtype in ['float16', 'float32', 'float64']
        self.acc_dtype = acc_dtype
        assert dtype in ['float16', 'float32', 'float64']
        self.dtype = dtype
        # 8 is the current dimension limit for cudnn
        if axis is not None:
            if len(axis) > 8:
                raise ValueError('Too many axes to reduce on')
            # Valid axis indices are 0..7; the old message ("larger than
            # 8") contradicted this `>= 8` test.
            if any(a >= 8 for a in axis):
                raise ValueError('Axes >= 8 not supported')
            axis = tuple(axis)
        # c_axis is a bitfield (bit i set == reduce axis i)
        self.c_axis = self._convert_axis(axis)
        # axis is the tuple of axes to reduce on (or None for all)
        self.axis = axis
        # BUGFIX: the enum aliases are 'maximum'/'minimum' (see
        # cudnnReduceTensorOp_t); 'max'/'min' can never pass the
        # has_alias assert above, so comparing against them made
        # return_indices=True raise unconditionally.
        if return_indices and red_op not in ('maximum', 'minimum'):
            raise ValueError("Can't request indices for something other than min or max")
        self.return_indices = return_indices

    def _convert_axis(self, axis):
        """Encode *axis* as a uint32 bitfield (bit i set -> reduce axis i)."""
        if axis is None:
            # All bits set: reduce over every axis.
            return np.uint32(-1)
        else:
            return reduce(lambda a, b: a | b, map(lambda a: 1 << a, axis), 0)

    def make_node(self, inp):
        ctx_name = infer_context_name(inp)
        inp = as_gpuarray_variable(inp, ctx_name)
        if inp.ndim > 8:
            raise ValueError("cuDNN reduction doesn't support nd > 8")
        assert inp.dtype in ['float16', 'float32', 'float64']

        # These restrictions were guessed from vague clues since
        # there is no actual documentation on this
        if inp.dtype == 'float64':
            assert self.acc_dtype == 'float64'
        if inp.dtype == 'float32':
            assert self.acc_dtype == 'float32'
        if inp.dtype == 'float16':
            assert self.acc_dtype != 'float64'

        # The output keeps only the non-reduced dimensions, in order.
        bcast = []
        for i in range(inp.ndim):
            if not (self.c_axis & (1 << i)):
                bcast.append(inp.broadcastable[i])
        outs = [inp.type.clone(dtype=self.dtype, broadcastable=bcast)()]
        if self.return_indices:
            # Indices are uint32 (the C code uses CUDNN_32BIT_INDICES).
            outs.append(GpuArrayType(dtype='uint32', broadcastable=bcast,
                                     context_name=ctx_name)())
        return Apply(self, [inp], outs)
class GpuDnnBatchNorm(DnnBase): class GpuDnnBatchNorm(DnnBase):
""" """
Base Op for cuDNN Batch Normalization. Base Op for cuDNN Batch Normalization.
...@@ -1812,7 +1885,8 @@ class GpuDnnBatchNormGrad(DnnBase): ...@@ -1812,7 +1885,8 @@ class GpuDnnBatchNormGrad(DnnBase):
gpudata_type = CDataType('gpudata *', 'gpudata_release') gpudata_type = CDataType('gpudata *', 'gpudata_release')
dropoutdesc_type = CDataType('cudnnDropoutDescriptor_t', dropoutdesc_type = CDataType('cudnnDropoutDescriptor_t',
'cudnnDestroyDropoutDescriptor') 'cudnnDestroyDropoutDescriptor',
version=version(raises=False))
class GpuDnnDropoutOp(DnnBase): class GpuDnnDropoutOp(DnnBase):
...@@ -1881,7 +1955,8 @@ def dropout(x, dropout=0.0, seed=4242): ...@@ -1881,7 +1955,8 @@ def dropout(x, dropout=0.0, seed=4242):
return y, desc, odesc, states return y, desc, odesc, states
rnndesc_type = CDataType('cudnnRNNDescriptor_t', rnndesc_type = CDataType('cudnnRNNDescriptor_t',
'cudnnDestroyRNNDescriptor') 'cudnnDestroyRNNDescriptor',
version=version(raises=False))
def as_i32(v): def as_i32(v):
...@@ -2985,6 +3060,76 @@ def local_gpua_logsoftmax_to_dnn(op, ctx_name, inputs, outputs): ...@@ -2985,6 +3060,76 @@ def local_gpua_logsoftmax_to_dnn(op, ctx_name, inputs, outputs):
return [out.dimshuffle(0, 1)] return [out.dimshuffle(0, 1)]
@register_opt('cudnn', 'fast_compile')
@op_lifter([SoftmaxGrad])
@register_opt2([SoftmaxGrad], 'cudnn', 'fast_compile')
def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
    # Lift SoftmaxGrad onto cuDNN.  GpuDnnSoftmaxGrad works on 4d
    # tensors, so the 2d inputs are padded with two broadcastable
    # dimensions; anything that is not 2d is left alone.
    if not dnn_available(ctx_name):
        return
    padded = []
    for var in inputs:
        gvar = as_gpuarray_variable(var, ctx_name)
        if gvar.ndim != 2:
            return
        padded.append(gvar.dimshuffle(0, 'x', 1, 'x'))
    res = GpuDnnSoftmaxGrad('accurate', 'instance')(
        gpu_contiguous(padded[0]), gpu_contiguous(padded[1]))
    return [res.dimshuffle(0, 2)]
@register_opt('cudnn')
@local_optimizer([GpuCAReduceCuda])
def local_dnn_reduction(node):
    """Replace a GpuCAReduceCuda with a GpuDnnReduction when cuDNN v6+
    supports the requested reduction; otherwise do nothing."""
    if not isinstance(node.op, GpuCAReduceCuda):
        return

    inp = node.inputs[0]

    if not dnn_available(inp.type.context_name):
        return

    # cudnnReduceTensor only exists from cuDNN v6.
    if version(raises=False) < 6000:
        return

    if inp.ndim > 8:
        return

    out_dtype = node.outputs[0].dtype
    acc_dtype = node.op.acc_dtype

    if inp.dtype != out_dtype:
        # We can mix float16 and float32, but not float64.
        if 'float64' in (inp.dtype, out_dtype):
            return
        if acc_dtype != 'float32':
            return

    if inp.dtype not in ('float16', 'float32', 'float64'):
        return

    # Accumulator dtype constraints per input dtype (mirrors the
    # asserts in GpuDnnReduction.make_node).
    if inp.dtype == 'float64' and acc_dtype != 'float64':
        return
    if inp.dtype == 'float32' and acc_dtype != 'float32':
        return
    if inp.dtype == 'float16' and acc_dtype == 'float64':
        return

    if node.op.pre_scalar_op is not None:
        # Might want to handle absmax, avg, norm1, norm2 here
        return

    if not cudnn.cudnnReduceTensorOp_t.has_alias(node.op.scalar_op.name):
        return

    return (GpuDnnReduction(node.op.scalar_op.name, node.op.axis,
                            acc_dtype, node.op.dtype, False)(inp),)
class NoCuDNNRaise(Optimizer): class NoCuDNNRaise(Optimizer):
def apply(self, fgraph): def apply(self, fgraph):
...@@ -3004,24 +3149,6 @@ class NoCuDNNRaise(Optimizer): ...@@ -3004,24 +3149,6 @@ class NoCuDNNRaise(Optimizer):
gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn') gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')
@register_opt('cudnn', 'fast_compile')
@op_lifter([SoftmaxGrad])
@register_opt2([SoftmaxGrad], 'cudnn', 'fast_compile')
def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
    # Lift SoftmaxGrad (2d inputs only) to GpuDnnSoftmaxGrad, which
    # expects 4d tensors: insert two broadcastable dims around axis 1,
    # run the cuDNN op, then drop them again.
    if not dnn_available(ctx_name):
        return
    ins = []
    for n in inputs:
        n = as_gpuarray_variable(n, ctx_name)
        if n.ndim != 2:
            # Only the 2d case is handled; bail out otherwise.
            return
        ins.append(n.dimshuffle(0, 'x', 1, 'x'))
    out = GpuDnnSoftmaxGrad('accurate', 'instance')(
        gpu_contiguous(ins[0]), gpu_contiguous(ins[1]))
    return [out.dimshuffle(0, 2)]
def local_abstract_batch_norm_train_cudnn(op, ctx_name, inputs, outputs): def local_abstract_batch_norm_train_cudnn(op, ctx_name, inputs, outputs):
x, scale, bias, epsilon, running_average_factor = inputs[:5] x, scale, bias, epsilon, running_average_factor = inputs[:5]
running_mean = inputs[5] if len(inputs) > 5 else None running_mean = inputs[5] if len(inputs) > 5 else None
......
...@@ -94,6 +94,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img, ...@@ -94,6 +94,7 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err)); PyErr_Format(PyExc_RuntimeError, "could not set op descriptor %s", cudnnGetErrorString(err));
return 1;
} }
{ {
......
#section support_code_struct

/* cuDNN descriptors owned by this Op instance.  They are created in
   init_code_struct and destroyed in cleanup_code_struct. */
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnReduceTensorDescriptor_t APPLY_SPECIFIC(red);

#section init_code_struct

cudnnStatus_t APPLY_SPECIFIC(err);
/* NULL the descriptors first so that cleanup_code_struct is safe to run
   even if one of the creations below fails part-way. */
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(red) = NULL;

if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
               "(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
               "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
/* NOTE(review): the two message pieces below concatenate without a space,
   producing "descriptor(red)" in the error text. */
if ((APPLY_SPECIFIC(err) = cudnnCreateReduceTensorDescriptor(&APPLY_SPECIFIC(red))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate reduction descriptor"
               "(red): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}

#section cleanup_code_struct

if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input)); }
if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); }
if (APPLY_SPECIFIC(red) != NULL) { cudnnDestroyReduceTensorDescriptor(APPLY_SPECIFIC(red)); }
#section support_code_struct

/* Run the cuDNN reduction described by `params` on `input`, writing the
 * reduced values into *output and, when `indices` is non-NULL, the
 * flattened indices of the selected elements into *indices.
 * Returns 0 on success, 1 on error (with a Python exception set).
 * `params->c_axis` is a bitfield: bit i set means axis i is reduced. */
int APPLY_SPECIFIC(dnn_redux)(PyGpuArrayObject *input,
                              PyGpuArrayObject **output,
                              PyGpuArrayObject **indices,
                              PARAMS_TYPE* params) {
  PyGpuContextObject *c = input->context;
  gpudata *workspace = NULL;
  size_t worksize = 0;
  size_t indsize = 0;
  size_t *tdims;
  ssize_t *tstrs;
  size_t dims[8];   /* cuDNN reductions are limited to 8 dimensions */
  ssize_t strs[8];
  size_t rsz;       /* total number of elements in the reduced area */
  void *alpha;
  void *beta;
  cudnnStatus_t err;
  unsigned int p;   /* number of non-reduced (kept) dimensions */
  int e;
  static float falpha = 1.0f;
  static double dalpha = 1.0;
  static float fbeta = 0.0f;
  static double dbeta = 0.0;

  if (!GpuArray_IS_C_CONTIGUOUS(&input->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
    return 1;
  }

  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) != 0)
    return 1;

  /* Collect the kept dimensions (the output shape) and the size of the
     reduced area. */
  p = 0;
  rsz = 1;
  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
    if (!(params->c_axis & (1U << i))) {
      dims[p] = PyGpuArray_DIM(input, i);
      p++;
    } else {
      rsz *= PyGpuArray_DIM(input, i);
    }
  }

  /* `indices` is NULL when the caller did not request indices. */
  if (indices != NULL) {
    if (theano_prep_output(indices, p, dims, GA_UINT, GA_C_ORDER, c) != 0)
      return 1;
    indsize = PyGpuArray_SIZE(*indices);
  }

  /* Degenerate cases: nothing is actually reduced (no axis selected, or
     the reduced area has size 1).  A copy + reshape is enough. */
  if (p == input->ga.nd || rsz == 1) {
    int err;
    Py_XDECREF(*output);
    *output = pygpu_copy(input, GA_C_ORDER);
    if (*output == NULL)
      return 1;
    err = GpuArray_reshape_inplace(&(*output)->ga, p, dims, GA_C_ORDER);
    if (err != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "GpuArray_reshape_inplace: %s", GpuArray_error(&(*output)->ga, err));
      return 1;
    }
    if (indices != NULL) {
      // All indices will be 0 since the size of the reduced area is 1.
      err = GpuArray_memset(&(*indices)->ga, 0);
      if (err != GA_NO_ERROR) {
        PyErr_Format(PyExc_RuntimeError, "GpuArray_memset: %s", GpuArray_error(&(*indices)->ga, err));
        return 1;
      }
    }
    // This is a shortcut path.
    return 0;
  }

  if (theano_prep_output(output, p, dims, input->ga.typecode,
                         GA_C_ORDER, c) != 0)
    return 1;

  // cuDNN expects that the output has the same number of dimensions as
  // the input, but the dimensions to reduce are of size 1 in the output.
  // We have to do some trickery to be able to pass it what it needs.
  /* Reduced axes get dim 1 / stride 0; kept axes get the real dim and the
     corresponding stride of the freshly prepared output.
     NOTE(review): strides here are the raw values stored by libgpuarray
     (presumably bytes); c_set_tensorNd is assumed to convert to whatever
     cuDNN expects -- confirm against its definition. */
  p = 0;
  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
    if (params->c_axis & (1U << i)) {
      dims[i] = 1;
      strs[i] = 0;
    } else {
      dims[i] = PyGpuArray_DIM(input, i);
      strs[i] = PyGpuArray_STRIDE(*output, p);
      p++;
    }
  }

  // Perform horrible surgery to be able to reuse c_set_tensorNd():
  // temporarily make the output array look like it has the input's rank,
  // with our synthetic dims/strides.
  tdims = (*output)->ga.dimensions;
  tstrs = (*output)->ga.strides;
  (*output)->ga.dimensions = dims;
  (*output)->ga.strides = strs;
  (*output)->ga.nd = input->ga.nd;

  // Delay error checking to avoid exposing a broken object
  e = c_set_tensorNd(*output, APPLY_SPECIFIC(output));

  // Undo our horrible surgery (p is back to the number of kept dims here).
  (*output)->ga.nd = p;
  (*output)->ga.dimensions = tdims;
  (*output)->ga.strides = tstrs;

  if (e != 0)
    return 1;

  // Back to normal, no more horrible things
  // Note that only CUDNN_32BIT_INDICES is implemented
  err = cudnnSetReduceTensorDescriptor(
    APPLY_SPECIFIC(red), params->red_op,
    params->acc_dtype, CUDNN_PROPAGATE_NAN,
    indices == NULL ? CUDNN_REDUCE_TENSOR_NO_INDICES : CUDNN_REDUCE_TENSOR_FLATTENED_INDICES,
    CUDNN_32BIT_INDICES);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not set reduce descriptor: %s",
                 cudnnGetErrorString(err));
    return 1;
  }

  /* cuDNN takes alpha/beta in float for float/half tensors and in double
     for double tensors. */
  switch (input->ga.typecode) {
  case GA_FLOAT:
  case GA_HALF:
    alpha = &falpha;
    beta = &fbeta;
    break;
  case GA_DOUBLE:
    alpha = &dalpha;
    beta = &dbeta;
    break;
  default:
    PyErr_SetString(PyExc_RuntimeError, "Unsupported dtype in dnn reduce");
    return 1;
  }

  err = cudnnGetReductionWorkspaceSize(params->handle,
                                       APPLY_SPECIFIC(red),
                                       APPLY_SPECIFIC(input),
                                       APPLY_SPECIFIC(output),
                                       &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not get reduce workspace size: %s",
                 cudnnGetErrorString(err));
    return 1;
  }

  if (worksize != 0) {
    workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, &e);
    if (workspace == NULL) {
      PyErr_Format(PyExc_RuntimeError, "gpudata_alloc: %s",
                   gpucontext_error(c->ctx, e));
      return 1;
    }
  }

  /* NOTE(review): `*((void **)workspace)` reads the device pointer out of
     the opaque gpudata struct -- this relies on the libgpuarray CUDA
     backend storing it as the first field; confirm against libgpuarray. */
  err = cudnnReduceTensor(params->handle, APPLY_SPECIFIC(red),
                          indices ? PyGpuArray_DEV_DATA(*indices) : NULL, indsize,
                          worksize ? *((void **)workspace) : NULL, worksize,
                          alpha,
                          APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
                          beta,
                          APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output));

  /* Release the workspace before checking the status so it is not leaked
     on the error path. */
  if (workspace != NULL)
    gpudata_release(workspace);

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not run reduction: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
...@@ -1119,10 +1119,20 @@ def local_gpua_careduce(op, context_name, inputs, outputs): ...@@ -1119,10 +1119,20 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
else: else:
return False return False
x, = inputs x, = inputs
idtype = x.dtype
adtype = getattr(op, 'acc_dtype', None)
odtype = getattr(op, 'dtype', outputs[0].dtype)
# Force accumulator to float32 for float32 inputs since tree
# reduction will not loose as much precision as linear
# accumulation and float64 is much slower on GPU.
if idtype == 'float32' and odtype == 'float32':
adtype = 'float32'
greduce = op2( greduce = op2(
op.scalar_op, axis=op.axis, op.scalar_op, axis=op.axis,
dtype=getattr(op, 'dtype', outputs[0].dtype), dtype=odtype,
acc_dtype=getattr(op, 'acc_dtype', None)) acc_dtype=adtype)
gvar = greduce(x) gvar = greduce(x)
# We need to have the make node called, otherwise the mask can # We need to have the make node called, otherwise the mask can
# be None # be None
......
...@@ -1485,6 +1485,27 @@ class test_SoftMax(test_nnet.test_SoftMax): ...@@ -1485,6 +1485,27 @@ class test_SoftMax(test_nnet.test_SoftMax):
utt.assert_allclose(f(inp), f_ref(inp)) utt.assert_allclose(f(inp), f_ref(inp))
def dnn_reduction(nd, idtype, acc_dtype, odtype):
    # Compile a sum over an nd-dimensional tensor with the given dtypes
    # and check that the optimizer introduced a GpuDnnReduction node.
    x = T.TensorType(idtype, (False,) * nd)()
    total = x.sum(acc_dtype=acc_dtype, dtype=odtype)
    fn = theano.function([x], total, mode=mode_with_gpu)
    nodes = fn.maker.fgraph.apply_nodes
    assert any(isinstance(a.op, dnn.GpuDnnReduction) for a in nodes)
def test_dnn_reduction_opt():
    # Generator test: each yielded tuple is run as one sub-test.
    if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 6000:
        raise SkipTest(dnn.dnn_available.msg)
    # Same-precision float32 sums for every supported rank (1..8).
    for nd in range(1, 9):
        yield dnn_reduction, nd, 'float32', 'float32', 'float32'
    # A few mixed-precision combinations on 2d inputs.
    mixed_cases = (('float64', 'float64', 'float64'),
                   ('float16', 'float32', 'float16'),
                   ('float16', 'float32', 'float32'))
    for idtype, adtype, odtype in mixed_cases:
        yield dnn_reduction, 2, idtype, adtype, odtype
def test_dnn_batchnorm_train(): def test_dnn_batchnorm_train():
if not dnn.dnn_available(test_ctx_name): if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
......
...@@ -16,8 +16,10 @@ from .config import mode_with_gpu, mode_without_gpu, test_ctx_name ...@@ -16,8 +16,10 @@ from .config import mode_with_gpu, mode_without_gpu, test_ctx_name
from .test_basic_ops import rand_gpuarray from .test_basic_ops import rand_gpuarray
from ..elemwise import (GpuElemwise, GpuDimShuffle, from ..elemwise import (GpuElemwise, GpuDimShuffle,
GpuCAReduceCuda, GpuCAReduceCPY, GpuErfinv, GpuErfcinv) GpuCAReduceCuda, GpuCAReduceCPY, GpuErfinv, GpuErfcinv)
from ..dnn import GpuDnnReduction
from ..type import GpuArrayType, get_context, gpuarray_shared_constructor from ..type import GpuArrayType, get_context, gpuarray_shared_constructor
from pygpu import ndgpuarray as gpuarray from pygpu import ndgpuarray as gpuarray
...@@ -346,7 +348,9 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY): ...@@ -346,7 +348,9 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
class T_gpureduce_dtype(test_elemwise.T_reduce_dtype): class T_gpureduce_dtype(test_elemwise.T_reduce_dtype):
mode = mode_with_gpu.excluding('local_cut_useless_reduce') mode = mode_with_gpu.excluding('local_cut_useless_reduce')
op = GpuCAReduceCuda
# GpuDnnReduction doesn't cover all cases, but should cover some
op = (GpuCAReduceCuda, GpuDnnReduction)
# Currently we don't support reduction on 0 axis # Currently we don't support reduction on 0 axis
axes = [None, 0, 1, 1, [0], [1], [0, 1]] axes = [None, 0, 1, 1, [0], [1], [0, 1]]
# We don't support complex dtype # We don't support complex dtype
......
...@@ -17,6 +17,7 @@ from ..basic_ops import ( ...@@ -17,6 +17,7 @@ from ..basic_ops import (
from ..blas import GpuGemm from ..blas import GpuGemm
from ..elemwise import ( from ..elemwise import (
GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise, Elemwise, max_inputs_to_GpuElemwise) GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise, Elemwise, max_inputs_to_GpuElemwise)
from ..dnn import GpuDnnReduction
from ..subtensor import GpuSubtensor from ..subtensor import GpuSubtensor
from ..linalg import GpuCusolverSolve, cusolver_available, GpuCholesky from ..linalg import GpuCusolverSolve, cusolver_available, GpuCholesky
...@@ -130,9 +131,13 @@ def test_reduce(): ...@@ -130,9 +131,13 @@ def test_reduce():
ops = [type(node.op) for node in topo] ops = [type(node.op) for node in topo]
if kind == b'opencl' and method in ["max", "min"]: if kind == b'opencl' and method in ["max", "min"]:
assert not(GpuCAReduceCuda in ops or GpuCAReduceCPY in ops) assert not(GpuCAReduceCuda in ops or
GpuCAReduceCPY in ops or
GpuDnnReduction in ops)
else: else:
assert GpuCAReduceCuda in ops or GpuCAReduceCPY in ops assert (GpuCAReduceCuda in ops or
GpuCAReduceCPY in ops or
GpuDnnReduction in ops)
def test_local_gpualloc_memset_0(): def test_local_gpualloc_memset_0():
......
File mode changed from 100755 to 100644
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论