Commit 921a0df6
Authored June 21, 2017 by Arnaud Bergeron

    Add GpuDnnReduction.

Parent: ad61c7e4

Showing 5 changed files with 375 additions and 25 deletions (+375 −25)
theano/gpuarray/cudnn_defs.py            +20  −3
theano/gpuarray/dnn.py                   +136 −20
theano/gpuarray/dnn_redux.c              +190 −0
theano/gpuarray/tests/test_dnn.py        +24  −1
theano/gpuarray/tests/test_elemwise.py   +5   −1
theano/gpuarray/cudnn_defs.py

@@ -34,9 +34,6 @@ class CuDNNV51(object):
     cudnnDataType_t = CEnumType(('CUDNN_DATA_FLOAT', 'float32'),
                                 ('CUDNN_DATA_DOUBLE', 'float64'),
                                 ('CUDNN_DATA_HALF', 'float16'),
-                                # CUDNN_DATA_INT8 # new in v6
-                                # CUDNN_DATA_INT32 # new in v6
-                                # CUDNN_DATA_INT8x4 # new in v6
                                 ctype='cudnnDataType_t')
 
     cudnnConvolutionFwdAlgo_t = CEnumType(('CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM', 'none'),
@@ -96,6 +93,16 @@ class CuDNNV51(object):
 class CuDNNV6(CuDNNV51):
     version = 6
 
+    cudnnDataType_t = CEnumType(('CUDNN_DATA_FLOAT', 'float32'),
+                                ('CUDNN_DATA_DOUBLE', 'float64'),
+                                ('CUDNN_DATA_HALF', 'float16'),
+                                # new in v6
+                                ('CUDNN_DATA_INT8', 'int8'),
+                                ('CUDNN_DATA_INT32', 'int32'),
+                                # Also in v6, but restrictions make this fail
+                                # CUDNN_DATA_INT8x4
+                                ctype='cudnnDataType_t')
+
     cudnnPoolingMode_t = CEnumType(('CUDNN_POOLING_MAX', 'max'),
                                    ('CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING', 'average_inc_pad'),
                                    ('CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING', 'average_exc_pad'),
@@ -115,6 +122,16 @@ class CuDNNV6(CuDNNV51):
         ('CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING', 'fft_tiling'),
         ctype='cudnnConvolutionBwdFilterAlgo_t')
 
+    cudnnReduceTensorOp_t = CEnumType(('CUDNN_REDUCE_TENSOR_ADD', 'add'),
+                                      ('CUDNN_REDUCE_TENSOR_MUL', 'mul'),
+                                      ('CUDNN_REDUCE_TENSOR_MIN', 'minimum'),
+                                      ('CUDNN_REDUCE_TENSOR_MAX', 'maximum'),
+                                      ('CUDNN_REDUCE_TENSOR_AMAX', 'absmax'),
+                                      ('CUDNN_REDUCE_TENSOR_AVG', 'avg'),
+                                      ('CUDNN_REDUCE_TENSOR_NORM1', 'norm1'),
+                                      ('CUDNN_REDUCE_TENSOR_NORM2', 'norm2'),
+                                      ctype='cudnnReduceTensorOp_t')
+
 def get_definitions(cudnn_version=None):
     """
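Each CEnumType above maps a cuDNN enum constant to the short alias used from Python code; dnn.py queries this mapping with cudnn.cudnnReduceTensorOp_t.has_alias(...). A minimal sketch of the alias idea, with a plain dict standing in for the real CEnumType (which also generates C-level enum support):

```python
# Hypothetical stand-in for CEnumType's alias mapping, for illustration only.
REDUCE_OPS = {
    'add': 'CUDNN_REDUCE_TENSOR_ADD',
    'mul': 'CUDNN_REDUCE_TENSOR_MUL',
    'minimum': 'CUDNN_REDUCE_TENSOR_MIN',
    'maximum': 'CUDNN_REDUCE_TENSOR_MAX',
    'absmax': 'CUDNN_REDUCE_TENSOR_AMAX',
    'avg': 'CUDNN_REDUCE_TENSOR_AVG',
    'norm1': 'CUDNN_REDUCE_TENSOR_NORM1',
    'norm2': 'CUDNN_REDUCE_TENSOR_NORM2',
}

def has_alias(name):
    # Mirrors how dnn.py uses cudnn.cudnnReduceTensorOp_t.has_alias(red_op).
    return name in REDUCE_OPS

assert has_alias('maximum') and not has_alias('argmax')
```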
theano/gpuarray/dnn.py

@@ -6,10 +6,12 @@ import warnings
 import numpy as np
 from six import integer_types
+from six.moves import reduce
 
 import theano
 from theano import Op, Apply, tensor, config, Variable
-from theano.scalar import as_scalar, constant, Log, get_scalar_type, int32 as int_t, bool as bool_t
+from theano.scalar import (as_scalar, constant, Log, get_scalar_type,
+                           int32 as int_t, bool as bool_t, uint32 as uint32_t)
 from theano.tensor import as_tensor_variable
 from theano.gradient import DisconnectedType, grad_not_implemented
 from theano.gof import Optimizer, local_optimizer, COp, ParamsType, EnumList
@@ -34,7 +36,7 @@ from .type import (get_context, gpu_context_type, list_contexts,
 from .basic_ops import (as_gpuarray_variable, infer_context_name,
                         gpu_contiguous, GpuAllocEmpty,
                         empty_like, GpuArrayType, HostFromGpu)
-from .elemwise import GpuElemwise
+from .elemwise import GpuElemwise, GpuCAReduceCuda
 # These don't exist in gpuarray
 # GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
@@ -1557,6 +1559,71 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
         return Apply(self, [dy, sm], [sm.type()])
 
 
+class GpuDnnReduction(DnnBase):
+    check_input = False
+    _f16_ok = True
+    _cop_num_outputs = 2
+
+    __props__ = ('red_op', 'axis', 'arg', 'acc_dtype', 'dtype')
+
+    params_type = ParamsType(red_op=cudnn.cudnnReduceTensorOp_t,
+                             acc_dtype=cudnn.cudnnDataType_t,
+                             axis=uint32_t,
+                             handle=handle_type)
+
+    def __init__(self, red_op, axis, acc_dtype, dtype, arg):
+        DnnBase.__init__(self, ['dnn_redux.c'], 'APPLY_SPECIFIC(dnn_redux)')
+        assert cudnn.cudnnReduceTensorOp_t.has_alias(red_op)
+        self.red_op = red_op
+        assert acc_dtype in ['float16', 'float32', 'float64']
+        self.acc_dtype = acc_dtype
+        assert dtype in ['float16', 'float32', 'float64']
+        self.dtype = dtype
+        # 8 is the current limit for cudnn
+        if axis is not None:
+            if len(axis) > 8:
+                raise ValueError('Too many axes to reduce on')
+            if any(a >= 8 for a in axis):
+                raise ValueError('Axes larger than 8 not supported')
+        self.axis = self._convert_axis(axis)
+        if arg and (red_op != 'max' and red_op != 'min'):
+            raise ValueError("Can't request indices for something other than min or max")
+        self.arg = arg
+
+    def _convert_axis(self, axis):
+        if axis is None:
+            return np.uint32(-1)
+        else:
+            return reduce(lambda a, b: a | b,
+                          map(lambda a: 1 << a, axis), 0)
+
+    def make_node(self, inp):
+        ctx_name = infer_context_name(inp)
+        inp = as_gpuarray_variable(inp, ctx_name)
+        if inp.ndim > 8:
+            raise ValueError("cuDNN reduction doesn't support nd > 8")
+        assert inp.dtype in ['float16', 'float32', 'float64']
+
+        # These restrictions were guessed from vague clues since
+        # there is no actual documentation on this
+        if inp.dtype == 'float64':
+            assert self.acc_dtype == 'float64'
+        if inp.dtype == 'float32':
+            assert self.acc_dtype == 'float32'
+        if inp.dtype == 'float16':
+            assert self.acc_dtype != 'float64'
+
+        bcast = []
+        for i in range(inp.ndim):
+            if not (self.axis & (1 << i)):
+                bcast.append(inp.broadcastable[i])
+        outs = [inp.type.clone(dtype=self.dtype, broadcastable=bcast)()]
+        if self.arg:
+            outs.append(GpuArrayType(dtype='uint32', broadcastable=bcast,
+                                     context_name=ctx_name)())
+        return Apply(self, [inp], outs)
+
+
 class GpuDnnBatchNorm(DnnBase):
     """
     Base Op for cuDNN Batch Normalization.
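GpuDnnReduction._convert_axis packs the reduced axes into a single uint32 bitmask: bit i set means axis i is reduced, and np.uint32(-1) (all bits set) means reduce over everything, matching the uint32_t axis field in params_type. A standalone sketch of the encoding:

```python
import numpy as np
from functools import reduce

def convert_axis(axis):
    # Same encoding as GpuDnnReduction._convert_axis: one bit per reduced axis.
    if axis is None:
        return np.uint32(-1)  # all bits set: reduce over every axis
    return reduce(lambda a, b: a | b, (1 << a for a in axis), 0)

assert convert_axis((0, 2)) == 0b101
assert convert_axis(None) == 0xFFFFFFFF
# The C side then tests (params->axis & (1U << i)) for each dimension i.
```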
@@ -2985,6 +3052,73 @@ def local_gpua_logsoftmax_to_dnn(op, ctx_name, inputs, outputs):
     return [out.dimshuffle(0, 1)]
 
 
+@register_opt('cudnn', 'fast_compile')
+@op_lifter([SoftmaxGrad])
+@register_opt2([SoftmaxGrad], 'cudnn', 'fast_compile')
+def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
+    if not dnn_available(ctx_name):
+        return
+    ins = []
+    for n in inputs:
+        n = as_gpuarray_variable(n, ctx_name)
+        if n.ndim != 2:
+            return
+        ins.append(n.dimshuffle(0, 'x', 1, 'x'))
+
+    out = GpuDnnSoftmaxGrad('accurate', 'instance')(
+        gpu_contiguous(ins[0]), gpu_contiguous(ins[1]))
+    return [out.dimshuffle(0, 2)]
+
+
+@register_opt('cudnn')
+@local_optimizer([GpuCAReduceCuda])
+def local_dnn_reduction(node):
+    if not isinstance(node.op, GpuCAReduceCuda):
+        return
+
+    if not dnn_available(node.inputs[0].type.context_name):
+        return
+
+    if version(raises=False) < 6000:
+        return
+
+    if node.inputs[0].dtype != node.outputs[0].dtype:
+        # We can mix float16 and float32, but not float64.
+        if (node.inputs[0].dtype == 'float64' or
+                node.outputs[0].dtype == 'float64'):
+            return
+        if node.op.acc_dtype != 'float32':
+            return
+
+    if node.inputs[0].dtype not in ['float16', 'float32', 'float64']:
+        return
+
+    if (node.inputs[0].dtype == 'float64' and
+            node.op.acc_dtype != 'float64'):
+        return
+
+    if (node.inputs[0].dtype == 'float32' and
+            node.op.acc_dtype != 'float32'):
+        return
+
+    if (node.inputs[0].dtype == 'float16' and
+            node.op.acc_dtype == 'float64'):
+        return
+
+    if node.op.pre_scalar_op != None:
+        # Might want to handle absmax, avg, norm1, norm2 here
+        return
+
+    if not cudnn.cudnnReduceTensorOp_t.has_alias(node.op.scalar_op.name):
+        return
+
+    return (GpuDnnReduction(node.op.scalar_op.name, node.op.axis,
+                            node.op.acc_dtype, node.op.dtype,
+                            False)(node.inputs[0]),)
+
+
 class NoCuDNNRaise(Optimizer):
     def apply(self, fgraph):
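local_dnn_reduction only rewrites a GpuCAReduceCuda node when cuDNN >= 6 is available, the input/accumulator/output dtype combination is one cuDNN accepts, there is no pre_scalar_op, and the scalar op has a cuDNN alias. A hedged sketch of watching the rewrite fire, essentially what the new test in test_dnn.py does (mode_with_gpu comes from the gpuarray test config and presumes a working GPU plus cuDNN 6 setup):

```python
import theano
import theano.tensor as T
from theano.gpuarray.dnn import GpuDnnReduction
from theano.gpuarray.tests.config import mode_with_gpu  # assumes GPU test setup

x = T.fmatrix('x')
f = theano.function([x], x.sum(axis=1), mode=mode_with_gpu)
# A float32 sum with a float32 accumulator satisfies all the checks above,
# so with cuDNN >= 6 the reduction node should have been rewritten.
assert any(isinstance(n.op, GpuDnnReduction)
           for n in f.maker.fgraph.apply_nodes)
```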
@@ -3004,24 +3138,6 @@ class NoCuDNNRaise(Optimizer):
 gpu_seqopt.register("NoCuDNNRaise", NoCuDNNRaise(), 0, 'cudnn')
 
-@register_opt('cudnn', 'fast_compile')
-@op_lifter([SoftmaxGrad])
-@register_opt2([SoftmaxGrad], 'cudnn', 'fast_compile')
-def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
-    if not dnn_available(ctx_name):
-        return
-    ins = []
-    for n in inputs:
-        n = as_gpuarray_variable(n, ctx_name)
-        if n.ndim != 2:
-            return
-        ins.append(n.dimshuffle(0, 'x', 1, 'x'))
-
-    out = GpuDnnSoftmaxGrad('accurate', 'instance')(
-        gpu_contiguous(ins[0]), gpu_contiguous(ins[1]))
-    return [out.dimshuffle(0, 2)]
-
-
 def local_abstract_batch_norm_train_cudnn(op, ctx_name, inputs, outputs):
     x, scale, bias, epsilon, running_average_factor = inputs[:5]
     running_mean = inputs[5] if len(inputs) > 5 else None
theano/gpuarray/dnn_redux.c (new file, 0 → 100644)

#section support_code_struct

cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnReduceTensorDescriptor_t APPLY_SPECIFIC(red);

#section init_code_struct

cudnnStatus_t APPLY_SPECIFIC(err);
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(red) = NULL;

if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
               "(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}

if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
               "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}

if ((APPLY_SPECIFIC(err) = cudnnCreateReduceTensorDescriptor(&APPLY_SPECIFIC(red))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate reduction descriptor "
               "(red): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}

#section cleanup_code_struct

if (APPLY_SPECIFIC(input) != NULL) {
  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input));
}

if (APPLY_SPECIFIC(output) != NULL) {
  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
}

if (APPLY_SPECIFIC(red) != NULL) {
  cudnnDestroyReduceTensorDescriptor(APPLY_SPECIFIC(red));
}

#section support_code_struct

int APPLY_SPECIFIC(dnn_redux)(PyGpuArrayObject *input,
                              PyGpuArrayObject **output,
                              PyGpuArrayObject **indices,
                              PARAMS_TYPE *params) {
  PyGpuContextObject *c = input->context;
  gpudata *workspace = NULL;
  size_t worksize = 0;
  size_t indsize = 0;
  size_t *tdims;
  ssize_t *tstrs;
  size_t dims[8];
  ssize_t strs[8];
  void *alpha;
  void *beta;
  cudnnStatus_t err;
  unsigned int p;
  int e;
  static float falpha = 1.0f;
  static double dalpha = 1.0;
  static float fbeta = 0.0f;
  static double dbeta = 0.0;

  if (!GpuArray_IS_C_CONTIGUOUS(&input->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
    return 1;
  }

  if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) != 0)
    return 1;

  p = 0;
  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
    if (!(params->axis & (1U << i))) {
      dims[p] = PyGpuArray_DIM(input, i);
      p++;
    }
  }

  if (theano_prep_output(output, p, dims, input->ga.typecode,
                         GA_C_ORDER, c) != 0)
    return 1;

  if (indices != NULL) {
    if (theano_prep_output(indices, p, dims, GA_UINT, GA_C_ORDER, c) != 0)
      return 1;
    indsize = PyGpuArray_SIZE(*indices);
  }

  // cuDNN expects the output to have the same number of dimensions as the
  // input, but with the reduced dimensions of size 1 in the output.
  // We have to do some trickery to be able to pass it what it needs.
  p = 0;
  for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
    if (params->axis & (1U << i)) {
      dims[i] = 1;
      strs[i] = 0;
    } else {
      dims[i] = PyGpuArray_DIM(input, i);
      strs[i] = PyGpuArray_STRIDE(*output, p);
      p++;
    }
  }

  // Perform horrible surgery to be able to reuse c_set_tensorNd()
  tdims = (*output)->ga.dimensions;
  tstrs = (*output)->ga.strides;
  (*output)->ga.dimensions = dims;
  (*output)->ga.strides = strs;
  (*output)->ga.nd = input->ga.nd;

  // Delay error checking to avoid exposing a broken object
  e = c_set_tensorNd(*output, APPLY_SPECIFIC(output));

  // Undo our horrible surgery
  (*output)->ga.nd = p;
  (*output)->ga.dimensions = tdims;
  (*output)->ga.strides = tstrs;

  if (e != 0)
    return 1;

  // Back to normal, no more horrible things

  // Note that only CUDNN_32BIT_INDICES is implemented
  err = cudnnSetReduceTensorDescriptor(
      APPLY_SPECIFIC(red), params->red_op, params->acc_dtype,
      CUDNN_PROPAGATE_NAN,
      indices == NULL ? CUDNN_REDUCE_TENSOR_NO_INDICES : CUDNN_REDUCE_TENSOR_FLATTENED_INDICES,
      CUDNN_32BIT_INDICES);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not set reduce descriptor: %s",
                 cudnnGetErrorString(err));
    return 1;
  }

  switch (input->ga.typecode) {
  case GA_FLOAT:
  case GA_HALF:
    alpha = &falpha;
    beta = &fbeta;
    break;
  case GA_DOUBLE:
    alpha = &dalpha;
    beta = &dbeta;
    break;
  default:
    PyErr_SetString(PyExc_RuntimeError, "Unsupported dtype in dnn reduce");
    return 1;
  }

  err = cudnnGetReductionWorkspaceSize(params->handle, APPLY_SPECIFIC(red),
                                       APPLY_SPECIFIC(input),
                                       APPLY_SPECIFIC(output), &worksize);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not get reduce workspace size: %s",
                 cudnnGetErrorString(err));
    return 1;
  }

  if (worksize != 0) {
    workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, &e);
    if (workspace == NULL) {
      PyErr_Format(PyExc_RuntimeError, "gpudata_alloc: %s",
                   gpucontext_error(c->ctx, e));
      return 1;
    }
  }

  err = cudnnReduceTensor(
      params->handle, APPLY_SPECIFIC(red),
      indices ? PyGpuArray_DEV_DATA(*indices) : NULL, indsize,
      worksize ? *((void **)workspace) : NULL, worksize,
      alpha, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
      beta, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output));

  if (workspace != NULL)
    gpudata_release(workspace);

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not run reduction: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
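The "horrible surgery" above exists because cudnnReduceTensor expects the output descriptor to have the same rank as the input, with reduced dimensions of size 1 (keepdims semantics), while the Theano output drops those dimensions. The equivalent shapes in numpy terms:

```python
import numpy as np

x = np.ones((4, 5, 6), dtype=np.float32)
kept = x.sum(axis=1, keepdims=True)  # shape (4, 1, 6): the view cuDNN is given
out = x.sum(axis=1)                  # shape (4, 6): what the op actually returns
assert np.array_equal(kept.squeeze(axis=1), out)
# dnn_redux.c fakes the keepdims view by temporarily swapping the output's
# dims/strides arrays (size 1, stride 0 on reduced axes) before calling
# c_set_tensorNd(), then restores them.
```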
theano/gpuarray/tests/test_dnn.py

@@ -16,9 +16,11 @@ from theano.tensor.signal.pool import Pool, MaxPoolGrad, AveragePoolGrad
 from theano.tensor.nnet.abstract_conv import get_conv_output_shape, get_conv_gradinputs_shape
 from theano.tensor.nnet import bn
+from theano.tensor.tests import test_elemwise
 
 from .. import dnn
 from ..basic_ops import GpuAllocEmpty
-from ..type import gpuarray_shared_constructor
+from ..type import gpuarray_shared_constructor, get_context
 from .config import mode_with_gpu, mode_without_gpu, test_ctx_name, ref_cast
 from . import test_nnet
@@ -1428,6 +1430,27 @@ class test_SoftMax(test_nnet.test_SoftMax):
         utt.assert_allclose(f(inp), f_ref(inp))
 
 
+def dnn_reduction(nd, idtype, acc_dtype, odtype):
+    inp = T.TensorType(idtype, (False,) * nd)()
+    res = inp.sum(acc_dtype=acc_dtype, dtype=odtype)
+    f = theano.function([inp], res, mode=mode_with_gpu)
+    assert any(isinstance(n.op, dnn.GpuDnnReduction)
+               for n in f.maker.fgraph.apply_nodes)
+
+
+def test_dnn_reduction_opt():
+    if not dnn.dnn_available(test_ctx_name):
+        raise SkipTest(dnn.dnn_available.msg)
+    for nd in range(1, 9):
+        yield dnn_reduction, nd, 'float32', 'float32', 'float32'
+    for idtype, adtype, odtype in (('float64', 'float64', 'float64'),
+                                   ('float16', 'float32', 'float16'),
+                                   ('float16', 'float32', 'float32')):
+        yield dnn_reduction, 2, idtype, adtype, odtype
+
+
 def test_dnn_batchnorm_train():
     if not dnn.dnn_available(test_ctx_name):
         raise SkipTest(dnn.dnn_available.msg)
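dnn_reduction and test_dnn_reduction_opt are nose-style test generators: each yield becomes one parametrized case. The (False,) * nd broadcastable pattern passed to T.TensorType simply marks every dimension as non-broadcastable; for nd=2 the input is an ordinary matrix:

```python
import theano.tensor as T

# (False, False) marks both dimensions non-broadcastable, so this is an
# ordinary matrix variable with dtype 'float16'.
inp = T.TensorType('float16', (False, False))()
assert inp.ndim == 2 and inp.dtype == 'float16'
```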
theano/gpuarray/tests/test_elemwise.py

@@ -16,8 +16,10 @@ from .config import mode_with_gpu, mode_without_gpu, test_ctx_name
 from .test_basic_ops import rand_gpuarray
 from ..elemwise import (GpuElemwise, GpuDimShuffle,
                         GpuCAReduceCuda, GpuCAReduceCPY, GpuErfinv, GpuErfcinv)
+from ..dnn import GpuDnnReduction
 from ..type import GpuArrayType, get_context, gpuarray_shared_constructor
 from pygpu import ndgpuarray as gpuarray
@@ -346,7 +348,9 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
 class T_gpureduce_dtype(test_elemwise.T_reduce_dtype):
     mode = mode_with_gpu.excluding('local_cut_useless_reduce')
-    op = GpuCAReduceCuda
+    # GpuDnnReduction doesn't cover all cases, but should cover some
+    op = (GpuCAReduceCuda, GpuDnnReduction)
     # Currently we don't support reduction on 0 axis
     axes = [None, 0, 1, 1, [0], [1], [0, 1]]
     # We don't support complex dtype
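Assigning a tuple to op works on the assumption that the T_reduce_dtype harness validates graph nodes with isinstance(node.op, self.op), since isinstance accepts a tuple of classes; a self-contained illustration with stand-in classes:

```python
class GpuCAReduceCuda:      # stand-in for the real op class
    pass

class GpuDnnReduction:      # stand-in for the real op class
    pass

op = (GpuCAReduceCuda, GpuDnnReduction)
# isinstance with a tuple matches any listed class, so the check passes no
# matter which reduction op the optimizer left in the graph.
assert isinstance(GpuCAReduceCuda(), op)
assert isinstance(GpuDnnReduction(), op)
```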