提交 ebf8f12a authored 作者: Pascal Lamblin's avatar Pascal Lamblin 提交者: GitHub

Merge pull request #6116 from abergeron/dnn_redux2

Use GpuDnnReduction to replace GpuMaxAndArgmax when possible.
...@@ -12,7 +12,7 @@ import theano ...@@ -12,7 +12,7 @@ import theano
from theano import Op, Apply, tensor, config, Variable from theano import Op, Apply, tensor, config, Variable
from theano.scalar import (as_scalar, constant, Log, get_scalar_type, from theano.scalar import (as_scalar, constant, Log, get_scalar_type,
int32 as int_t, bool as bool_t, uint32 as uint32_t) int32 as int_t, bool as bool_t, uint32 as uint32_t)
from theano.tensor import as_tensor_variable from theano.tensor import as_tensor_variable, Argmax
from theano.gradient import DisconnectedType, grad_not_implemented from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp, ParamsType, EnumList from theano.gof import Optimizer, local_optimizer, COp, ParamsType, EnumList
from theano.gof.cmodule import GCC_compiler from theano.gof.cmodule import GCC_compiler
...@@ -37,6 +37,7 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name, ...@@ -37,6 +37,7 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
gpu_contiguous, GpuAllocEmpty, gpu_contiguous, GpuAllocEmpty,
empty_like, GpuArrayType, HostFromGpu) empty_like, GpuArrayType, HostFromGpu)
from .elemwise import GpuElemwise, GpuCAReduceCuda from .elemwise import GpuElemwise, GpuCAReduceCuda
from .reduction import GpuMaxAndArgmax
# These don't exist in gpuarray # These don't exist in gpuarray
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad # GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
...@@ -1592,8 +1593,9 @@ class GpuDnnReduction(DnnBase): ...@@ -1592,8 +1593,9 @@ class GpuDnnReduction(DnnBase):
self.c_axis = self._convert_axis(axis) self.c_axis = self._convert_axis(axis)
# axis is a list of axes to reduce on # axis is a list of axes to reduce on
self.axis = axis self.axis = axis
if return_indices and (red_op != 'max' and red_op != 'min'): if return_indices and (red_op != 'maximum' and red_op != 'minimum'):
raise ValueError("Can't request indices for something other than min or max") raise ValueError("Can't request indices for something other than"
" minimum or maximum")
self.return_indices = return_indices self.return_indices = return_indices
def _convert_axis(self, axis): def _convert_axis(self, axis):
...@@ -1897,7 +1899,7 @@ class GpuDnnDropoutOp(DnnBase): ...@@ -1897,7 +1899,7 @@ class GpuDnnDropoutOp(DnnBase):
return Apply(self, [inp, descriptor, state], return Apply(self, [inp, descriptor, state],
[inp.type(), state.type(), gpudata_type()]) [inp.type(), state.type(), gpudata_type()])
def prepare_node(self, node, storage_map, compute_map): def prepare_node(self, node, storage_map, compute_map, impl):
assert self.inplace, "GpuDnnDropoutOp not inplace" assert self.inplace, "GpuDnnDropoutOp not inplace"
...@@ -3123,6 +3125,66 @@ def local_dnn_reduction(node): ...@@ -3123,6 +3125,66 @@ def local_dnn_reduction(node):
False)(node.inputs[0]),) False)(node.inputs[0]),)
@register_opt('cudnn')
@local_optimizer([GpuMaxAndArgmax])
def local_cudnn_maxandargmax(node):
    """Rewrite a GpuMaxAndArgmax node as a cuDNN reduction.

    Returns a (max, argmax) replacement pair, or None when cuDNN
    cannot handle this particular node.
    """
    if not isinstance(node.op, GpuMaxAndArgmax):
        return
    # cuDNN must be usable in this context, and recent enough (R6+)
    # for reductions that also return indices.
    if not dnn_available(node.inputs[0].type.context_name):
        return
    if version(raises=False) < 6000:
        return
    # cuDNN tensor descriptors support at most 8 dimensions.
    if node.inputs[0].ndim > 8:
        return
    if node.inputs[0].dtype != node.outputs[0].dtype:
        return
    if node.inputs[0].dtype not in ['float16', 'float32', 'float64']:
        return
    # The order of the axes influences the output indices, so only
    # bail out unless the reduction axes are already sorted.
    if (node.op.axis is not None and
            tuple(sorted(node.op.axis)) != node.op.axis):
        return
    out_dtype = node.outputs[0].dtype
    max_out, idx_out = GpuDnnReduction('maximum', node.op.axis, out_dtype,
                                       out_dtype, True)(node.inputs[0])
    # cudnn can only return int32 indices; cast back to the int64
    # indices that GpuMaxAndArgmax produces.
    return (max_out, as_gpuarray_variable(idx_out.astype('int64'),
                                          node.outputs[1].type.context_name))
@register_opt('cudnn', 'fast_compile')
@op_lifter([Argmax])
@register_opt2([Argmax], 'fast_compile', 'cudnn')
def local_dnn_argmax(op, ctx_name, inputs, outputs):
    """Lift a host Argmax to a cuDNN reduction returning indices."""
    if not dnn_available(ctx_name):
        return
    # Index-returning reductions need cuDNN R6 or later.
    if version(raises=False) < 6000:
        return
    # cuDNN tensor descriptors are limited to 8 dimensions.
    if inputs[0].ndim > 8:
        return
    if inputs[0].dtype not in ['float16', 'float32', 'float64']:
        return
    # order of the axes influences the output indices; only sorted
    # axes are equivalent to what Argmax computes.
    if op.axis is not None and tuple(sorted(op.axis)) != op.axis:
        return
    in_dtype = inputs[0].dtype
    _, idx_out = GpuDnnReduction('maximum', op.axis, in_dtype,
                                 in_dtype, True)(*inputs)
    # Only the indices are needed here; cast the int32 indices cuDNN
    # produces to the int64 that Argmax outputs.
    return [as_gpuarray_variable(idx_out.astype('int64'), ctx_name)]
class NoCuDNNRaise(Optimizer): class NoCuDNNRaise(Optimizer):
def apply(self, fgraph): def apply(self, fgraph):
......
...@@ -61,11 +61,6 @@ int APPLY_SPECIFIC(dnn_redux)(PyGpuArrayObject *input, ...@@ -61,11 +61,6 @@ int APPLY_SPECIFIC(dnn_redux)(PyGpuArrayObject *input,
static float fbeta = 0.0f; static float fbeta = 0.0f;
static double dbeta = 0.0; static double dbeta = 0.0;
if (!GpuArray_IS_C_CONTIGUOUS(&input->ga)) {
PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
return 1;
}
if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) != 0) if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) != 0)
return 1; return 1;
...@@ -83,7 +78,7 @@ int APPLY_SPECIFIC(dnn_redux)(PyGpuArrayObject *input, ...@@ -83,7 +78,7 @@ int APPLY_SPECIFIC(dnn_redux)(PyGpuArrayObject *input,
if (indices != NULL) { if (indices != NULL) {
if (theano_prep_output(indices, p, dims, GA_UINT, GA_C_ORDER, c) != 0) if (theano_prep_output(indices, p, dims, GA_UINT, GA_C_ORDER, c) != 0)
return 1; return 1;
indsize = PyGpuArray_SIZE(*indices); indsize = PyGpuArray_SIZE(*indices) * 4;
} }
if (p == input->ga.nd || rsz == 1) { if (p == input->ga.nd || rsz == 1) {
......
...@@ -37,8 +37,8 @@ class GpuMaxAndArgmax(Op): ...@@ -37,8 +37,8 @@ class GpuMaxAndArgmax(Op):
broadcastable = [b for i, b in enumerate(X.type.broadcastable) broadcastable = [b for i, b in enumerate(X.type.broadcastable)
if i not in all_axes] if i not in all_axes]
inputs = [as_gpuarray_variable(X, context_name)] inputs = [as_gpuarray_variable(X, context_name)]
outputs = [GpuArrayType(X.type.dtype, broadcastable, context_name=context_name, name='max')(), outputs = [GpuArrayType(X.type.dtype, broadcastable, context_name=context_name)(),
GpuArrayType(self.argmax_dtype, broadcastable, context_name=context_name, name='argmax')()] GpuArrayType(self.argmax_dtype, broadcastable, context_name=context_name)()]
return Apply(self, inputs, outputs) return Apply(self, inputs, outputs)
def c_headers(self): def c_headers(self):
......
...@@ -18,7 +18,7 @@ from theano.tensor.nnet import bn ...@@ -18,7 +18,7 @@ from theano.tensor.nnet import bn
from .. import dnn from .. import dnn
from ..basic_ops import GpuAllocEmpty from ..basic_ops import GpuAllocEmpty
from ..type import gpuarray_shared_constructor from ..type import gpuarray_shared_constructor, GpuArrayType
from .config import mode_with_gpu, mode_without_gpu, test_ctx_name, ref_cast from .config import mode_with_gpu, mode_without_gpu, test_ctx_name, ref_cast
from . import test_nnet from . import test_nnet
...@@ -26,6 +26,11 @@ from .rnn_support import Model, GRU, LSTM, WrapperLayer ...@@ -26,6 +26,11 @@ from .rnn_support import Model, GRU, LSTM, WrapperLayer
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_FWD from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_FWD
# pygpu is treated as optional at import time; a failed import is
# silently tolerated here.
# NOTE(review): presumably the tests that actually use pygpu are
# guarded by GPU-availability checks elsewhere -- confirm.
try:
    import pygpu
except ImportError:
    pass
mode_with_gpu = mode_with_gpu.including() mode_with_gpu = mode_with_gpu.including()
# Globally disabled for mode_without_gpu # Globally disabled for mode_without_gpu
mode_with_gpu.check_py_code = False mode_with_gpu.check_py_code = False
...@@ -1506,6 +1511,55 @@ def test_dnn_reduction_opt(): ...@@ -1506,6 +1511,55 @@ def test_dnn_reduction_opt():
yield dnn_reduction, 2, idtype, adtype, odtype yield dnn_reduction, 2, idtype, adtype, odtype
def dnn_reduction_strides(shp, shuffle, slice):
    """Check GpuDnnReduction on a non-contiguous (strided) input.

    Builds a float32 sum over an input that was dimshuffled and
    sliced, asserts the cuDNN reduction is in the compiled graph,
    and compares the result against a numpy sum.
    """
    utt.fetch_seed()
    gpu_input = GpuArrayType('float32', (False,) * len(shp),
                             context_name=test_ctx_name)()
    strided = gpu_input.dimshuffle(shuffle)[slice]
    total = strided.sum(acc_dtype='float32', dtype='float32')
    fn = theano.function([gpu_input], total, mode=mode_with_gpu)
    assert any(isinstance(apply_node.op, dnn.GpuDnnReduction)
               for apply_node in fn.maker.fgraph.apply_nodes)
    host_data = np.random.random(shp).astype('float32')
    expected = np.sum(host_data)
    device_data = pygpu.array(host_data, context=gpu_input.type.context)
    actual = fn(device_data)
    utt.assert_allclose(expected, np.array(actual))
def test_dnn_reduction_strides():
    # Exercise a transposed (dimshuffled) input and a negatively
    # strided (reversed) input.
    cases = [((2, 3, 2), (1, 0, 2), slice(None, None, None)),
             ((2, 3, 2), (0, 1, 2), slice(None, None, -1))]
    for shp, shuffle, slc in cases:
        yield dnn_reduction_strides, shp, shuffle, slc
def dnn_maxargmax(nd, idtype, axis):
    """Compile max_and_argmax and assert it maps to GpuDnnReduction."""
    x = T.TensorType(idtype, (False,) * nd)()
    outputs = T.max_and_argmax(x, axis=axis)
    fn = theano.function([x], outputs, mode=mode_with_gpu)
    assert any(isinstance(apply_node.op, dnn.GpuDnnReduction)
               for apply_node in fn.maker.fgraph.apply_nodes)
def test_dnn_maxandargmax_opt():
    if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 6000:
        raise SkipTest(dnn.dnn_available.msg)
    # Every supported rank (cuDNN handles up to 8d) with float32.
    for nd in range(1, 9):
        yield dnn_maxargmax, nd, 'float32', None
    # The other supported floating-point dtypes.
    for idtype in ('float64', 'float16'):
        yield dnn_maxargmax, 2, idtype, None
    # Sorted axis combinations on a 3d input, including empty axes.
    for axis in [(0, 1), (0, 2), (1, 2), (0, 1, 2),
                 (0,), (1,), (2,), ()]:
        yield dnn_maxargmax, 3, 'float32', axis
def test_dnn_batchnorm_train(): def test_dnn_batchnorm_train():
if not dnn.dnn_available(test_ctx_name): if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg) raise SkipTest(dnn.dnn_available.msg)
......
...@@ -10,6 +10,8 @@ from theano.tests.unittest_tools import SkipTest ...@@ -10,6 +10,8 @@ from theano.tests.unittest_tools import SkipTest
from .config import mode_with_gpu, mode_without_gpu from .config import mode_with_gpu, mode_without_gpu
from .test_basic_ops import rand_gpuarray from .test_basic_ops import rand_gpuarray
from .. import GpuArrayType from .. import GpuArrayType
from ..reduction import GpuMaxAndArgmax
from ..dnn import GpuDnnReduction
import math import math
...@@ -53,14 +55,14 @@ def numpy_maxandargmax(X, axis=None): ...@@ -53,14 +55,14 @@ def numpy_maxandargmax(X, axis=None):
return (ref_max, np.argmax(reshaped_x, axis=-1)) return (ref_max, np.argmax(reshaped_x, axis=-1))
def check_if_gpu_maxandargmax_in_graph(theano_function): def check_if_gpu_reduce_in_graph(theano_function):
assert len([node for node in theano_function.maker.fgraph.apply_nodes assert any(isinstance(node.op, (GpuMaxAndArgmax, GpuDnnReduction))
if isinstance(node.op, theano.gpuarray.reduction.GpuMaxAndArgmax)]) > 0 for node in theano_function.maker.fgraph.apply_nodes)
def check_if_gpu_maxandargmax_not_in_graph(theano_function): def check_if_gpu_reduce_not_in_graph(theano_function):
assert len([node for node in theano_function.maker.fgraph.apply_nodes assert all(not isinstance(node.op, (GpuMaxAndArgmax, GpuDnnReduction))
if isinstance(node.op, theano.gpuarray.reduction.GpuMaxAndArgmax)]) == 0 for node in theano_function.maker.fgraph.apply_nodes)
class BaseTest: class BaseTest:
...@@ -105,7 +107,7 @@ class BaseTest: ...@@ -105,7 +107,7 @@ class BaseTest:
M = self.get_host_tensor() M = self.get_host_tensor()
f = theano.function([M], [T.max(M, axis=axis), T.argmax(M, axis=axis)], f = theano.function([M], [T.max(M, axis=axis), T.argmax(M, axis=axis)],
name='shape:' + str(test_tensor.shape) + '/axis:' + str(axis) + '/HOST', mode=mode_without_gpu) name='shape:' + str(test_tensor.shape) + '/axis:' + str(axis) + '/HOST', mode=mode_without_gpu)
check_if_gpu_maxandargmax_not_in_graph(f) check_if_gpu_reduce_not_in_graph(f)
f(test_tensor) f(test_tensor)
theano_max, theano_argmax = f(test_tensor) theano_max, theano_argmax = f(test_tensor)
ref_max, ref_argmax = numpy_maxandargmax(test_tensor, axis=axis) ref_max, ref_argmax = numpy_maxandargmax(test_tensor, axis=axis)
...@@ -116,7 +118,7 @@ class BaseTest: ...@@ -116,7 +118,7 @@ class BaseTest:
M = self.get_gpu_tensor() M = self.get_gpu_tensor()
f = theano.function([M], [T.max(M, axis=axis), T.argmax(M, axis=axis)], f = theano.function([M], [T.max(M, axis=axis), T.argmax(M, axis=axis)],
name='shape:' + str(test_gpu_tensor.shape) + '/axis:' + str(axis) + '/GPU', mode=mode_with_gpu) name='shape:' + str(test_gpu_tensor.shape) + '/axis:' + str(axis) + '/GPU', mode=mode_with_gpu)
check_if_gpu_maxandargmax_in_graph(f) check_if_gpu_reduce_in_graph(f)
f(test_gpu_tensor) f(test_gpu_tensor)
theano_max, theano_argmax = f(test_gpu_tensor) theano_max, theano_argmax = f(test_gpu_tensor)
ref_max, ref_argmax = numpy_maxandargmax(test_host_tensor, axis=axis) ref_max, ref_argmax = numpy_maxandargmax(test_host_tensor, axis=axis)
......
...@@ -14,7 +14,7 @@ import theano ...@@ -14,7 +14,7 @@ import theano
from theano.compat import izip from theano.compat import izip
from theano.configparser import config from theano.configparser import config
from theano import gof from theano import gof
from theano.gof import Apply, Constant, Op, Variable from theano.gof import Apply, Constant, Op, Variable, ParamsType
from theano.gof.type import Generic from theano.gof.type import Generic
from theano.tensor import elemwise from theano.tensor import elemwise
...@@ -1429,21 +1429,31 @@ class Argmax(Op): ...@@ -1429,21 +1429,31 @@ class Argmax(Op):
nin = 2 # tensor, axis nin = 2 # tensor, axis
nout = 1 nout = 1
E_axis = 'invalid axis' E_axis = 'invalid axis'
__props__ = () __props__ = ('axis',)
_f16_ok = True _f16_ok = True
params_type = ParamsType(c_axis=scal.int64)
def __init__(self, axis):
    """
    Parameters
    ----------
    axis : iterable of ints or None
        Axes to compute the argmax over; None means all axes.
    """
    # Normalize to a hashable tuple so that __props__-based equality
    # and hashing of the Op work.
    if axis is not None:
        axis = tuple(axis)
    # Keep None as-is: the rest of this Op (make_node, get_params,
    # infer_shape, c_code) tests `self.axis is None` for the
    # reduce-over-all-axes case.  The original `tuple(axis)` here
    # raised TypeError for axis=None.
    self.axis = axis
def get_params(self, node):
    """Build the params object carrying the single C-level axis."""
    has_single_axis = self.axis is not None and len(self.axis) == 1
    if has_single_axis:
        c_axis = np.int64(self.axis[0])
    else:
        # Placeholder value: the C code never reads c_axis outside
        # the single-axis case, so -1 is never actually used.
        c_axis = np.int64(-1)
    return self.params_type.get_params(c_axis=c_axis)
def make_node(self, x, axis=None): def make_node(self, x, axis=None):
x = _as_tensor_variable(x) x = _as_tensor_variable(x)
# Check axis and convert it to a Python list of integers. if self.axis is None:
axis = check_and_normalize_axes(x, axis)
if len(axis) == 0:
axis = NoneConst.clone()
all_axes = list(range(x.ndim)) all_axes = list(range(x.ndim))
else: else:
all_axes = axis all_axes = self.axis
axis = _as_tensor_variable(axis) inputs = [x]
assert axis.ndim == 1
inputs = [x, axis]
# We keep the original broadcastable flags for dimensions on which # We keep the original broadcastable flags for dimensions on which
# we do not perform the argmax. # we do not perform the argmax.
...@@ -1452,13 +1462,16 @@ class Argmax(Op): ...@@ -1452,13 +1462,16 @@ class Argmax(Op):
outputs = [tensor('int64', broadcastable, name='argmax')] outputs = [tensor('int64', broadcastable, name='argmax')]
return Apply(self, inputs, outputs) return Apply(self, inputs, outputs)
def perform(self, node, inp, outs): def prepare_node(self, node, storage_map, compute_map, impl):
x, axes = inp if len(node.inputs) == 2:
raise ValueError('You are trying to compile a graph with an old Argmax node. Either reoptimize your graph or rebuild it to get the new node format.')
def perform(self, node, inp, outs, params):
x, = inp
axes = self.axis
max_idx, = outs max_idx, = outs
if axes is None: if axes is None:
axes = tuple(range(x.ndim)) axes = tuple(range(x.ndim))
else:
axes = tuple(int(ax) for ax in axes)
# Numpy does not support multiple axes for argmax # Numpy does not support multiple axes for argmax
# Work around # Work around
...@@ -1476,18 +1489,18 @@ class Argmax(Op): ...@@ -1476,18 +1489,18 @@ class Argmax(Op):
dtype='int64') dtype='int64')
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
x, axis = inp x, = inp
argmax, = out argmax, = out
fail = sub["fail"] fail = sub["fail"]
if NoneConst.equals(node.inputs[1]): params = sub["params"]
if self.axis is None:
axis_code = "axis = NPY_MAXDIMS;" axis_code = "axis = NPY_MAXDIMS;"
else: else:
assert node.inputs[1].ndim == 1 if len(self.axis) > 1:
# Fall back to perform() if there are multiple axes
if len(node.inputs[1].data) > 1:
raise NotImplementedError() raise NotImplementedError()
# params is only used here for now
axis_code = """ axis_code = """
axis = ((dtype_%(axis)s*)PyArray_DATA(%(axis)s))[0]; axis = %(params)s->c_axis;
if(axis > PyArray_NDIM(%(x)s)-1 || axis < -PyArray_NDIM(%(x)s)){ if(axis > PyArray_NDIM(%(x)s)-1 || axis < -PyArray_NDIM(%(x)s)){
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"Argmax, bad axis argument"); "Argmax, bad axis argument");
...@@ -1522,28 +1535,20 @@ class Argmax(Op): ...@@ -1522,28 +1535,20 @@ class Argmax(Op):
return ret % locals() return ret % locals()
def c_code_cache_version(self): def c_code_cache_version(self):
return (0,) return (1,)
def infer_shape(self, node, shapes): def infer_shape(self, node, shapes):
ishape, axis_shape = shapes ishape, = shapes
axis = node.inputs[1] if self.axis is None:
if axis.data is None:
return [()] return [()]
rval = tuple([ishape[i] for (i, b) in enumerate( rval = tuple([ishape[i] for (i, b) in enumerate(
node.inputs[0].type.broadcastable) if i not in axis.data]) node.inputs[0].type.broadcastable) if i not in self.axis])
return [rval] return [rval]
def grad(self, inp, grads): def grad(self, inp, grads):
x, axis = inp x, = inp
axis_grad = grad_undefined(
self, 1, axis,
"argmax is not defined for non-integer axes so"
" argmax(x, axis+eps) is undefined")
return [x.zeros_like(), axis_grad]
_argmax = Argmax() return [x.zeros_like()]
def makeKeepDims(x, y, axis): def makeKeepDims(x, y, axis):
......
...@@ -1333,9 +1333,9 @@ def test_argmax_pushdown(): ...@@ -1333,9 +1333,9 @@ def test_argmax_pushdown():
# for node in fgraph.toposort(): # for node in fgraph.toposort():
# print node.op # print node.op
assert len(fgraph.toposort()) == 1 assert len(fgraph.toposort()) == 1
assert fgraph.toposort()[0].op == tensor.basic._argmax assert isinstance(fgraph.toposort()[0].op, tensor.basic.Argmax)
assert check_stack_trace( assert check_stack_trace(
fgraph, ops_to_check=tensor.basic._argmax) fgraph, ops_to_check=tensor.basic.Argmax)
x = tensor.matrix() x = tensor.matrix()
# test that the max_and_argmax is not pushed down if the max is used # test that the max_and_argmax is not pushed down if the max is used
out = tensor.max_and_argmax( out = tensor.max_and_argmax(
......
...@@ -60,7 +60,7 @@ def local_max_and_argmax(node): ...@@ -60,7 +60,7 @@ def local_max_and_argmax(node):
return [new, None] return [new, None]
if len(node.outputs[0].clients) == 0: if len(node.outputs[0].clients) == 0:
return [None, T._argmax(node.inputs[0], axis)] return [None, T.Argmax(axis)(node.inputs[0])]
@register_uncanonicalize @register_uncanonicalize
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论