提交 0d197386 authored 作者: notoraptor's avatar notoraptor

Update. Many of @abergeron's comments have been taken into account.

Tests have been rewritten, and they now run so that a CPU computation and a GPU computation are always performed (separately) on the same input. This allows running the tests with Theano profiling flags and then comparing the execution times of MaxAndArgmax (CPU) and GpuMaxAndArgmax (GPU). Some code in theano/tensor/basic.py related to MaxAndArgmax has also been modified to make the API more uniform and to move most of the axis checking into the maxandargmax wrapper instead of the make_node functions of (Gpu)MaxAndArgmax.
上级 37115ad1
......@@ -28,7 +28,7 @@ from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor,
reg_context, get_context, ContextNotDefined, _get_props)
from .basic_ops import as_gpuarray_variable
from . import fft, dnn, opt, nerv, extra_ops, multinomial
from . import fft, dnn, opt, nerv, extra_ops, multinomial, reduction
def transfer(x, target):
try:
......
......@@ -65,6 +65,7 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20)
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
from .reduction import gpu_maxandargmax
_logger = logging.getLogger("theano.gpuarray.opt")
......
from unittest import TestCase
from theano.gpuarray import GpuArrayType
from theano.tests import unittest_tools as utt
import numpy as np
import theano
import theano.tensor as T
from .config import mode_with_gpu, mode_without_gpu
from .test_basic_ops import rand_gpuarray
# Shape of the 5-D test tensors: 1000 * 100 * 10 * 5 * 2 = 10 000 000 elements.
test_shape = (1000, 100, 10, 5, 2)
def numpy_random_array(*shapes):
    """Return a new random NumPy array with the given dimensions.

    Values are drawn from a standard normal distribution and cast to
    ``theano.config.floatX``.

    :param shapes: size of each dimension of the returned array.
    :return: a NumPy array of shape ``shapes`` and dtype ``floatX``.
    """
    # Total number of elements; the flat sample is then reshaped.
    size = 1
    for dimsize in shapes:
        size *= dimsize
    return np.random.normal(size=size).astype(theano.config.floatX).reshape(shapes)


# Backward-compatible alias: older tests in this file still call the helper
# under its previous name (it used to force float32; floatX is float32 in
# these GPU tests, so behavior is equivalent there).
randomTensor = numpy_random_array
def numpyMaxAndArgmax(X, axis=None):
def numpy_maxandargmax(X, axis=None):
if axis is None:
axis = range(X.ndim)
elif not isinstance(axis, (tuple, list)):
......@@ -33,107 +40,93 @@ def numpyMaxAndArgmax(X, axis=None):
reshaped_x = transposed_x.reshape(new_shape)
return (ref_max, np.argmax(reshaped_x, axis=-1))
# We run all tests with 5-D tensors of 10 000 000 elements.
# NB: In each test, any first call of theano function should be ignored
# with Theano config flag profiling.ignore_first_call=True.
def check_if_gpu_maxandargmax_in_graph(theano_function):
    """Assert that the compiled graph contains at least one GpuMaxAndArgmax node."""
    graph_nodes = theano_function.maker.fgraph.apply_nodes
    assert any(isinstance(node.op,
                          theano.gpuarray.reduction.GpuMaxAndArgmax)
               for node in graph_nodes)
def check_if_gpu_maxandargmax_not_in_graph(theano_function):
    """Assert that the compiled graph contains no GpuMaxAndArgmax node."""
    graph_nodes = theano_function.maker.fgraph.apply_nodes
    assert not any(isinstance(node.op,
                              theano.gpuarray.reduction.GpuMaxAndArgmax)
                   for node in graph_nodes)
def run_gpu_tensor5(test_matrix=None, axis=None):
    """Compile max/argmax of a 5-D GPU tensor along `axis` and compare the
    results with the NumPy reference implementation.

    :param test_matrix: optional GPU array to test on; a random one of shape
        ``test_shape`` is generated when None.
    :param axis: axis specification forwarded to T.max / T.argmax.
    """
    gpu_input = GpuArrayType(dtype=theano.config.floatX,
                             broadcastable=(False,) * 5)()
    outputs = [T.max(gpu_input, axis=axis), T.argmax(gpu_input, axis=axis)]
    fn = theano.function([gpu_input], outputs,
                         name='GPU-function', mode=mode_with_gpu)
    check_if_gpu_maxandargmax_in_graph(fn)
    if test_matrix is None:
        test_matrix = rand_gpuarray(*test_shape)
    # Warm-up call: skipped by the profiler when
    # profiling.ignore_first_call=True (see module comment).
    fn(test_matrix)
    gpu_max, gpu_argmax = fn(test_matrix)
    expected_max, expected_argmax = numpy_maxandargmax(np.asarray(test_matrix),
                                                       axis=axis)
    utt.assert_allclose(expected_max, gpu_max)
    utt.assert_allclose(expected_argmax, gpu_argmax)
def run_cpu_tensor5(test_matrix=None, axis=None):
    """Compile max/argmax of a 5-D CPU tensor along `axis` and compare the
    results with the NumPy reference implementation.

    :param test_matrix: optional NumPy array to test on; a random one of
        shape ``test_shape`` is generated when None.
    :param axis: axis specification forwarded to T.max / T.argmax.
    """
    cpu_input = T.tensor5()
    outputs = [T.max(cpu_input, axis=axis), T.argmax(cpu_input, axis=axis)]
    fn = theano.function([cpu_input], outputs,
                         name='cpu-function', mode=mode_without_gpu)
    # The CPU-only mode must not have introduced the GPU op.
    check_if_gpu_maxandargmax_not_in_graph(fn)
    if test_matrix is None:
        test_matrix = numpy_random_array(*test_shape)
    # Warm-up call: skipped by the profiler when
    # profiling.ignore_first_call=True (see module comment).
    fn(test_matrix)
    cpu_max, cpu_argmax = fn(test_matrix)
    expected_max, expected_argmax = numpy_maxandargmax(test_matrix, axis=axis)
    utt.assert_allclose(expected_max, cpu_max)
    utt.assert_allclose(expected_argmax, cpu_argmax)
def run_tensor5(axis=None):
    """Run the max/argmax comparison on both CPU and GPU, each on its own
    fresh random 5-D input, so the two backends can be profiled separately."""
    cpu_input = numpy_random_array(*test_shape)
    gpu_input = rand_gpuarray(*test_shape)
    run_cpu_tensor5(cpu_input, axis)
    run_gpu_tensor5(gpu_input, axis)
def test_none():
    """Reduction over all axes via axis=None."""
    run_tensor5(axis=None)
def test_all_axes():
    """Reduction over all five axes listed explicitly."""
    run_tensor5(axis=(0, 1, 2, 3, 4))
def test_all_axes_unsorted():
    """Reduction over all five axes given in unsorted order."""
    run_tensor5(axis=(4, 1, 3, 0, 2))
def test_axis_1():
    """Reduction over the first axis only."""
    run_tensor5(axis=0)
def test_axis_2():
    """Reduction over the second axis only."""
    run_tensor5(axis=1)
def test_axis_3():
    """Reduction over the third axis only."""
    run_tensor5(axis=2)
def test_axis_4():
    """Reduction over the fourth axis only."""
    run_tensor5(axis=3)
def test_axis_5():
    """Reduction over the fifth axis only."""
    run_tensor5(axis=4)
def test_2_axes():
    """Reduction over two non-adjacent axes."""
    run_tensor5(axis=(0, 3))
def test_3_axes():
    """Reduction over three axes."""
    run_tensor5(axis=(0, 3, 4))
class TestGpuMaxAndArgmax(TestCase):
    """Compare Theano max/argmax on 5-D tensors against a NumPy reference.

    NOTE(review): functions are compiled with the default mode, so whether
    GpuMaxAndArgmax is actually used depends on the Theano flags -- confirm
    with profiling (see the command below).
    """
    # We run all tests with 5-D tensors of 10 000 000 elements.
    # NB: In each test, any first call of theano function should be ignored
    # with Theano config flag profiling.ignore_first_call=True.
    # To just check if GpuMaxAndArgmax is called:
    # $ theano-cache purge && THEANO_FLAGS=floatX=float32,device=cuda,profile=True,profiling.ignore_first_call=True \
    # nosetests --verbose theano/gpuarray/tests/test_GpuMaxAndArgmax.py:TestGpuMaxAndArgmax.test_none

    def _basic_test_tensor5(self, axis=None):
        # Compile max+argmax of a symbolic 5-D tensor along `axis` and check
        # both outputs against the NumPy reference implementation.
        M = T.tensor5()
        max_M = T.max(M, axis=axis)
        argmax_M = T.argmax(M, axis=axis)
        f = theano.function([M], [max_M, argmax_M])
        test_matrix = randomTensor(1000, 100, 10, 5, 2)
        # Warm-up call: ignored by the profiler when
        # profiling.ignore_first_call=True (see class comment).
        f(test_matrix)
        theano_max, theano_argmax = f(test_matrix)
        ref_max, ref_argmax = numpyMaxAndArgmax(test_matrix, axis=axis)
        utt.assert_allclose(ref_max, theano_max)
        utt.assert_allclose(ref_argmax, theano_argmax)

    def _basic_test_assert_equals(self, axis1, axis2):
        # `axis1` and `axis2` are two supposedly equivalent axis
        # specifications; the compiled outputs must agree with each other
        # and with the NumPy reference.
        M1 = T.tensor5()
        M2 = T.tensor5()
        f1 = theano.function([M1], [T.max(M1, axis=axis1), T.argmax(M1, axis=axis1)])
        f2 = theano.function([M2], [T.max(M2, axis=axis2), T.argmax(M2, axis=axis2)])
        test_matrix = randomTensor(1000, 100, 10, 5, 2)
        # Warm-up calls (ignored with profiling.ignore_first_call=True).
        f1(test_matrix)
        f2(test_matrix)
        theano1 = f1(test_matrix)
        theano2 = f2(test_matrix)
        ref1 = numpyMaxAndArgmax(test_matrix, axis1)
        ref2 = numpyMaxAndArgmax(test_matrix, axis2)
        utt.assert_allclose(ref1, ref2)
        utt.assert_allclose(theano1, theano2)
        utt.assert_allclose(ref1, theano1)

    def test_none(self):
        # axis=None reduces over all axes.
        self._basic_test_tensor5(None)

    def test_all_axes(self):
        self._basic_test_tensor5((0, 1, 2, 3, 4))

    def test_1_axe(self):
        self._basic_test_tensor5(3)

    def test_2_axes(self):
        self._basic_test_tensor5((0, 3))

    def test_3_axes(self):
        self._basic_test_tensor5((0, 3, 4))

    def test_4_axes(self):
        self._basic_test_tensor5((0, 1, 2, 4))

    def test_simple(self):
        self._basic_test_tensor5(None)
        self._basic_test_tensor5((0, 1, 2, 3, 4))
        self._basic_test_tensor5((4, 1, 3, 2))

    def test_assert_equals(self):
        # Each pair below is an equivalent axis specification (None vs all
        # axes, unsorted orders, duplicated entries).
        self._basic_test_assert_equals(None, (0, 1, 2, 3, 4))
        self._basic_test_assert_equals(0, (0, 0))
        self._basic_test_assert_equals((4, 1, 3, 2), (1, 2, 3, 4))
        self._basic_test_assert_equals((4, 3, 2, 1, 0), None)
        self._basic_test_assert_equals((1, 3, 4), (1, 4, 4, 1, 3, 1, 3, 4, 3, 1, 1, 3, 1, 4, 1, 4))

    def test_simple_1_axis(self):
        # Every single axis of the 5-D tensor.
        self._basic_test_tensor5(0)
        self._basic_test_tensor5(1)
        self._basic_test_tensor5(2)
        self._basic_test_tensor5(3)
        self._basic_test_tensor5(4)

    def test_simple_2_axis(self):
        # Every ordered pair of axes, including duplicates such as (0, 0).
        self._basic_test_tensor5((0, 0))
        self._basic_test_tensor5((0, 1))
        self._basic_test_tensor5((0, 2))
        self._basic_test_tensor5((0, 3))
        self._basic_test_tensor5((0, 4))
        self._basic_test_tensor5((1, 0))
        self._basic_test_tensor5((1, 1))
        self._basic_test_tensor5((1, 2))
        self._basic_test_tensor5((1, 3))
        self._basic_test_tensor5((1, 4))
        self._basic_test_tensor5((2, 0))
        self._basic_test_tensor5((2, 1))
        self._basic_test_tensor5((2, 2))
        self._basic_test_tensor5((2, 3))
        self._basic_test_tensor5((2, 4))
        self._basic_test_tensor5((3, 0))
        self._basic_test_tensor5((3, 1))
        self._basic_test_tensor5((3, 2))
        self._basic_test_tensor5((3, 3))
        self._basic_test_tensor5((3, 4))
        self._basic_test_tensor5((4, 0))
        self._basic_test_tensor5((4, 1))
        self._basic_test_tensor5((4, 2))
        self._basic_test_tensor5((4, 3))
        self._basic_test_tensor5((4, 4))
def test_4_axes():
    """Reduction over four of the five axes."""
    run_tensor5(axis=(0, 1, 2, 4))
......@@ -1186,55 +1186,28 @@ class MaxAndArgmax(Op):
def make_node(self, x, axis=None):
x = _as_tensor_variable(x)
if isinstance(axis, (integer_types, numpy.integer)):
axis = [int(axis)]
elif isinstance(axis, numpy.ndarray) and axis.ndim == 0:
axis = [int(axis)]
elif isinstance(axis, (tuple, list, numpy.ndarray)):
axis = [int(a) for a in axis]
if axis == list(range(x.type.ndim)):
axis = None
elif isinstance(axis, Variable):
if NoneConst.equals(axis):
axis = None
elif not isinstance(axis, TensorConstant):
raise TypeError(
"MaxAndArgmax needs a constant axis. Got %s" % axis)
else:
assert (axis.dtype.startswith("int") or
axis.dtype.startswith("uint"))
if isinstance(axis.data, (integer_types, numpy.integer)) or \
(isinstance(axis.data, numpy.ndarray) and
axis.data.ndim == 0):
axis = [int(axis.data)]
elif isinstance(axis.data, (list, numpy.ndarray)):
axis = [int(i) for i in axis.data]
if axis is None:
axis = range(x.type.ndim)
elif not isinstance(axis, list):
raise TypeError("Axis must be a list. Got %s" % axis)
# Make axis entries non-negative, and sort them
if isinstance(axis, list):
for idx in xrange(len(axis)):
if axis[idx] < 0:
axis[idx] += x.type.ndim
axis.sort()
for idx in xrange(len(axis)):
if axis[idx] < 0:
axis[idx] += x.type.ndim
axis.sort()
# Verify that axes are valid
all_axes = []
if isinstance(axis, list):
for ax in axis:
if ax < 0 or ax >= x.type.ndim:
raise ValueError(
'Invalid axis: %s (the number of dimensions of the '
'input is: %s)' % (ax, x.type.ndim))
if ax not in all_axes:
all_axes.append(ax)
else:
all_axes = list(range(x.ndim))
if axis is None or axis == list(range(x.type.ndim)):
axis = NoneConst.clone()
else:
axis = _as_tensor_variable(all_axes)
assert axis.ndim == 1
for ax in axis:
if ax < 0 or ax >= x.type.ndim:
raise ValueError(
'Invalid axis: %s (the number of dimensions of the '
'input is: %s)' % (ax, x.type.ndim))
if ax not in all_axes:
all_axes.append(ax)
axis = _as_tensor_variable(all_axes)
assert axis.ndim == 1
inputs = [x, axis]
# We keep the original broadcastable flags for dimensions on which
......@@ -1272,7 +1245,7 @@ class MaxAndArgmax(Op):
x, axis = inp
max, argmax = out
fail = sub["fail"]
if NoneConst.equals(node.inputs[1]):
if NoneConst.equals(node.inputs[1]) or len(node.inputs[1].data) == node.inputs[0].ndim:
axis_code = "axis = NPY_MAXDIMS;"
else:
assert node.inputs[1].ndim == 1
......@@ -1637,6 +1610,26 @@ def max_and_argmax(a, axis=None, keepdims=False):
will broadcast correctly against the original tensor.
"""
# Check axis and convert it to a Python list of integers.
if axis is None:
axis = range(a.type.ndim)
elif (isinstance(axis, (integer_types, numpy.integer)) or
(isinstance(axis, numpy.ndarray) and axis.ndim == 0)):
axis = [int(axis)]
elif isinstance(axis, (tuple, list, numpy.ndarray)):
axis = [int(i) for i in axis]
elif isinstance(axis, Variable):
if NoneConst.equals(axis):
axis = range(a.type.ndim)
elif not isinstance(axis, TensorConstant):
raise TypeError("max and argmax computation needs a constant axis. Got %s" % axis)
else:
assert (axis.dtype.startswith("int") or axis.dtype.startswith("uint"))
if (isinstance(axis.data, (integer_types, numpy.integer)) or
(isinstance(axis.data, numpy.ndarray) and axis.data.ndim == 0)):
axis = [int(axis.data)]
elif isinstance(axis.data, (list, numpy.ndarray)):
axis = [int(i) for i in axis.data]
out, argout = _max_and_argmax(a, axis)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论