提交 b2ae1db6 authored 作者: notoraptor's avatar notoraptor

New update.

CPU op MaxAndArgmax rewritten so that it now takes the axes as an op parameter. Same update for GPU op GpuMaxAndArgmax. max_and_argmax wrapper rewritten to fully check the axis before passing it to MaxAndArgmax. Some other files have also been updated to ensure that the optimization which replaces the CPU op with the GPU op works well and that all tests involving MaxAndArgmax still pass after the updates. GPU op rewritten to handle the latest libgpuarray update. test_reduction rewritten; it now also tests 0-d arrays (scalars). I have run the following tests successfully. The new update of libgpuarray was downloaded, compiled and fully installed before running these tests. nosetests -v theano/gpuarray/tests/test_opt.py # There is 1 failure here, but it is not related to MaxAndArgmax: # ERROR: theano.gpuarray.tests.test_opt.test_local_lift_abstractconv_gpu_shape # RuntimeError: cuDNN is required for convolution and pooling nosetests -v theano/tensor/nnet/tests/test_nnet.py nosetests -v theano/tensor/tests/test_opt_uncanonicalize.py nosetests -v theano/tensor/tests/test_basic.py THEANO_FLAGS=floatX=float32,profile=True,profiling.n_ops=1000,profiling.n_apply=1000,profiling.ignore_first_call=True,profiling.destination=profiling.log nosetests --nocapture --verbose theano/gpuarray/tests/test_reduction.py Prevent Flake8!
上级 c2835d19
......@@ -65,7 +65,7 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20)
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
from .reduction import gpu_maxandargmax
from .reduction import GpuMaxAndArgmax
_logger = logging.getLogger("theano.gpuarray.opt")
......@@ -1782,7 +1782,7 @@ def _scan_type_infer(node):
@op_lifter([tensor.MaxAndArgmax])
@register_opt2([tensor.MaxAndArgmax], 'fast_compile')
def local_gpu_maxandargmax(op, context_name, inputs, outputs):
return gpu_maxandargmax
return GpuMaxAndArgmax(op.get_params(None))
# Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled.
......
from __future__ import print_function, absolute_import, division
import os
import numpy
import theano
from theano.gof import Op, Apply
from theano.tensor.var import TensorConstant
from theano.gof.type import Generic
from .basic_ops import (infer_context_name, as_gpuarray_variable)
from .type import GpuArrayType
......@@ -18,57 +18,29 @@ class GpuMaxAndArgmax(Op):
GPU version of MaxAndArgmax
"""
__props__ = ()
params_type = Generic()
__props__ = ('axis',)
argmax_dtype = "int64"
def make_node(self, X, axis=None):
def __init__(self, axis):
assert isinstance(axis, (list, tuple))
self.axis = tuple(axis)
def get_params(self, node):
return self.axis
def make_node(self, X):
context_name = infer_context_name(X)
if axis is None:
axis = range(X.type.ndim)
elif isinstance(axis, TensorConstant) and isinstance(axis.data, (list, numpy.ndarray)):
axis = [int(i) for i in axis.data]
elif not isinstance(axis, list):
raise TypeError("Axis must be a list. Got %s" % axis)
# Make axis entries non-negative, and verify that axes are valid.
for idx in xrange(len(axis)):
if axis[idx] < 0:
axis[idx] += X.type.ndim
if axis[idx] < 0 or axis[idx] >= X.type.ndim:
raise ValueError('Invalid axis: %s (the number of dimensions of the '
'input is: %s)' % (axis[idx], X.type.ndim))
# Sort axes and make them unique.
axis_set = set(axis) # used to build "broadcastable" variable below.
axis = list(axis_set)
axis.sort()
axis = theano.tensor.as_tensor_variable(axis)
inputs = [as_gpuarray_variable(X, context_name), axis]
# We keep the original broadcastable flags for dimensions on which
# we do not perform the max / argmax.
all_axes = set(self.axis)
broadcastable = [b for i, b in enumerate(X.type.broadcastable)
if i not in axis_set]
if i not in all_axes]
inputs = [as_gpuarray_variable(X, context_name)]
outputs = [GpuArrayType(X.type.dtype, broadcastable, context_name=context_name, name='max')(),
GpuArrayType(self.argmax_dtype, broadcastable, context_name=context_name, name='argmax')()]
return Apply(self, inputs, outputs)
def perform(self, node, inputs, outputs):
# NB: I must rewrite this method with pygpu functions instead of numpy functions.
x, axes = inputs
max, max_idx = outputs
X = numpy.asarray(x)
axes = tuple(axes)
max[0] = theano._asarray(numpy.max(X, axes), dtype=node.outputs[0].dtype)
# Numpy does not support multiple axes for argmax
# Work around
keep_axes = numpy.array([i for i in range(X.ndim) if i not in axes], dtype='int64')
# Not-reduced axes in front
transposed_x = numpy.transpose(X, numpy.concatenate((keep_axes, axes)))
kept_shape = transposed_x.shape[:len(keep_axes)]
reduced_shape = transposed_x.shape[len(keep_axes):]
new_shape = kept_shape + (numpy.prod(reduced_shape),)
reshaped_x = transposed_x.reshape(new_shape)
max_idx[0] = theano._asarray(numpy.argmax(reshaped_x, axis=-1), dtype='int64')
def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray_helper.h>']
......@@ -76,75 +48,71 @@ class GpuMaxAndArgmax(Op):
return [pygpu.get_include(), os.path.dirname(__file__)]
def c_code(self, node, name, input_names, output_names, sub):
# Recall: X, axes = input_names
# Recall: X = input_names[0]
# Recall: axes = sub['params']
# Recall: max, argmax = output_names
# Recall: fail = sub['fail']
max_typecode = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
argmax_typecode = pygpu.gpuarray.dtype_to_typecode(self.argmax_dtype)
axes_ctype = 'int64_t'
assert node.inputs[1].ndim == 1
ret = """
GpuArray temp;
GpuArray* %(name)s_input = &%(X)s->ga;
size_t %(name)s_input_ndim = PyGpuArray_NDIM(%(X)s);
unsigned %(name)s_redux_len = PyArray_DIM(%(axes)s, 0);
unsigned %(name)s_redux_len = PyTuple_GET_SIZE(%(axes)s);
unsigned* %(name)s_axes_to_reduce = (unsigned*)malloc(%(name)s_redux_len * sizeof(unsigned));
for (unsigned i = 0; i < %(name)s_redux_len; ++i) {
%(name)s_axes_to_reduce[i] = (unsigned) (*(%(axes_ctype)s*)PyArray_GETPTR1(%(axes)s, i));
PyObject* axis_object = PyTuple_GET_ITEM(%(axes)s, i);
%(name)s_axes_to_reduce[i] = (unsigned) PyInt_AS_LONG(axis_object);
Py_XDECREF(axis_object);
}
size_t %(name)s_input_ndim = PyGpuArray_NDIM(%(X)s);
size_t %(name)s_output_ndim = %(name)s_input_ndim - %(name)s_redux_len;
size_t* %(name)s_output_dims = NULL;
if (%(name)s_output_ndim == 0) {
/* Current backend function GpuArray_maxandargmax does not work when
* all axes need to be reduced. So to handle this case, we create a view
* of the input as a matrix with 1 row and as many columns as elements
* in the input, so that the 2nd dimenson of the matrix will be reduced. */
size_t total_size = 1;
for (size_t i = 0; i < %(name)s_input_ndim; ++i) {
total_size *= PyGpuArray_DIM(%(X)s, i);
size_t* %(name)s_output_dims = (size_t*)malloc(%(name)s_output_ndim * sizeof(size_t));
if (%(name)s_redux_len == 1) {
for (unsigned i = 0; i < %(name)s_axes_to_reduce[0]; ++i) {
%(name)s_output_dims[i] = PyGpuArray_DIM(%(X)s, i);
}
size_t newdims[2] = {1, total_size};
%(name)s_input = &temp;
if (GA_NO_ERROR !=
GpuArray_reshape(%(name)s_input, &%(X)s->ga, 2, newdims, GA_ANY_ORDER, 0)
) {
%(fail)s
for (unsigned i = %(name)s_axes_to_reduce[0] + 1; i < %(name)s_input_ndim; ++i) {
%(name)s_output_dims[i-1] = PyGpuArray_DIM(%(X)s, i);
}
%(name)s_redux_len = 1;
%(name)s_axes_to_reduce[0] = 1;
} else {
%(name)s_output_dims = (size_t*)malloc(%(name)s_output_ndim * sizeof(size_t));
if (%(name)s_redux_len == 1) {
for (unsigned i = 0; i < %(name)s_axes_to_reduce[0]; ++i) {
%(name)s_output_dims[i] = PyGpuArray_DIM(%(X)s, i);
}
for (unsigned i = %(name)s_axes_to_reduce[0] + 1; i < %(name)s_input_ndim; ++i) {
%(name)s_output_dims[i-1] = PyGpuArray_DIM(%(X)s, i);
}
} else {
int64_t current_input_pos = -1;
int64_t current_output_pos = -1;
for (unsigned i = 0; i < %(name)s_redux_len; ++i) {
for (++current_input_pos; current_input_pos < %(name)s_axes_to_reduce[i]; ++current_input_pos) {
%(name)s_output_dims[++current_output_pos] = PyGpuArray_DIM(%(X)s, current_input_pos);
}
}
for (++current_input_pos; current_input_pos < %(name)s_input_ndim; ++current_input_pos) {
int64_t current_input_pos = -1;
int64_t current_output_pos = -1;
for (unsigned i = 0; i < %(name)s_redux_len; ++i) {
for (++current_input_pos; current_input_pos < %(name)s_axes_to_reduce[i]; ++current_input_pos) {
%(name)s_output_dims[++current_output_pos] = PyGpuArray_DIM(%(X)s, current_input_pos);
}
}
for (++current_input_pos; current_input_pos < %(name)s_input_ndim; ++current_input_pos) {
%(name)s_output_dims[++current_output_pos] = PyGpuArray_DIM(%(X)s, current_input_pos);
}
}
if (theano_prep_output(&%(max)s, %(name)s_output_ndim, %(name)s_output_dims, %(max_typecode)s, GA_C_ORDER, %(X)s->context)) {
PyErr_SetString(PyExc_RuntimeError, "GpuMaxAndArgmax: unable to prepare max output.");
%(fail)s
}
if (theano_prep_output(&%(argmax)s, %(name)s_output_ndim, %(name)s_output_dims, %(argmax_typecode)s, GA_C_ORDER, %(X)s->context)) {
PyErr_SetString(PyExc_RuntimeError, "GpuMaxAndArgmax: unable to prepare argmax output.");
%(fail)s
}
if (GA_NO_ERROR !=
GpuArray_maxandargmax(&%(max)s->ga, &%(argmax)s->ga, %(name)s_input, %(name)s_redux_len, %(name)s_axes_to_reduce)
if (%(name)s_input_ndim == 0) {
/* GpuArray_maxandargmax can't handle a 0-d array
* because it expects that 1 <= redux_len <= input_ndim.
* As input_ndim == 0, then 1 <= redux_len <= 0 is false.
* To handle this case we copy input to max and we set argmax to 0.
*/
if (GA_NO_ERROR != GpuArray_setarray(&%(max)s->ga, &%(X)s->ga)) {
PyErr_SetString(PyExc_RuntimeError, "GpuMaxAndArgmax: unable to copy input to max when input is a scalar.");
%(fail)s
}
if (GA_NO_ERROR != GpuArray_memset(&%(argmax)s->ga, 0)) {
PyErr_SetString(PyExc_RuntimeError, "GpuMaxAndArgmax: unable to set argmax to 0 when input is a scalar.");
%(fail)s
}
} else if (GA_NO_ERROR !=
GpuArray_maxandargmax(&%(max)s->ga, &%(argmax)s->ga, &%(X)s->ga, %(name)s_redux_len, %(name)s_axes_to_reduce)
) {
PyErr_SetString(PyExc_RuntimeError, "GpuMaxAndArgmax: unable to compute gpuarray maxandargmax.");
%(fail)s
}
"""
......@@ -153,8 +121,8 @@ class GpuMaxAndArgmax(Op):
GpuArray_sync(&%(max)s->ga);
GpuArray_sync(&%(argmax)s->ga);
"""
return ret % {'X': input_names[0], 'axes': input_names[1], 'max': output_names[0], 'argmax': output_names[1],
'axes_ctype': axes_ctype, 'max_typecode': max_typecode, 'argmax_typecode': argmax_typecode,
return ret % {'X': input_names[0], 'axes': sub['params'], 'max': output_names[0], 'argmax': output_names[1],
'max_typecode': max_typecode, 'argmax_typecode': argmax_typecode,
'name': name, 'fail': sub['fail']}
def c_code_cleanup(self, node, name, inputs, outputs, sub):
......@@ -162,6 +130,3 @@ class GpuMaxAndArgmax(Op):
free(%(name)s_output_dims);
free(%(name)s_axes_to_reduce);
""" % {'name': name, 'X': inputs[0]}
gpu_maxandargmax = GpuMaxAndArgmax()
from theano.gpuarray import GpuArrayType
from theano.tests import unittest_tools as utt
import numpy as np
import theano
import theano.tensor as T
from .config import mode_with_gpu, mode_without_gpu
from .test_basic_ops import rand_gpuarray
# Shape of the 5-D test tensors (10 000 000 elements).
test_shape = (1000, 100, 10, 5, 2)
def numpy_random_array(*shapes):
    """Return a random array of the given shape.

    Values are drawn from N(0, 1) and cast to theano.config.floatX.
    """
    n_elements = 1
    for extent in shapes:
        n_elements *= extent
    samples = np.random.normal(size=n_elements)
    return samples.astype(theano.config.floatX).reshape(shapes)
def numpy_maxandargmax(X, axis=None):
    """Reference implementation of max+argmax over (possibly several) axes.

    Parameters
    ----------
    X : numpy.ndarray
        Input array.
    axis : None, int, or sequence of int
        Axes to reduce. None means all axes. Negative values are
        accepted and normalized against ``X.ndim`` (the original code
        broke on negative axes: they were fed raw into the transpose).

    Returns
    -------
    (max, argmax) : tuple
        ``max`` is ``np.max(X, axis)``; ``argmax`` is the flat index of
        the maximum within the reduced axes, matching the convention of
        ``MaxAndArgmax.perform()``.
    """
    if axis is None:
        axis = range(X.ndim)
    elif not isinstance(axis, (tuple, list)):
        axis = [axis]
    # Normalize to plain non-negative Python ints; numpy integers or
    # negative axes would break the transpose/reshape below.
    normalized = []
    for a in axis:
        a = int(a)
        if a < 0:
            a += X.ndim
        normalized.append(a)
    axis = tuple(sorted(set(normalized)))  # dedupe + sort
    ref_max = np.max(X, axis=axis)
    # Numpy does not support multiple axes for argmax. Work around:
    # move the kept axes to the front, flatten the reduced axes into
    # one trailing dimension, then argmax over that dimension.
    keep_axes = np.array([i for i in range(X.ndim) if i not in axis], dtype='int64')
    transposed_x = np.transpose(X, np.concatenate((keep_axes, axis)))
    kept_shape = transposed_x.shape[:len(keep_axes)]
    reduced_shape = transposed_x.shape[len(keep_axes):]
    # Cast to int: np.prod of an empty shape is a float, which some
    # numpy versions reject inside a reshape target.
    new_shape = kept_shape + (int(np.prod(reduced_shape)),)
    reshaped_x = transposed_x.reshape(new_shape)
    return (ref_max, np.argmax(reshaped_x, axis=-1))
# We run all tests with 5-D tensors of 10 000 000 elements.
# NB: In each test, any first call of theano function should be ignored
# with Theano config flag profiling.ignore_first_call=True.
def check_if_gpu_maxandargmax_in_graph(theano_function):
    """Assert that the compiled graph contains at least one GpuMaxAndArgmax node."""
    graph_ops = (node.op for node in theano_function.maker.fgraph.apply_nodes)
    assert any(isinstance(an_op, theano.gpuarray.reduction.GpuMaxAndArgmax)
               for an_op in graph_ops)


def check_if_gpu_maxandargmax_not_in_graph(theano_function):
    """Assert that the compiled graph contains no GpuMaxAndArgmax node."""
    graph_ops = (node.op for node in theano_function.maker.fgraph.apply_nodes)
    assert not any(isinstance(an_op, theano.gpuarray.reduction.GpuMaxAndArgmax)
                   for an_op in graph_ops)
def run_gpu_tensor5(test_matrix=None, axis=None):
    """Compile max/argmax of a 5-D GPU tensor and compare to the NumPy reference.

    Also asserts that the GpuMaxAndArgmax op was introduced in the graph.
    If ``test_matrix`` is None, a random GPU array of ``test_shape`` is used.
    """
    M = GpuArrayType(dtype=theano.config.floatX, broadcastable=(False,) * 5)()
    f = theano.function([M], [T.max(M, axis=axis), T.argmax(M, axis=axis)], name='GPU-function', mode=mode_with_gpu)
    check_if_gpu_maxandargmax_in_graph(f)
    if test_matrix is None:
        test_matrix = rand_gpuarray(*test_shape)
    # Warm-up call: ignored when profiling.ignore_first_call=True (see NB above).
    f(test_matrix)
    theano_max, theano_argmax = f(test_matrix)
    ref_max, ref_argmax = numpy_maxandargmax(np.asarray(test_matrix), axis=axis)
    utt.assert_allclose(ref_max, theano_max)
    utt.assert_allclose(ref_argmax, theano_argmax)
def run_cpu_tensor5(test_matrix=None, axis=None):
    """Compile max/argmax of a 5-D CPU tensor and compare to the NumPy reference.

    Also asserts that no GpuMaxAndArgmax op is in the compiled graph.
    If ``test_matrix`` is None, a random host array of ``test_shape`` is used.
    """
    M = T.tensor5()
    f = theano.function([M], [T.max(M, axis=axis), T.argmax(M, axis=axis)], name='cpu-function', mode=mode_without_gpu)
    check_if_gpu_maxandargmax_not_in_graph(f)
    if test_matrix is None:
        test_matrix = numpy_random_array(*test_shape)
    # Warm-up call: ignored when profiling.ignore_first_call=True (see NB above).
    f(test_matrix)
    theano_max, theano_argmax = f(test_matrix)
    ref_max, ref_argmax = numpy_maxandargmax(test_matrix, axis=axis)
    utt.assert_allclose(ref_max, theano_max)
    utt.assert_allclose(ref_argmax, theano_argmax)
def run_tensor5(axis=None):
    """Run both the CPU and the GPU checks on freshly generated 5-D inputs."""
    host_data = numpy_random_array(*test_shape)
    device_data = rand_gpuarray(*test_shape)
    run_cpu_tensor5(host_data, axis)
    run_gpu_tensor5(device_data, axis)
def test_none():
    # axis=None: reduce over all axes.
    run_tensor5(None)


def test_all_axes():
    # All five axes, given explicitly and sorted.
    run_tensor5((0, 1, 2, 3, 4))


def test_all_axes_unsorted():
    # All axes in arbitrary order; the op must normalize/sort them.
    run_tensor5((4, 1, 3, 0, 2))


def test_axis_1():
    # Single-axis reductions, one per dimension.
    run_tensor5(0)


def test_axis_2():
    run_tensor5(1)


def test_axis_3():
    run_tensor5(2)


def test_axis_4():
    run_tensor5(3)


def test_axis_5():
    run_tensor5(4)


def test_2_axes():
    # Partial multi-axis reductions.
    run_tensor5((0, 3))


def test_3_axes():
    run_tensor5((0, 3, 4))


def test_4_axes():
    run_tensor5((0, 1, 2, 4))
from __future__ import print_function, absolute_import, division
from unittest import TestCase
import numpy as np
import theano
import theano.tensor as T
from theano.tests import unittest_tools as utt
from theano.tests.unittest_tools import SkipTest
from .config import mode_with_gpu, mode_without_gpu
from .test_basic_ops import rand_gpuarray
from .. import GpuArrayType
# Shape of the largest test tensors; lower-rank tests use a prefix of it.
test_shape = (1000, 100, 10, 5, 2)
# NB: This order of "unsorted axes" is arbitrary and is here
# just to have the same information on profile output
# from one test to another.
unsorted_axes = (2, 4, 0, 3, 1)
# Seed from OS entropy: the tests compare Theano against NumPy on the
# same data, so cross-run reproducibility is not required.
np.random.seed()
def numpy_random_array(shapes):
    """Return a random N(0, 1) array of shape ``shapes``, cast to floatX."""
    n_elements = 1
    for extent in shapes:
        n_elements *= extent
    flat = np.random.normal(size=n_elements).astype(theano.config.floatX)
    return flat.reshape(shapes)
def numpy_maxandargmax(X, axis=None):
    """Compute max and argmax of ``X`` over the given axes with NumPy only.

    Mirrors the semantics of MaxAndArgmax.perform(): ``argmax`` is the
    flat position of the maximum inside the reduced axes.
    """
    if axis is None:
        axis = range(X.ndim)
    elif not isinstance(axis, (tuple, list)):
        axis = [int(axis)]
    # Deduplicate and order the reduction axes.
    axis = tuple(sorted(set(axis)))
    ref_max = np.max(X, axis=axis)
    # np.argmax only accepts a single axis: bring the kept axes to the
    # front, collapse the reduced axes into one trailing dimension,
    # then take argmax over that dimension.
    kept = np.array([d for d in range(X.ndim) if d not in axis], dtype='int64')
    fronted = np.transpose(X, np.concatenate((kept, axis)))
    lead_shape = fronted.shape[:len(kept)]
    tail_shape = fronted.shape[len(kept):]
    # Plain-int shape: np.prod may yield numpy scalars / floats.
    flat_shape = tuple(int(s) for s in lead_shape) + (int(np.prod(tail_shape)),)
    return (ref_max, np.argmax(fronted.reshape(flat_shape), axis=-1))
def check_if_gpu_maxandargmax_in_graph(theano_function):
    # Assert the compiled graph contains at least one GpuMaxAndArgmax node
    # (i.e. the CPU -> GPU optimization fired).
    assert len([node for node in theano_function.maker.fgraph.apply_nodes
                if isinstance(node.op, theano.gpuarray.reduction.GpuMaxAndArgmax)]) > 0


def check_if_gpu_maxandargmax_not_in_graph(theano_function):
    # Assert the compiled graph contains no GpuMaxAndArgmax node
    # (i.e. the computation stayed on the host).
    assert len([node for node in theano_function.maker.fgraph.apply_nodes
                if isinstance(node.op, theano.gpuarray.reduction.GpuMaxAndArgmax)]) == 0
class BaseTest:
    """Shared max/argmax tests, run on both the host (CPU) and the GPU.

    Subclasses must set ``tensor_size`` — the rank of the test input,
    from 0 (scalar) to 5. Each test compiles a Theano function computing
    ``max`` and ``argmax`` over some axes, checks whether GpuMaxAndArgmax
    was (or was not) introduced in the graph, and compares both backends
    against the pure-NumPy reference ``numpy_maxandargmax``.
    """

    # Input rank; must be set in subclasses (0 to 5).
    tensor_size = None
    dtype = theano.config.floatX

    def setUp(self):
        if not isinstance(self.tensor_size, int):
            raise SkipTest("No tensor ndim defined.")
        if self.tensor_size < 0 or self.tensor_size > 5:
            # Typo fixes in the message: "inclued" -> "included",
            # "dimensons" -> "dimensions".
            raise SkipTest("We allow from 0 (included) to 5 (included) dimensions for these tests.")

    def get_host_tensor(self):
        """Symbolic CPU tensor of rank ``tensor_size``."""
        broadcastable = (False,) * self.tensor_size
        return T.tensor(self.dtype, broadcastable)

    def get_gpu_tensor(self):
        """Symbolic GPU tensor of rank ``tensor_size``."""
        broadcastable = (False,) * self.tensor_size
        return GpuArrayType(self.dtype, broadcastable)()

    def get_host_value(self):
        """Random host ndarray; shape is a prefix of ``test_shape``."""
        return numpy_random_array(test_shape[:self.tensor_size])

    def get_gpu_value(self):
        """Random GPU array; shape is a prefix of ``test_shape``."""
        return rand_gpuarray(*(test_shape[:self.tensor_size]))

    # NB: In compute_host() and compute_gpu(),
    # the first call of the theano function should be ignored in profiling,
    # with Theano config flag profiling.ignore_first_call=True.
    def compute_host(self, test_tensor, axis):
        """Compile on CPU, check no GPU op is present, compare to NumPy."""
        M = self.get_host_tensor()
        f = theano.function([M], [T.max(M, axis=axis), T.argmax(M, axis=axis)],
                            name='HOST-function', mode=mode_without_gpu)
        check_if_gpu_maxandargmax_not_in_graph(f)
        # Warm-up call (ignored by profiling, see NB above).
        f(test_tensor)
        theano_max, theano_argmax = f(test_tensor)
        ref_max, ref_argmax = numpy_maxandargmax(test_tensor, axis=axis)
        utt.assert_allclose(ref_max, theano_max)
        utt.assert_allclose(ref_argmax, theano_argmax)

    def compute_gpu(self, test_gpu_tensor, test_host_tensor, axis):
        """Compile with the GPU mode, check the GPU op is present, compare to NumPy."""
        M = self.get_gpu_tensor()
        f = theano.function([M], [T.max(M, axis=axis), T.argmax(M, axis=axis)],
                            name='GPU-function', mode=mode_with_gpu)
        check_if_gpu_maxandargmax_in_graph(f)
        # Warm-up call (ignored by profiling, see NB above).
        f(test_gpu_tensor)
        theano_max, theano_argmax = f(test_gpu_tensor)
        ref_max, ref_argmax = numpy_maxandargmax(test_host_tensor, axis=axis)
        utt.assert_allclose(ref_max, theano_max)
        utt.assert_allclose(ref_argmax, theano_argmax)

    def compute(self, axis=None):
        # We want to run the CPU op and the GPU op on the same randomly
        # generated tensor.
        test_gpu_tensor = self.get_gpu_value()
        test_host_tensor = np.asarray(test_gpu_tensor)
        self.compute_host(test_host_tensor, axis)
        self.compute_gpu(test_gpu_tensor, test_host_tensor, axis)

    def compute_axis(self, pos):
        """Reduce over the single axis ``pos`` if it is valid for this rank."""
        if 0 <= pos < self.tensor_size:
            self.compute(pos)

    def compute_some_axes(self, count):
        """Reduce over the first ``count`` valid entries of ``unsorted_axes``."""
        if 0 <= count <= self.tensor_size:
            self.compute([i for i in unsorted_axes if i < self.tensor_size][:count])

    def test_none(self):
        self.compute(None)

    def test_all_axes(self):
        self.compute(range(self.tensor_size))

    def test_all_axes_unsorted(self):
        self.compute([i for i in unsorted_axes if i < self.tensor_size])

    def test_axis_1(self):
        self.compute_axis(0)

    def test_axis_2(self):
        self.compute_axis(1)

    def test_axis_3(self):
        self.compute_axis(2)

    def test_axis_4(self):
        self.compute_axis(3)

    def test_axis_5(self):
        self.compute_axis(4)

    # For the tests below, we expect the CPU op to run with its Python
    # implementation (the C code only supports 1 axis or all axes).
    def test_2_axes(self):
        self.compute_some_axes(2)

    def test_3_axes(self):
        self.compute_some_axes(3)

    def test_4_axes(self):
        self.compute_some_axes(4)
class TestScalar(BaseTest, TestCase):
    # 0-d input (scalar): exercises the special 0-d code paths of the ops.
    tensor_size = 0


class TestVector(BaseTest, TestCase):
    # 1-d input.
    tensor_size = 1


class TestMatrix(BaseTest, TestCase):
    # 2-d input.
    tensor_size = 2


class TestTensor5(BaseTest, TestCase):
    # 5-d input using the full test_shape.
    tensor_size = 5
......@@ -15,6 +15,7 @@ from theano.compat import izip
from theano.configparser import config
from theano import gof
from theano.gof import Apply, Constant, Op, Variable
from theano.gof.type import Generic
from theano.tensor import elemwise
from theano.tensor.var import (AsTensorError, TensorVariable,
......@@ -1181,45 +1182,32 @@ class MaxAndArgmax(Op):
nin = 2 # tensor, axis
nout = 2 # max val, max idx
E_axis = 'invalid axis'
__props__ = ()
def make_node(self, x, axis=None):
x = _as_tensor_variable(x)
params_type = Generic()
__props__ = ('axis',)
if axis is None:
axis = range(x.type.ndim)
elif not isinstance(axis, list):
raise TypeError("Axis must be a list. Got %s" % axis)
def __init__(self, axis):
assert isinstance(axis, list)
self.axis = tuple(axis)
# Make axis entries non-negative, and sort them
for idx in xrange(len(axis)):
if axis[idx] < 0:
axis[idx] += x.type.ndim
axis.sort()
def get_params(self, node):
return self.axis
# Verify that axes are valid
all_axes = []
for ax in axis:
if ax < 0 or ax >= x.type.ndim:
raise ValueError(
'Invalid axis: %s (the number of dimensions of the '
'input is: %s)' % (ax, x.type.ndim))
if ax not in all_axes:
all_axes.append(ax)
axis = _as_tensor_variable(all_axes)
assert axis.ndim == 1
inputs = [x, axis]
def make_node(self, x):
x = _as_tensor_variable(x)
# We keep the original broadcastable flags for dimensions on which
# we do not perform the max / argmax.
all_axes = set(self.axis)
broadcastable = [b for i, b in enumerate(x.type.broadcastable)
if i not in all_axes]
inputs = [x]
outputs = [tensor(x.type.dtype, broadcastable, name='max'),
tensor('int64', broadcastable, name='argmax')]
return Apply(self, inputs, outputs)
def perform(self, node, inp, outs):
x, axes = inp
def perform(self, node, inp, outs, params):
x = inp[0]
axes = params
max, max_idx = outs
if axes is None:
axes = tuple(range(x.ndim))
......@@ -1242,35 +1230,40 @@ class MaxAndArgmax(Op):
dtype='int64')
def c_code(self, node, name, inp, out, sub):
x, axis = inp
if len(self.axis) != 1 and len(self.axis) != node.inputs[0].ndim:
raise NotImplementedError("NumPy C-API can compute max and argmax only for 1 axis or for all axes.")
x = inp[0]
axis = sub['params']
max, argmax = out
fail = sub["fail"]
if NoneConst.equals(node.inputs[1]) or len(node.inputs[1].data) == node.inputs[0].ndim:
axis_code = "axis = NPY_MAXDIMS;"
else:
assert node.inputs[1].ndim == 1
# Fall back to perform() if there are multiple axes
if len(node.inputs[1].data) > 1:
raise NotImplementedError()
axis_code = """
axis = ((dtype_%(axis)s*)PyArray_DATA(%(axis)s))[0];
if(axis > PyArray_NDIM(%(x)s)-1 || axis < -PyArray_NDIM(%(x)s)){
ret = """
int axis;
if (PyTuple_GET_SIZE(%(axis)s) == PyArray_NDIM(%(x)s)) {
axis = NPY_MAXDIMS;
} else if(PyTuple_GET_SIZE(%(axis)s) == 1) {
PyObject* axis_object = PyTuple_GET_ITEM(%(axis)s, 0);
axis = (int)PyInt_AS_LONG(axis_object);
Py_XDECREF(axis_object);
if (axis > PyArray_NDIM(%(x)s)-1 || axis < -PyArray_NDIM(%(x)s)) {
PyErr_SetString(PyExc_ValueError,
"MaxAndArgmax, bad axis argument");
"MaxAndArgmax: bad axis argument");
%(fail)s
}
""" % locals()
ret = """
int axis;
} else {
PyErr_SetString(PyExc_NotImplementedError,
"MaxAndArgmax: NumPy C-API can compute max and argmax only for 1 axis or for all axes.");
%(fail)s
}
Py_CLEAR(%(max)s);
Py_CLEAR(%(argmax)s);//todo pass them as out parameter.
%(axis_code)s
%(max)s = (PyArrayObject*)PyArray_Max(%(x)s, axis, NULL);
if(%(max)s == NULL){
if (%(max)s == NULL) {
%(fail)s;
}
if(!PyArray_CheckExact(%(max)s)){
if (!PyArray_CheckExact(%(max)s)) {
%(max)s = (PyArrayObject*)PyArray_FromAny((PyObject*)%(max)s, NULL, 0, 0, NPY_ARRAY_ENSUREARRAY, NULL);
if(%(max)s == NULL){
%(fail)s;
......@@ -1278,17 +1271,17 @@ class MaxAndArgmax(Op):
}
%(argmax)s = (PyArrayObject*)PyArray_ArgMax(%(x)s, axis, NULL);
if(%(argmax)s == NULL){
if (%(argmax)s == NULL) {
Py_CLEAR(%(max)s);
%(fail)s;
}
if(!PyArray_CheckExact(%(argmax)s)){
if (!PyArray_CheckExact(%(argmax)s)) {
%(argmax)s = (PyArrayObject*)PyArray_FromAny((PyObject*)%(argmax)s, NULL, 0, 0, NPY_ARRAY_ENSUREARRAY, NULL);
if(%(argmax)s == NULL){
%(fail)s;
}
}
if(PyArray_TYPE(%(argmax)s) != NPY_INT64){
if (PyArray_TYPE(%(argmax)s) != NPY_INT64) {
PyObject * tmp = PyArray_Cast(%(argmax)s, NPY_INT64);
if (NULL == tmp){
%(fail)s;
......@@ -1303,28 +1296,25 @@ class MaxAndArgmax(Op):
return (4,)
def infer_shape(self, node, shapes):
ishape, axis_shape = shapes
axis = node.inputs[1]
if axis.data is None:
return [(), ()]
rval = tuple([ishape[i] for (i, b) in enumerate(
node.inputs[0].type.broadcastable) if i not in axis.data])
ishape = shapes[0]
rval = tuple(ishape[i] for (i, b) in enumerate(
node.inputs[0].type.broadcastable) if i not in self.axis)
return [rval, rval]
def R_op(self, inputs, eval_points):
if eval_points[0] is None:
return [None, None]
if not isinstance(inputs[1], theano.Constant):
if len(self.axis) != 1:
raise ValueError(('R_op supported for arg_max only for '
'constant axis!'))
if inputs[1].data > 1:
if self.axis[0] > 1:
raise ValueError(('R_op supported for arg_max only when '
' axis is 0 or 1'))
if inputs[0].ndim != 2:
raise ValueError(('R_op supported for arg_max only when '
' input is a matrix'))
max_vals, max_pos = self.make_node(*inputs).outputs
if inputs[1].data == 0:
if self.axis[0] == 0:
return [eval_points[0][max_pos,
arange(eval_points[0].shape[1])], None]
else:
......@@ -1345,7 +1335,8 @@ class MaxAndArgmax(Op):
# g_max has one less dimension than x, so you need to complete
# g_max to x's shape when axis=0 the broadcasting mechanism
# does it automatically
x, axis = inp
x = inp[0]
axis = _as_tensor_variable(self.axis)
g_max, g_max_idx = grads
g_max_disconnected = isinstance(g_max.type, DisconnectedType)
......@@ -1363,7 +1354,7 @@ class MaxAndArgmax(Op):
# if the max is disconnected but the argmax is not,
# the gradient on its inputs is zero
if g_max_disconnected:
return [x.zeros_like(), axis_grad]
return [x.zeros_like()]
if NoneConst.equals(axis):
axis_ = list(range(x.ndim))
else:
......@@ -1387,9 +1378,7 @@ class MaxAndArgmax(Op):
# Set the grad to the correct position.
g_x = eq(xmax_pad, x) * g_max_pad
return g_x, axis_grad
_max_and_argmax = MaxAndArgmax()
return g_x,
class Argmax(Op):
......@@ -1611,6 +1600,7 @@ def max_and_argmax(a, axis=None, keepdims=False):
"""
# Check axis and convert it to a Python list of integers.
# Axis will be used as an op param of MaxAndArgmax.
if axis is None:
axis = range(a.type.ndim)
elif (isinstance(axis, (integer_types, numpy.integer)) or
......@@ -1630,8 +1620,18 @@ def max_and_argmax(a, axis=None, keepdims=False):
axis = [int(axis.data)]
elif isinstance(axis.data, (list, numpy.ndarray)):
axis = [int(i) for i in axis.data]
out, argout = _max_and_argmax(a, axis)
if len(axis) == 0:
axis = range(a.type.ndim)
else:
for i in range(len(axis)):
if axis[i] < 0:
axis[i] += a.type.ndim
if axis[i] < 0 or axis[i] >= a.type.ndim:
raise ValueError("max and argmax computation needs a valid axis number for %d-D tensor. Got %d"
% (a.type.ndim, axis[i]))
axis = list(set(axis))
axis.sort()
out, argout = MaxAndArgmax(axis)(a)
if keepdims:
out = makeKeepDims(a, out, axis)
......
......@@ -1568,9 +1568,9 @@ def local_softmax_grad_to_crossentropy_with_softmax_grad(node):
@opt.register_specialize('fast_compile_gpu')
@gof.local_optimizer([tensor._max_and_argmax])
@gof.local_optimizer([tensor.MaxAndArgmax])
def local_argmax_pushdown(node):
if node.op == tensor._max_and_argmax and node.inputs[0].owner and \
if isinstance(node.op, tensor.MaxAndArgmax) and node.inputs[0].owner and \
len(node.outputs[0].clients) > 0 and node.inputs[0].owner.op in \
(softmax_op, softplus, tensor.exp, tensor.log, tensor.tanh, sigmoid,
softmax_with_bias):
......@@ -1584,20 +1584,21 @@ def local_argmax_pushdown(node):
"warning set the Theano flags 'warn.argmax_pushdown_bug' "
"to False")
if (node.op == tensor._max_and_argmax and
if (isinstance(node.op, tensor.MaxAndArgmax) and
node.inputs[0].owner and len(node.outputs[0].clients) == 0):
x_max, x_argmax = node.outputs
x, axis = node.inputs
x = node.inputs[0]
axis = node.op.get_params(node)
# TODO: Make a list/set of monotonic ops...
if x.owner and x.owner.op in (softmax_op, softplus, tensor.exp,
tensor.log, tensor.tanh, sigmoid):
pre_x, = x.owner.inputs
ret = tensor._max_and_argmax(pre_x, axis)
ret = tensor.max_and_argmax(pre_x, axis)
copy_stack_trace(x_max, ret)
return ret
if x.owner and x.owner.op == softmax_with_bias:
pre_x, pre_bias = x.owner.inputs
ret = tensor._max_and_argmax(pre_x +
ret = tensor.max_and_argmax(pre_x +
tensor.DimShuffle(
pre_bias.broadcastable,
('x', 0))(pre_bias), axis)
......
......@@ -41,8 +41,6 @@ from theano.tensor.elemwise import CAReduce
from theano.tensor import basic as T
from theano.tensor import DimShuffle
from theano.tensor.basic import (get_scalar_constant_value,
NotScalarConstantError)
from theano.tensor.opt import register_uncanonicalize
from theano import scalar as scal
......@@ -50,25 +48,18 @@ _logger = logging.getLogger('theano.tensor.opt')
@register_uncanonicalize
@gof.local_optimizer([T._max_and_argmax])
@gof.local_optimizer([T.MaxAndArgmax])
def local_max_and_argmax(node):
"""
If we don't use the argmax, change it to a max only.
"""
if node.op == T._max_and_argmax:
if isinstance(node.op, T.MaxAndArgmax):
if len(node.outputs[1].clients) == 0:
# MaxAndArgmax support variable axis,
# but CAReduce support only constant axis.
if node.inputs[1].data is None:
axis = None
else:
try:
axis = get_scalar_constant_value(node.inputs[1])
except NotScalarConstantError:
axis = node.inputs[1]
if not isinstance(axis, T.TensorConstant):
return False
axis = axis.data
axis = node.op.get_params(node)
if len(axis) != 1:
return False
new = CAReduce(scal.maximum, axis)(node.inputs[0])
return [new, None]
......
......@@ -7619,23 +7619,23 @@ class TestInferShape(utt.InferShapeTester):
# MaxAndArgmax,
adtens3_val = rand(4, 5, 3)
self._compile_and_check([adtens3],
MaxAndArgmax()(adtens3, None),
max_and_argmax(adtens3, None),
[adtens3_val], MaxAndArgmax)
self._compile_and_check([adtens3],
MaxAndArgmax()(adtens3, 0),
max_and_argmax(adtens3, 0),
[adtens3_val], MaxAndArgmax)
self._compile_and_check([adtens3],
MaxAndArgmax()(adtens3, 1),
max_and_argmax(adtens3, 1),
[adtens3_val], MaxAndArgmax)
self._compile_and_check([adtens3],
MaxAndArgmax()(adtens3, 2),
max_and_argmax(adtens3, 2),
[adtens3_val], MaxAndArgmax)
self._compile_and_check([adtens3],
MaxAndArgmax()(adtens3, [0, 1, 2]),
max_and_argmax(adtens3, [0, 1, 2]),
[adtens3_val], MaxAndArgmax)
# ARange
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论