Commit 3347480a, authored by Pascal Lamblin, committed via GitHub

Merge pull request #5168 from notoraptor/master

This is my proposal for GpuMaxAndArgmax (issue #1399).
......@@ -28,7 +28,7 @@ from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor,
reg_context, get_context, ContextNotDefined, _get_props)
from .basic_ops import as_gpuarray_variable
from . import fft, dnn, opt, nerv, extra_ops, multinomial
from . import fft, dnn, opt, nerv, extra_ops, multinomial, reduction
def transfer(x, target):
try:
......
......@@ -65,6 +65,7 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20)
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
from .reduction import GpuMaxAndArgmax
_logger = logging.getLogger("theano.gpuarray.opt")
......@@ -1775,6 +1776,14 @@ def _scan_type_infer(node):
context_name=context_name)
return typebuild
# Add optimization : maxandargmax (CPU -> GPU)
@register_opt('fast_compile')
@op_lifter([tensor.MaxAndArgmax])
@register_opt2([tensor.MaxAndArgmax], 'fast_compile')
def local_gpu_maxandargmax(op, context_name, inputs, outputs):
    # Lift the CPU MaxAndArgmax op to its GPU counterpart.
    # op.get_params(None) returns the op's axis tuple (get_params ignores
    # its node argument), which GpuMaxAndArgmax accepts directly.
    return GpuMaxAndArgmax(op.get_params(None))
# Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled.
optdb.register('gpua_scanOp_make_inplace',
......
from __future__ import print_function, absolute_import, division
import os
import theano
from theano.gof import Op, Apply
from theano.gof.type import Generic
from .basic_ops import (infer_context_name, as_gpuarray_variable)
from .type import GpuArrayType
try:
    import pygpu
except ImportError as e:
    # NOTE(review): the ImportError is swallowed here, but `pygpu` is used
    # unconditionally in GpuMaxAndArgmax.c_header_dirs()/c_code() below, so
    # a missing pygpu surfaces later as a NameError — confirm this lazy
    # failure mode is intended.
    pass
class GpuMaxAndArgmax(Op):
    """
    GPU version of MaxAndArgmax.

    Computes, in a single pass, both the maximum and the index of the
    maximum of its input over the axes given at construction time, by
    delegating to libgpuarray's ``GpuArray_maxandargmax``.

    Parameters
    ----------
    axis : list or tuple of int
        Axes to reduce over. Stored as a tuple and passed to the C code
        through the op params mechanism.

    """

    # The axis tuple is handed to the C code as a plain Python object,
    # hence the Generic params type.
    params_type = Generic()
    __props__ = ('axis',)
    # dtype of the argmax output.
    argmax_dtype = "int64"

    def __init__(self, axis):
        assert isinstance(axis, (list, tuple))
        self.axis = tuple(axis)

    def get_params(self, node):
        # The params object is the axis tuple itself; `node` is unused.
        return self.axis

    def make_node(self, X):
        context_name = infer_context_name(X)
        # We keep the original broadcastable flags for dimensions on which
        # we do not perform the max / argmax.
        all_axes = set(self.axis)
        broadcastable = [b for i, b in enumerate(X.type.broadcastable)
                         if i not in all_axes]
        inputs = [as_gpuarray_variable(X, context_name)]
        outputs = [GpuArrayType(X.type.dtype, broadcastable,
                                context_name=context_name, name='max')(),
                   GpuArrayType(self.argmax_dtype, broadcastable,
                                context_name=context_name, name='argmax')()]
        return Apply(self, inputs, outputs)

    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray_helper.h>']

    def c_header_dirs(self):
        return [pygpu.get_include(), os.path.dirname(__file__)]

    def c_code(self, node, name, input_names, output_names, sub):
        """Generate the C code computing max and argmax on the GPU.

        Raises (at C runtime) RuntimeError on allocation or kernel
        failure.
        """
        # Recall: X = input_names[0]
        # Recall: axes = sub['params']
        # Recall: max, argmax = output_names
        # Recall: fail = sub['fail']
        max_typecode = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
        argmax_typecode = pygpu.gpuarray.dtype_to_typecode(self.argmax_dtype)
        ret = """
        #if PY_MAJOR_VERSION >= 3
            #ifndef PyInt_AS_LONG
                #define PyInt_AS_LONG PyLong_AS_LONG
            #endif
        #endif

        unsigned %(name)s_redux_len = PyTuple_GET_SIZE(%(axes)s);
        size_t %(name)s_input_ndim = PyGpuArray_NDIM(%(X)s);
        size_t %(name)s_output_ndim = %(name)s_input_ndim - %(name)s_redux_len;
        /* Both buffers are allocated before any %(fail)s is possible, so
         * no goto crosses an initialization (required when compiled as
         * C++) and c_code_cleanup() can always call free() safely.
         * A zero-sized malloc (0-d input: redux_len == 0) may legally
         * return NULL, so only treat NULL as an error when a non-empty
         * allocation was requested. */
        unsigned* %(name)s_axes_to_reduce = (unsigned*)malloc(%(name)s_redux_len * sizeof(unsigned));
        size_t* %(name)s_output_dims = (size_t*)malloc(%(name)s_output_ndim * sizeof(size_t));
        if ((%(name)s_axes_to_reduce == NULL && %(name)s_redux_len > 0) ||
            (%(name)s_output_dims == NULL && %(name)s_output_ndim > 0)) {
            PyErr_NoMemory();
            %(fail)s
        }
        for (unsigned i = 0; i < %(name)s_redux_len; ++i) {
            /* PyTuple_GET_ITEM returns a *borrowed* reference: it must
             * not be decref'd. */
            PyObject* axis_object = PyTuple_GET_ITEM(%(axes)s, i);
            %(name)s_axes_to_reduce[i] = (unsigned) PyInt_AS_LONG(axis_object);
        }
        if (%(name)s_redux_len == 1) {
            /* Single reduced axis: copy every other dimension. */
            for (unsigned i = 0; i < %(name)s_axes_to_reduce[0]; ++i) {
                %(name)s_output_dims[i] = PyGpuArray_DIM(%(X)s, i);
            }
            for (unsigned i = %(name)s_axes_to_reduce[0] + 1; i < %(name)s_input_ndim; ++i) {
                %(name)s_output_dims[i-1] = PyGpuArray_DIM(%(X)s, i);
            }
        } else {
            /* Several reduced axes: keep the dimensions not listed in
             * axes_to_reduce. NOTE(review): this walk assumes the axes
             * are sorted in ascending order — callers build them sorted;
             * confirm if constructing this op directly. */
            int64_t current_input_pos = -1;
            int64_t current_output_pos = -1;
            for (unsigned i = 0; i < %(name)s_redux_len; ++i) {
                for (++current_input_pos; current_input_pos < %(name)s_axes_to_reduce[i]; ++current_input_pos) {
                    %(name)s_output_dims[++current_output_pos] = PyGpuArray_DIM(%(X)s, current_input_pos);
                }
            }
            for (++current_input_pos; current_input_pos < %(name)s_input_ndim; ++current_input_pos) {
                %(name)s_output_dims[++current_output_pos] = PyGpuArray_DIM(%(X)s, current_input_pos);
            }
        }
        if (theano_prep_output(&%(max)s, %(name)s_output_ndim, %(name)s_output_dims, %(max_typecode)s, GA_C_ORDER, %(X)s->context)) {
            PyErr_SetString(PyExc_RuntimeError, "GpuMaxAndArgmax: unable to prepare max output.");
            %(fail)s
        }
        if (theano_prep_output(&%(argmax)s, %(name)s_output_ndim, %(name)s_output_dims, %(argmax_typecode)s, GA_C_ORDER, %(X)s->context)) {
            PyErr_SetString(PyExc_RuntimeError, "GpuMaxAndArgmax: unable to prepare argmax output.");
            %(fail)s
        }
        if (%(name)s_input_ndim == 0) {
            /* GpuArray_maxandargmax can't handle a 0-d array
             * because it expects that 1 <= redux_len <= input_ndim.
             * As input_ndim == 0, then 1 <= redux_len <= 0 is false.
             * To handle this case we copy input to max and we set argmax to 0.
             */
            if (GA_NO_ERROR != GpuArray_setarray(&%(max)s->ga, &%(X)s->ga)) {
                PyErr_SetString(PyExc_RuntimeError, "GpuMaxAndArgmax: unable to copy input to max when input is a scalar.");
                %(fail)s
            }
            if (GA_NO_ERROR != GpuArray_memset(&%(argmax)s->ga, 0)) {
                PyErr_SetString(PyExc_RuntimeError, "GpuMaxAndArgmax: unable to set argmax to 0 when input is a scalar.");
                %(fail)s
            }
        } else if (GA_NO_ERROR !=
            GpuArray_maxandargmax(&%(max)s->ga, &%(argmax)s->ga, &%(X)s->ga, %(name)s_redux_len, %(name)s_axes_to_reduce)
        ) {
            PyErr_SetString(PyExc_RuntimeError, "GpuMaxAndArgmax: unable to compute gpuarray maxandargmax.");
            %(fail)s
        }
        """
        if theano.config.gpuarray.sync:
            ret += """
        GpuArray_sync(&%(max)s->ga);
        GpuArray_sync(&%(argmax)s->ga);
        """
        return ret % {'X': input_names[0], 'axes': sub['params'],
                      'max': output_names[0], 'argmax': output_names[1],
                      'max_typecode': max_typecode,
                      'argmax_typecode': argmax_typecode,
                      'name': name, 'fail': sub['fail']}

    def c_code_cleanup(self, node, name, inputs, outputs, sub):
        # free(NULL) is a no-op, so this is safe even when c_code failed
        # before either allocation succeeded.
        return """
        free(%(name)s_output_dims);
        free(%(name)s_axes_to_reduce);
        """ % {'name': name, 'X': inputs[0]}
from __future__ import print_function, absolute_import, division
from unittest import TestCase
import numpy as np
import theano
import theano.tensor as T
from theano.tests import unittest_tools as utt
from theano.tests.unittest_tools import SkipTest
from .config import mode_with_gpu, mode_without_gpu
from .test_basic_ops import rand_gpuarray
from .. import GpuArrayType
import math
# Number of values to be used in test tensors (except with 0-D tensors!).
test_size = 10000000
# NB: This order of "unsorted axes" is arbitrary and is here
# just to have the same information on profile output
# from one test to another.
unsorted_axes = (2, 4, 0, 3, 1)
# NOTE(review): seeding without an argument draws the seed from OS
# entropy, so test data differs between runs — confirm that this
# non-reproducibility is intended.
np.random.seed()
def numpy_random_array(shapes):
    """Return a tensor of the given shape filled with standard-normal
    values, cast to theano.config.floatX."""
    flat_size = int(np.prod(shapes, dtype='int64'))
    flat = np.random.normal(size=flat_size).astype(theano.config.floatX)
    return flat.reshape(shapes)
def numpy_maxandargmax(X, axis=None):
    """NumPy reference for max+argmax over one or several axes.

    Returns a pair ``(max, argmax)`` where ``argmax`` indexes into the
    flattened reduced dimensions, mirroring MaxAndArgmax.perform().
    """
    # Normalize axis to a sorted tuple of unique ints.
    if axis is None:
        axis = range(X.ndim)
    elif not isinstance(axis, (tuple, list)):
        axis = [int(axis)]
    axis = tuple(sorted(set(axis)))
    ref_max = np.max(X, axis=axis)
    # NumPy's argmax handles only one axis, so move the kept axes to the
    # front, collapse the reduced axes into a single trailing one, and
    # take argmax over it (same workaround as MaxAndArgmax.perform()).
    kept = np.array([d for d in range(X.ndim) if d not in axis],
                    dtype='int64')
    shuffled = np.transpose(X, np.concatenate((kept, axis)))
    n_kept = len(kept)
    flat_reduced = int(np.prod(shuffled.shape[n_kept:], dtype='int64'))
    collapsed = shuffled.reshape(shuffled.shape[:n_kept] + (flat_reduced,))
    return (ref_max, np.argmax(collapsed, axis=-1))
def check_if_gpu_maxandargmax_in_graph(theano_function):
    """Assert that the compiled function's graph contains at least one
    GpuMaxAndArgmax node."""
    nodes = theano_function.maker.fgraph.apply_nodes
    assert any(isinstance(node.op, theano.gpuarray.reduction.GpuMaxAndArgmax)
               for node in nodes)
def check_if_gpu_maxandargmax_not_in_graph(theano_function):
    """Assert that no GpuMaxAndArgmax node is present in the compiled
    function's graph."""
    nodes = theano_function.maker.fgraph.apply_nodes
    assert not any(isinstance(node.op,
                              theano.gpuarray.reduction.GpuMaxAndArgmax)
                   for node in nodes)
class BaseTest:
    """Shared test machinery: build one random tensor, compute max/argmax
    with Theano on CPU and on GPU, and check both against the NumPy
    reference implementation (numpy_maxandargmax).

    Subclasses must set `tensor_size` (the tensor ndim) and may override
    `shape` and `dtype`.
    """
    # This attribute must be set in subclasses.
    tensor_size = None
    # Explicit shape; when None, computed from tensor_size in setUp().
    shape = None
    dtype = theano.config.floatX
    def get_shape(self):
        # Each dimension gets the ndim-th root of test_size, so the total
        # number of elements is approximately test_size.
        if self.tensor_size == 0:
            return []
        return [int(math.ceil(math.pow(test_size, 1 / self.tensor_size)))] * self.tensor_size
    def setUp(self):
        # Skip (rather than fail) when the subclass configuration is
        # missing or outside the supported 0..5-D range.
        if not isinstance(self.tensor_size, int):
            raise SkipTest("No tensor ndim defined.")
        if self.tensor_size < 0 or self.tensor_size > 5:
            raise SkipTest("We allow from 0 (included) to 5 (inclued) dimensons for these tests.")
        if self.shape is None:
            self.shape = self.get_shape()
    def get_host_tensor(self):
        # Symbolic CPU tensor with no broadcastable dimensions.
        broadcastable = (False,) * self.tensor_size
        return T.tensor(self.dtype, broadcastable)
    def get_gpu_tensor(self):
        # Symbolic GPU tensor with no broadcastable dimensions.
        broadcastable = (False,) * self.tensor_size
        return GpuArrayType(self.dtype, broadcastable)()
    def get_host_value(self):
        # NOTE(review): appears unused in this file; compute() draws its
        # host value from the GPU value instead.
        return numpy_random_array(self.shape)
    def get_gpu_value(self):
        return rand_gpuarray(*self.shape)
    # NB: In compute_host() and compute_gpu(),
    # the first call of the theano function should be ignored in profiling,
    # with Theano config flag profiling.ignore_first_call=True.
    def compute_host(self, test_tensor, axis):
        # Compile for CPU, verify the GPU op is NOT in the graph, then
        # compare against the NumPy reference.
        M = self.get_host_tensor()
        f = theano.function([M], [T.max(M, axis=axis), T.argmax(M, axis=axis)],
                            name='shape:' + str(test_tensor.shape) + '/axis:' + str(axis) + '/HOST', mode=mode_without_gpu)
        check_if_gpu_maxandargmax_not_in_graph(f)
        # Warm-up call; see NB above about profiling.
        f(test_tensor)
        theano_max, theano_argmax = f(test_tensor)
        ref_max, ref_argmax = numpy_maxandargmax(test_tensor, axis=axis)
        utt.assert_allclose(ref_max, theano_max)
        utt.assert_allclose(ref_argmax, theano_argmax)
    def compute_gpu(self, test_gpu_tensor, test_host_tensor, axis):
        # Compile for GPU, verify GpuMaxAndArgmax IS in the graph, then
        # compare against the NumPy reference.
        M = self.get_gpu_tensor()
        f = theano.function([M], [T.max(M, axis=axis), T.argmax(M, axis=axis)],
                            name='shape:' + str(test_gpu_tensor.shape) + '/axis:' + str(axis) + '/GPU', mode=mode_with_gpu)
        check_if_gpu_maxandargmax_in_graph(f)
        # Warm-up call; see NB above about profiling.
        f(test_gpu_tensor)
        theano_max, theano_argmax = f(test_gpu_tensor)
        ref_max, ref_argmax = numpy_maxandargmax(test_host_tensor, axis=axis)
        utt.assert_allclose(ref_max, theano_max)
        utt.assert_allclose(ref_argmax, theano_argmax)
    def compute(self, axis=None):
        # We want to run CPU op and GPU op on the same tensor randomly generated.
        test_gpu_tensor = self.get_gpu_value()
        test_host_tensor = np.asarray(test_gpu_tensor)
        self.compute_host(test_host_tensor, axis)
        self.compute_gpu(test_gpu_tensor, test_host_tensor, axis)
    def compute_axis(self, pos):
        # Run the test for one axis when it is valid for this ndim.
        # NOTE(review): 1-D tensors are skipped here (tensor_size != 1),
        # presumably because test_none() already covers their only axis —
        # confirm.
        if self.tensor_size != 1 and 0 <= pos < self.tensor_size:
            self.compute(pos)
    def compute_some_axes(self, count):
        # Reduce over `count` axes taken from unsorted_axes, restricted
        # to axes valid for this ndim.
        if 0 <= count < self.tensor_size:
            self.compute([i for i in unsorted_axes if i < self.tensor_size][:count])
    # Equivalent to test reduction on all axes.
    def test_none(self):
        self.compute(None)
    def test_axis_1(self):
        self.compute_axis(0)
    def test_axis_2(self):
        self.compute_axis(1)
    def test_axis_3(self):
        self.compute_axis(2)
    def test_axis_4(self):
        self.compute_axis(3)
    def test_axis_5(self):
        self.compute_axis(4)
    # For the tests below, we expect CPU op to run with Python implementation.
    def test_2_axes(self):
        self.compute_some_axes(2)
    def test_3_axes(self):
        self.compute_some_axes(3)
    def test_4_axes(self):
        self.compute_some_axes(4)
class TestScalar(BaseTest, TestCase):
    # 0-D tensor: exercises the scalar special case of the GPU op.
    tensor_size = 0
class TestVector(BaseTest, TestCase):
    # 1-D tensor.
    tensor_size = 1
# Special case
class TestRow(BaseTest, TestCase):
    # 2-D tensor with a first dimension of size 1 (a single row).
    tensor_size = 2
    shape = [1, test_size]
# Special case
class TestColumn(BaseTest, TestCase):
    # 2-D tensor with a second dimension of size 1 (a single column).
    tensor_size = 2
    shape = [test_size, 1]
class TestMatrix(BaseTest, TestCase):
    # 2-D tensor with shape computed from test_size.
    tensor_size = 2
class TestTensor5(BaseTest, TestCase):
    # 5-D tensor: the highest ndim supported by these tests.
    tensor_size = 5
......@@ -15,6 +15,7 @@ from theano.compat import izip
from theano.configparser import config
from theano import gof
from theano.gof import Apply, Constant, Op, Variable
from theano.gof.type import Generic
from theano.tensor import elemwise
from theano.tensor.var import (AsTensorError, TensorVariable,
......@@ -1181,72 +1182,32 @@ class MaxAndArgmax(Op):
nin = 2 # tensor, axis
nout = 2 # max val, max idx
E_axis = 'invalid axis'
__props__ = ()
params_type = Generic()
__props__ = ('axis',)
def make_node(self, x, axis=None):
x = _as_tensor_variable(x)
def __init__(self, axis):
assert isinstance(axis, list)
self.axis = tuple(axis)
if isinstance(axis, (integer_types, numpy.integer)):
axis = [int(axis)]
elif isinstance(axis, numpy.ndarray) and axis.ndim == 0:
axis = [int(axis)]
elif isinstance(axis, (tuple, list, numpy.ndarray)):
axis = [int(a) for a in axis]
if axis == list(range(x.type.ndim)):
axis = None
elif isinstance(axis, Variable):
if NoneConst.equals(axis):
axis = None
elif not isinstance(axis, TensorConstant):
raise TypeError(
"MaxAndArgmax needs a constant axis. Got %s" % axis)
else:
assert (axis.dtype.startswith("int") or
axis.dtype.startswith("uint"))
if isinstance(axis.data, (integer_types, numpy.integer)) or \
(isinstance(axis.data, numpy.ndarray) and
axis.data.ndim == 0):
axis = [int(axis.data)]
elif isinstance(axis.data, (list, numpy.ndarray)):
axis = [int(i) for i in axis.data]
def get_params(self, node):
return self.axis
# Make axis entries non-negative, and sort them
if isinstance(axis, list):
for idx in xrange(len(axis)):
if axis[idx] < 0:
axis[idx] += x.type.ndim
axis.sort()
# Verify that axes are valid
all_axes = []
if isinstance(axis, list):
for ax in axis:
if ax < 0 or ax >= x.type.ndim:
raise ValueError(
'Invalid axis: %s (the number of dimensions of the '
'input is: %s)' % (ax, x.type.ndim))
if ax not in all_axes:
all_axes.append(ax)
else:
all_axes = list(range(x.ndim))
if axis is None or axis == list(range(x.type.ndim)):
axis = NoneConst.clone()
else:
axis = _as_tensor_variable(all_axes)
assert axis.ndim == 1
inputs = [x, axis]
def make_node(self, x):
x = _as_tensor_variable(x)
# We keep the original broadcastable flags for dimensions on which
# we do not perform the max / argmax.
all_axes = set(self.axis)
broadcastable = [b for i, b in enumerate(x.type.broadcastable)
if i not in all_axes]
inputs = [x]
outputs = [tensor(x.type.dtype, broadcastable, name='max'),
tensor('int64', broadcastable, name='argmax')]
return Apply(self, inputs, outputs)
def perform(self, node, inp, outs):
x, axes = inp
def perform(self, node, inp, outs, params):
x = inp[0]
axes = params
max, max_idx = outs
if axes is None:
axes = tuple(range(x.ndim))
......@@ -1269,35 +1230,46 @@ class MaxAndArgmax(Op):
dtype='int64')
def c_code(self, node, name, inp, out, sub):
x, axis = inp
if len(self.axis) != 1 and len(self.axis) != node.inputs[0].ndim:
raise NotImplementedError("NumPy C-API can compute max and argmax only for 1 axis or for all axes.")
x = inp[0]
axis = sub['params']
max, argmax = out
fail = sub["fail"]
if NoneConst.equals(node.inputs[1]):
axis_code = "axis = NPY_MAXDIMS;"
else:
assert node.inputs[1].ndim == 1
# Fall back to perform() if there are multiple axes
if len(node.inputs[1].data) > 1:
raise NotImplementedError()
axis_code = """
axis = ((dtype_%(axis)s*)PyArray_DATA(%(axis)s))[0];
if(axis > PyArray_NDIM(%(x)s)-1 || axis < -PyArray_NDIM(%(x)s)){
ret = """
#if PY_MAJOR_VERSION >= 3
#ifndef PyInt_AS_LONG
#define PyInt_AS_LONG PyLong_AS_LONG
#endif
#endif
int axis;
if (PyTuple_GET_SIZE(%(axis)s) == PyArray_NDIM(%(x)s)) {
axis = NPY_MAXDIMS;
} else if(PyTuple_GET_SIZE(%(axis)s) == 1) {
PyObject* axis_object = PyTuple_GET_ITEM(%(axis)s, 0);
axis = (int)PyInt_AS_LONG(axis_object);
Py_XDECREF(axis_object);
if (axis > PyArray_NDIM(%(x)s)-1 || axis < -PyArray_NDIM(%(x)s)) {
PyErr_SetString(PyExc_ValueError,
"MaxAndArgmax, bad axis argument");
"MaxAndArgmax: bad axis argument");
%(fail)s
}
""" % locals()
ret = """
int axis;
} else {
PyErr_SetString(PyExc_NotImplementedError,
"MaxAndArgmax: NumPy C-API can compute max and argmax only for 1 axis or for all axes.");
%(fail)s
}
Py_CLEAR(%(max)s);
Py_CLEAR(%(argmax)s);//todo pass them as out parameter.
%(axis_code)s
%(max)s = (PyArrayObject*)PyArray_Max(%(x)s, axis, NULL);
if(%(max)s == NULL){
if (%(max)s == NULL) {
%(fail)s;
}
if(!PyArray_CheckExact(%(max)s)){
if (!PyArray_CheckExact(%(max)s)) {
%(max)s = (PyArrayObject*)PyArray_FromAny((PyObject*)%(max)s, NULL, 0, 0, NPY_ARRAY_ENSUREARRAY, NULL);
if(%(max)s == NULL){
%(fail)s;
......@@ -1305,17 +1277,17 @@ class MaxAndArgmax(Op):
}
%(argmax)s = (PyArrayObject*)PyArray_ArgMax(%(x)s, axis, NULL);
if(%(argmax)s == NULL){
if (%(argmax)s == NULL) {
Py_CLEAR(%(max)s);
%(fail)s;
}
if(!PyArray_CheckExact(%(argmax)s)){
if (!PyArray_CheckExact(%(argmax)s)) {
%(argmax)s = (PyArrayObject*)PyArray_FromAny((PyObject*)%(argmax)s, NULL, 0, 0, NPY_ARRAY_ENSUREARRAY, NULL);
if(%(argmax)s == NULL){
%(fail)s;
}
}
if(PyArray_TYPE(%(argmax)s) != NPY_INT64){
if (PyArray_TYPE(%(argmax)s) != NPY_INT64) {
PyObject * tmp = PyArray_Cast(%(argmax)s, NPY_INT64);
if (NULL == tmp){
%(fail)s;
......@@ -1330,28 +1302,25 @@ class MaxAndArgmax(Op):
return (4,)
def infer_shape(self, node, shapes):
ishape, axis_shape = shapes
axis = node.inputs[1]
if axis.data is None:
return [(), ()]
rval = tuple([ishape[i] for (i, b) in enumerate(
node.inputs[0].type.broadcastable) if i not in axis.data])
ishape = shapes[0]
rval = tuple(ishape[i] for (i, b) in enumerate(
node.inputs[0].type.broadcastable) if i not in self.axis)
return [rval, rval]
def R_op(self, inputs, eval_points):
if eval_points[0] is None:
return [None, None]
if not isinstance(inputs[1], theano.Constant):
if len(self.axis) != 1:
raise ValueError(('R_op supported for arg_max only for '
'constant axis!'))
if inputs[1].data > 1:
'one axis!'))
if self.axis[0] > 1:
raise ValueError(('R_op supported for arg_max only when '
' axis is 0 or 1'))
if inputs[0].ndim != 2:
raise ValueError(('R_op supported for arg_max only when '
' input is a matrix'))
max_vals, max_pos = self.make_node(*inputs).outputs
if inputs[1].data == 0:
if self.axis[0] == 0:
return [eval_points[0][max_pos,
arange(eval_points[0].shape[1])], None]
else:
......@@ -1372,7 +1341,8 @@ class MaxAndArgmax(Op):
# g_max has one less dimension than x, so you need to complete
# g_max to x's shape when axis=0 the broadcasting mechanism
# does it automatically
x, axis = inp
x = inp[0]
axis = _as_tensor_variable(self.axis)
g_max, g_max_idx = grads
g_max_disconnected = isinstance(g_max.type, DisconnectedType)
......@@ -1382,15 +1352,10 @@ class MaxAndArgmax(Op):
if g_max_disconnected and g_max_idx_disconnected:
return [DisconnectedType()(), DisconnectedType()()]
axis_grad = grad_undefined(
self, 1, axis,
"argmax is not defined for non-integer axes so"
" argmax(x, axis+eps) is undefined")
# if the max is disconnected but the argmax is not,
# the gradient on its inputs is zero
if g_max_disconnected:
return [x.zeros_like(), axis_grad]
return [x.zeros_like()]
if NoneConst.equals(axis):
axis_ = list(range(x.ndim))
else:
......@@ -1414,9 +1379,7 @@ class MaxAndArgmax(Op):
# Set the grad to the correct position.
g_x = eq(xmax_pad, x) * g_max_pad
return g_x, axis_grad
_max_and_argmax = MaxAndArgmax()
return g_x,
class Argmax(Op):
......@@ -1637,8 +1600,39 @@ def max_and_argmax(a, axis=None, keepdims=False):
will broadcast correctly against the original tensor.
"""
out, argout = _max_and_argmax(a, axis)
# Check axis and convert it to a Python list of integers.
# Axis will be used as an op param of MaxAndArgmax.
if axis is None:
axis = list(range(a.type.ndim))
elif (isinstance(axis, (integer_types, numpy.integer)) or
(isinstance(axis, numpy.ndarray) and axis.ndim == 0)):
axis = [int(axis)]
elif isinstance(axis, (tuple, list, numpy.ndarray)):
axis = [int(i) for i in axis]
elif isinstance(axis, Variable):
if NoneConst.equals(axis):
axis = list(range(a.type.ndim))
elif not isinstance(axis, TensorConstant):
raise TypeError("max and argmax computation needs a constant axis. Got %s" % axis)
else:
assert (axis.dtype.startswith("int") or axis.dtype.startswith("uint"))
if (isinstance(axis.data, (integer_types, numpy.integer)) or
(isinstance(axis.data, numpy.ndarray) and axis.data.ndim == 0)):
axis = [int(axis.data)]
elif isinstance(axis.data, (list, numpy.ndarray)):
axis = [int(i) for i in axis.data]
if len(axis) == 0:
axis = list(range(a.type.ndim))
else:
for i in range(len(axis)):
if axis[i] < 0:
axis[i] += a.type.ndim
if axis[i] < 0 or axis[i] >= a.type.ndim:
raise ValueError("max and argmax computation needs a valid axis number for %d-D tensor. Got %d"
% (a.type.ndim, axis[i]))
axis = list(set(axis))
axis.sort()
out, argout = MaxAndArgmax(axis)(a)
if keepdims:
out = makeKeepDims(a, out, axis)
......
......@@ -1568,9 +1568,9 @@ def local_softmax_grad_to_crossentropy_with_softmax_grad(node):
@opt.register_specialize('fast_compile_gpu')
@gof.local_optimizer([tensor._max_and_argmax])
@gof.local_optimizer([tensor.MaxAndArgmax])
def local_argmax_pushdown(node):
if node.op == tensor._max_and_argmax and node.inputs[0].owner and \
if isinstance(node.op, tensor.MaxAndArgmax) and node.inputs[0].owner and \
len(node.outputs[0].clients) > 0 and node.inputs[0].owner.op in \
(softmax_op, softplus, tensor.exp, tensor.log, tensor.tanh, sigmoid,
softmax_with_bias):
......@@ -1584,23 +1584,24 @@ def local_argmax_pushdown(node):
"warning set the Theano flags 'warn.argmax_pushdown_bug' "
"to False")
if (node.op == tensor._max_and_argmax and
if (isinstance(node.op, tensor.MaxAndArgmax) and
node.inputs[0].owner and len(node.outputs[0].clients) == 0):
x_max, x_argmax = node.outputs
x, axis = node.inputs
x = node.inputs[0]
axis = node.op.get_params(node)
# TODO: Make a list/set of monotonic ops...
if x.owner and x.owner.op in (softmax_op, softplus, tensor.exp,
tensor.log, tensor.tanh, sigmoid):
pre_x, = x.owner.inputs
ret = tensor._max_and_argmax(pre_x, axis)
ret = tensor.max_and_argmax(pre_x, axis)
copy_stack_trace(x_max, ret)
return ret
if x.owner and x.owner.op == softmax_with_bias:
pre_x, pre_bias = x.owner.inputs
ret = tensor._max_and_argmax(pre_x +
tensor.DimShuffle(
pre_bias.broadcastable,
('x', 0))(pre_bias), axis)
ret = tensor.max_and_argmax(pre_x +
tensor.DimShuffle(
pre_bias.broadcastable,
('x', 0))(pre_bias), axis)
# copy both stack traces
copy_stack_trace(x_max, ret)
return ret
......
......@@ -41,8 +41,6 @@ from theano.tensor.elemwise import CAReduce
from theano.tensor import basic as T
from theano.tensor import DimShuffle
from theano.tensor.basic import (get_scalar_constant_value,
NotScalarConstantError)
from theano.tensor.opt import register_uncanonicalize
from theano import scalar as scal
......@@ -50,31 +48,19 @@ _logger = logging.getLogger('theano.tensor.opt')
@register_uncanonicalize
@gof.local_optimizer([T._max_and_argmax])
@gof.local_optimizer([T.MaxAndArgmax])
def local_max_and_argmax(node):
"""
If we don't use the argmax, change it to a max only.
"""
if node.op == T._max_and_argmax:
if isinstance(node.op, T.MaxAndArgmax):
axis = node.op.get_params(node)
if len(node.outputs[1].clients) == 0:
# MaxAndArgmax support variable axis,
# but CAReduce support only constant axis.
if node.inputs[1].data is None:
axis = None
else:
try:
axis = get_scalar_constant_value(node.inputs[1])
except NotScalarConstantError:
axis = node.inputs[1]
if not isinstance(axis, T.TensorConstant):
return False
axis = axis.data
new = CAReduce(scal.maximum, axis)(node.inputs[0])
return [new, None]
if len(node.outputs[0].clients) == 0:
return [None, T._argmax(node.inputs[0], node.inputs[1])]
return [None, T._argmax(node.inputs[0], axis)]
@register_uncanonicalize
......
......@@ -7619,23 +7619,23 @@ class TestInferShape(utt.InferShapeTester):
# MaxAndArgmax,
adtens3_val = rand(4, 5, 3)
self._compile_and_check([adtens3],
MaxAndArgmax()(adtens3, None),
max_and_argmax(adtens3, None),
[adtens3_val], MaxAndArgmax)
self._compile_and_check([adtens3],
MaxAndArgmax()(adtens3, 0),
max_and_argmax(adtens3, 0),
[adtens3_val], MaxAndArgmax)
self._compile_and_check([adtens3],
MaxAndArgmax()(adtens3, 1),
max_and_argmax(adtens3, 1),
[adtens3_val], MaxAndArgmax)
self._compile_and_check([adtens3],
MaxAndArgmax()(adtens3, 2),
max_and_argmax(adtens3, 2),
[adtens3_val], MaxAndArgmax)
self._compile_and_check([adtens3],
MaxAndArgmax()(adtens3, [0, 1, 2]),
max_and_argmax(adtens3, [0, 1, 2]),
[adtens3_val], MaxAndArgmax)
# ARange
......
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to comment