提交 3347480a authored 作者: Pascal Lamblin's avatar Pascal Lamblin 提交者: GitHub

Merge pull request #5168 from notoraptor/master

This is my proposal for GpuMaxAndArgmax (issue #1399).
......@@ -28,7 +28,7 @@ from .type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor,
reg_context, get_context, ContextNotDefined, _get_props)
from .basic_ops import as_gpuarray_variable
from . import fft, dnn, opt, nerv, extra_ops, multinomial
from . import fft, dnn, opt, nerv, extra_ops, multinomial, reduction
def transfer(x, target):
try:
......
......@@ -65,6 +65,7 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20)
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
from .reduction import GpuMaxAndArgmax
_logger = logging.getLogger("theano.gpuarray.opt")
......@@ -1775,6 +1776,14 @@ def _scan_type_infer(node):
context_name=context_name)
return typebuild
# Add optimization : maxandargmax (CPU -> GPU)
@register_opt('fast_compile')
@op_lifter([tensor.MaxAndArgmax])
@register_opt2([tensor.MaxAndArgmax], 'fast_compile')
def local_gpu_maxandargmax(op, context_name, inputs, outputs):
    # Lift the CPU MaxAndArgmax op to its GPU counterpart.  The reduction
    # axes are forwarded through get_params(), which ignores its node
    # argument and simply returns the op's axis tuple.
    return GpuMaxAndArgmax(op.get_params(None))
# Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled.
optdb.register('gpua_scanOp_make_inplace',
......
from __future__ import print_function, absolute_import, division
import os
import theano
from theano.gof import Op, Apply
from theano.gof.type import Generic
from .basic_ops import (infer_context_name, as_gpuarray_variable)
from .type import GpuArrayType
try:
import pygpu
except ImportError as e:
pass
class GpuMaxAndArgmax(Op):
    """
    GPU version of MaxAndArgmax.

    Computes, in one GPU pass, both the maximum of the input over the given
    axes and the index of that maximum, by delegating to libgpuarray's
    ``GpuArray_maxandargmax``.  The argmax output is an index into the
    flattened reduced dimensions, matching the CPU MaxAndArgmax op.
    """
    params_type = Generic()
    __props__ = ('axis',)
    # dtype of the argmax output (same as the CPU MaxAndArgmax op).
    argmax_dtype = "int64"

    def __init__(self, axis):
        # `axis` must be an explicit sequence of ints; normalize it to a
        # tuple so the op is hashable (required by __props__).
        assert isinstance(axis, (list, tuple))
        self.axis = tuple(axis)

    def get_params(self, node):
        # The params object handed to the C code is simply the axis tuple;
        # the `node` argument is unused.
        return self.axis

    def make_node(self, X):
        context_name = infer_context_name(X)
        # We keep the original broadcastable flags for dimensions on which
        # we do not perform the max / argmax.
        all_axes = set(self.axis)
        broadcastable = [b for i, b in enumerate(X.type.broadcastable)
                         if i not in all_axes]
        inputs = [as_gpuarray_variable(X, context_name)]
        outputs = [GpuArrayType(X.type.dtype, broadcastable, context_name=context_name, name='max')(),
                   GpuArrayType(self.argmax_dtype, broadcastable, context_name=context_name, name='argmax')()]
        return Apply(self, inputs, outputs)

    def c_headers(self):
        return ['<numpy_compat.h>', '<gpuarray_helper.h>']

    def c_header_dirs(self):
        return [pygpu.get_include(), os.path.dirname(__file__)]

    def c_code(self, node, name, input_names, output_names, sub):
        """Generate C code that prepares the outputs and calls
        ``GpuArray_maxandargmax``.  The buffers malloc'd here are released
        in :meth:`c_code_cleanup`."""
        # Recall: X = input_names[0]
        # Recall: axes = sub['params']  (the axis tuple from get_params)
        # Recall: max, argmax = output_names
        # Recall: fail = sub['fail']
        max_typecode = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
        argmax_typecode = pygpu.gpuarray.dtype_to_typecode(self.argmax_dtype)
        ret = """
        #if PY_MAJOR_VERSION >= 3
            #ifndef PyInt_AS_LONG
                #define PyInt_AS_LONG PyLong_AS_LONG
            #endif
        #endif

        unsigned %(name)s_redux_len = PyTuple_GET_SIZE(%(axes)s);
        unsigned* %(name)s_axes_to_reduce = (unsigned*)malloc(%(name)s_redux_len * sizeof(unsigned));
        for (unsigned i = 0; i < %(name)s_redux_len; ++i) {
            /* PyTuple_GET_ITEM returns a *borrowed* reference: it must not
             * be decref'd (doing so would corrupt the refcount of the axis
             * integer objects). */
            PyObject* axis_object = PyTuple_GET_ITEM(%(axes)s, i);
            %(name)s_axes_to_reduce[i] = (unsigned) PyInt_AS_LONG(axis_object);
        }

        /* Output shape = input shape minus the reduced axes. */
        size_t %(name)s_input_ndim = PyGpuArray_NDIM(%(X)s);
        size_t %(name)s_output_ndim = %(name)s_input_ndim - %(name)s_redux_len;
        size_t* %(name)s_output_dims = (size_t*)malloc(%(name)s_output_ndim * sizeof(size_t));
        if (%(name)s_redux_len == 1) {
            /* Single reduced axis: copy every other dimension across. */
            for (unsigned i = 0; i < %(name)s_axes_to_reduce[0]; ++i) {
                %(name)s_output_dims[i] = PyGpuArray_DIM(%(X)s, i);
            }
            for (unsigned i = %(name)s_axes_to_reduce[0] + 1; i < %(name)s_input_ndim; ++i) {
                %(name)s_output_dims[i-1] = PyGpuArray_DIM(%(X)s, i);
            }
        } else {
            /* NOTE(review): this branch walks the reduced axes in the order
             * they appear in axes_to_reduce and assumes they are sorted in
             * increasing order; the CPU op appears to provide them sorted
             * -- TODO confirm. */
            int64_t current_input_pos = -1;
            int64_t current_output_pos = -1;
            for (unsigned i = 0; i < %(name)s_redux_len; ++i) {
                for (++current_input_pos; current_input_pos < %(name)s_axes_to_reduce[i]; ++current_input_pos) {
                    %(name)s_output_dims[++current_output_pos] = PyGpuArray_DIM(%(X)s, current_input_pos);
                }
            }
            for (++current_input_pos; current_input_pos < %(name)s_input_ndim; ++current_input_pos) {
                %(name)s_output_dims[++current_output_pos] = PyGpuArray_DIM(%(X)s, current_input_pos);
            }
        }

        if (theano_prep_output(&%(max)s, %(name)s_output_ndim, %(name)s_output_dims, %(max_typecode)s, GA_C_ORDER, %(X)s->context)) {
            PyErr_SetString(PyExc_RuntimeError, "GpuMaxAndArgmax: unable to prepare max output.");
            %(fail)s
        }
        if (theano_prep_output(&%(argmax)s, %(name)s_output_ndim, %(name)s_output_dims, %(argmax_typecode)s, GA_C_ORDER, %(X)s->context)) {
            PyErr_SetString(PyExc_RuntimeError, "GpuMaxAndArgmax: unable to prepare argmax output.");
            %(fail)s
        }

        if (%(name)s_input_ndim == 0) {
            /* GpuArray_maxandargmax can't handle a 0-d array
             * because it expects that 1 <= redux_len <= input_ndim.
             * As input_ndim == 0, then 1 <= redux_len <= 0 is false.
             * To handle this case we copy input to max and we set argmax to 0.
             */
            if (GA_NO_ERROR != GpuArray_setarray(&%(max)s->ga, &%(X)s->ga)) {
                PyErr_SetString(PyExc_RuntimeError, "GpuMaxAndArgmax: unable to copy input to max when input is a scalar.");
                %(fail)s
            }
            if (GA_NO_ERROR != GpuArray_memset(&%(argmax)s->ga, 0)) {
                PyErr_SetString(PyExc_RuntimeError, "GpuMaxAndArgmax: unable to set argmax to 0 when input is a scalar.");
                %(fail)s
            }
        } else if (GA_NO_ERROR !=
            GpuArray_maxandargmax(&%(max)s->ga, &%(argmax)s->ga, &%(X)s->ga, %(name)s_redux_len, %(name)s_axes_to_reduce)
        ) {
            PyErr_SetString(PyExc_RuntimeError, "GpuMaxAndArgmax: unable to compute gpuarray maxandargmax.");
            %(fail)s
        }
        """
        if theano.config.gpuarray.sync:
            ret += """
        GpuArray_sync(&%(max)s->ga);
        GpuArray_sync(&%(argmax)s->ga);
        """
        return ret % {'X': input_names[0], 'axes': sub['params'], 'max': output_names[0], 'argmax': output_names[1],
                      'max_typecode': max_typecode, 'argmax_typecode': argmax_typecode,
                      'name': name, 'fail': sub['fail']}

    def c_code_cleanup(self, node, name, inputs, outputs, sub):
        # Release the temporary buffers allocated in c_code.
        return """
        free(%(name)s_output_dims);
        free(%(name)s_axes_to_reduce);
        """ % {'name': name}
from __future__ import print_function, absolute_import, division
from unittest import TestCase
import numpy as np
import theano
import theano.tensor as T
from theano.tests import unittest_tools as utt
from theano.tests.unittest_tools import SkipTest
from .config import mode_with_gpu, mode_without_gpu
from .test_basic_ops import rand_gpuarray
from .. import GpuArrayType
import math
# Number of values to be used in test tensors (except with 0-D tensors!).
test_size = 10000000
# NB: This order of "unsorted axes" is arbitrary and is here
# just to have the same information on profile output
# from one test to another.
unsorted_axes = (2, 4, 0, 3, 1)
# No argument: seed from OS entropy, so each test run uses fresh random data.
np.random.seed()
def numpy_random_array(shapes):
    """Return an array of the given shape filled with samples drawn from a
    standard normal distribution, cast to Theano's floatX dtype."""
    n_elements = int(np.prod(shapes, dtype='int64'))
    samples = np.random.normal(size=n_elements)
    return samples.astype(theano.config.floatX).reshape(shapes)
def numpy_maxandargmax(X, axis=None):
    """NumPy reference for a joint max / argmax over several axes.

    Mirrors MaxAndArgmax.perform(): since NumPy's argmax handles a single
    axis only, the kept axes are moved to the front and every reduced axis
    is collapsed into one trailing dimension, so the returned argmax is an
    index into the flattened reduced dimensions.
    """
    # Normalize `axis` to a sorted, duplicate-free tuple of ints.
    if axis is None:
        axis = tuple(range(X.ndim))
    elif isinstance(axis, (tuple, list)):
        axis = tuple(sorted(set(axis)))
    else:
        axis = (int(axis),)
    ref_max = np.max(X, axis=axis)
    # Kept (non-reduced) axes go in front; reduced axes are collapsed.
    kept = tuple(i for i in range(X.ndim) if i not in axis)
    moved = np.transpose(X, kept + axis)
    n_kept = len(kept)
    collapsed = 1
    for extent in moved.shape[n_kept:]:
        collapsed *= extent
    flat = moved.reshape(moved.shape[:n_kept] + (collapsed,))
    return (ref_max, np.argmax(flat, axis=-1))
def check_if_gpu_maxandargmax_in_graph(theano_function):
    """Assert that the compiled graph contains at least one GpuMaxAndArgmax node."""
    graph_nodes = theano_function.maker.fgraph.apply_nodes
    assert any(isinstance(node.op, theano.gpuarray.reduction.GpuMaxAndArgmax)
               for node in graph_nodes)
def check_if_gpu_maxandargmax_not_in_graph(theano_function):
    """Assert that the compiled graph contains no GpuMaxAndArgmax node."""
    graph_nodes = theano_function.maker.fgraph.apply_nodes
    assert not any(isinstance(node.op, theano.gpuarray.reduction.GpuMaxAndArgmax)
                   for node in graph_nodes)
class BaseTest:
    """Shared machinery: run max/argmax on host and on GPU over the same
    random tensor and compare both against the NumPy reference."""

    # Subclasses must set the tensor rank; `shape` may also be overridden.
    tensor_size = None
    shape = None
    dtype = theano.config.floatX

    def get_shape(self):
        # Choose one extent per dimension so the total element count is
        # roughly `test_size`, whatever the rank.
        if self.tensor_size == 0:
            return []
        edge = int(math.ceil(math.pow(test_size, 1 / self.tensor_size)))
        return [edge] * self.tensor_size

    def setUp(self):
        if not isinstance(self.tensor_size, int):
            raise SkipTest("No tensor ndim defined.")
        if self.tensor_size < 0 or self.tensor_size > 5:
            raise SkipTest("We allow from 0 (included) to 5 (inclued) dimensons for these tests.")
        if self.shape is None:
            self.shape = self.get_shape()

    def get_host_tensor(self):
        # Symbolic host tensor with no broadcastable dimensions.
        return T.tensor(self.dtype, (False,) * self.tensor_size)

    def get_gpu_tensor(self):
        # Symbolic GPU tensor with no broadcastable dimensions.
        return GpuArrayType(self.dtype, (False,) * self.tensor_size)()

    def get_host_value(self):
        return numpy_random_array(self.shape)

    def get_gpu_value(self):
        return rand_gpuarray(*self.shape)

    # NB: In compute_host() and compute_gpu(),
    # the first call of the theano function should be ignored in profiling,
    # with Theano config flag profiling.ignore_first_call=True.
    def compute_host(self, test_tensor, axis):
        symbolic = self.get_host_tensor()
        fn = theano.function(
            [symbolic], [T.max(symbolic, axis=axis), T.argmax(symbolic, axis=axis)],
            name='shape:' + str(test_tensor.shape) + '/axis:' + str(axis) + '/HOST',
            mode=mode_without_gpu)
        check_if_gpu_maxandargmax_not_in_graph(fn)
        # Warm-up call (see NB above about profiling).
        fn(test_tensor)
        theano_max, theano_argmax = fn(test_tensor)
        ref_max, ref_argmax = numpy_maxandargmax(test_tensor, axis=axis)
        utt.assert_allclose(ref_max, theano_max)
        utt.assert_allclose(ref_argmax, theano_argmax)

    def compute_gpu(self, test_gpu_tensor, test_host_tensor, axis):
        symbolic = self.get_gpu_tensor()
        fn = theano.function(
            [symbolic], [T.max(symbolic, axis=axis), T.argmax(symbolic, axis=axis)],
            name='shape:' + str(test_gpu_tensor.shape) + '/axis:' + str(axis) + '/GPU',
            mode=mode_with_gpu)
        check_if_gpu_maxandargmax_in_graph(fn)
        # Warm-up call (see NB above about profiling).
        fn(test_gpu_tensor)
        theano_max, theano_argmax = fn(test_gpu_tensor)
        ref_max, ref_argmax = numpy_maxandargmax(test_host_tensor, axis=axis)
        utt.assert_allclose(ref_max, theano_max)
        utt.assert_allclose(ref_argmax, theano_argmax)

    def compute(self, axis=None):
        # We want to run CPU op and GPU op on the same tensor randomly generated.
        gpu_value = self.get_gpu_value()
        host_value = np.asarray(gpu_value)
        self.compute_host(host_value, axis)
        self.compute_gpu(gpu_value, host_value, axis)

    def compute_axis(self, pos):
        # Skipped for vectors (axis 0 is the only choice, covered by None)
        # and for out-of-range positions.
        if self.tensor_size == 1:
            return
        if 0 <= pos < self.tensor_size:
            self.compute(pos)

    def compute_some_axes(self, count):
        if not (0 <= count < self.tensor_size):
            return
        candidates = [a for a in unsorted_axes if a < self.tensor_size]
        self.compute(candidates[:count])

    # Equivalent to test reduction on all axes.
    def test_none(self):
        self.compute(None)

    def test_axis_1(self):
        self.compute_axis(0)

    def test_axis_2(self):
        self.compute_axis(1)

    def test_axis_3(self):
        self.compute_axis(2)

    def test_axis_4(self):
        self.compute_axis(3)

    def test_axis_5(self):
        self.compute_axis(4)

    # For the tests below, we expect CPU op to run with Python implementation.
    def test_2_axes(self):
        self.compute_some_axes(2)

    def test_3_axes(self):
        self.compute_some_axes(3)

    def test_4_axes(self):
        self.compute_some_axes(4)
# 0-D case: exercises the scalar special path of the op.
class TestScalar(BaseTest, TestCase):
    tensor_size = 0
# 1-D case: per-axis tests are no-ops here (compute_axis skips rank 1).
class TestVector(BaseTest, TestCase):
    tensor_size = 1
# Special case: degenerate leading dimension of size 1 (row matrix).
class TestRow(BaseTest, TestCase):
    tensor_size = 2
    shape = [1, test_size]
# Special case: degenerate trailing dimension of size 1 (column matrix).
class TestColumn(BaseTest, TestCase):
    tensor_size = 2
    shape = [test_size, 1]
# 2-D case with a roughly square shape derived from test_size.
class TestMatrix(BaseTest, TestCase):
    tensor_size = 2
# 5-D case: the highest rank these tests allow (see BaseTest.setUp).
class TestTensor5(BaseTest, TestCase):
    tensor_size = 5
差异被折叠。
......@@ -1568,9 +1568,9 @@ def local_softmax_grad_to_crossentropy_with_softmax_grad(node):
@opt.register_specialize('fast_compile_gpu')
@gof.local_optimizer([tensor._max_and_argmax])
@gof.local_optimizer([tensor.MaxAndArgmax])
def local_argmax_pushdown(node):
if node.op == tensor._max_and_argmax and node.inputs[0].owner and \
if isinstance(node.op, tensor.MaxAndArgmax) and node.inputs[0].owner and \
len(node.outputs[0].clients) > 0 and node.inputs[0].owner.op in \
(softmax_op, softplus, tensor.exp, tensor.log, tensor.tanh, sigmoid,
softmax_with_bias):
......@@ -1584,23 +1584,24 @@ def local_argmax_pushdown(node):
"warning set the Theano flags 'warn.argmax_pushdown_bug' "
"to False")
if (node.op == tensor._max_and_argmax and
if (isinstance(node.op, tensor.MaxAndArgmax) and
node.inputs[0].owner and len(node.outputs[0].clients) == 0):
x_max, x_argmax = node.outputs
x, axis = node.inputs
x = node.inputs[0]
axis = node.op.get_params(node)
# TODO: Make a list/set of monotonic ops...
if x.owner and x.owner.op in (softmax_op, softplus, tensor.exp,
tensor.log, tensor.tanh, sigmoid):
pre_x, = x.owner.inputs
ret = tensor._max_and_argmax(pre_x, axis)
ret = tensor.max_and_argmax(pre_x, axis)
copy_stack_trace(x_max, ret)
return ret
if x.owner and x.owner.op == softmax_with_bias:
pre_x, pre_bias = x.owner.inputs
ret = tensor._max_and_argmax(pre_x +
tensor.DimShuffle(
pre_bias.broadcastable,
('x', 0))(pre_bias), axis)
ret = tensor.max_and_argmax(pre_x +
tensor.DimShuffle(
pre_bias.broadcastable,
('x', 0))(pre_bias), axis)
# copy both stack traces
copy_stack_trace(x_max, ret)
return ret
......
......@@ -41,8 +41,6 @@ from theano.tensor.elemwise import CAReduce
from theano.tensor import basic as T
from theano.tensor import DimShuffle
from theano.tensor.basic import (get_scalar_constant_value,
NotScalarConstantError)
from theano.tensor.opt import register_uncanonicalize
from theano import scalar as scal
......@@ -50,31 +48,19 @@ _logger = logging.getLogger('theano.tensor.opt')
@register_uncanonicalize
@gof.local_optimizer([T._max_and_argmax])
@gof.local_optimizer([T.MaxAndArgmax])
def local_max_and_argmax(node):
"""
If we don't use the argmax, change it to a max only.
"""
if node.op == T._max_and_argmax:
if isinstance(node.op, T.MaxAndArgmax):
axis = node.op.get_params(node)
if len(node.outputs[1].clients) == 0:
# MaxAndArgmax support variable axis,
# but CAReduce support only constant axis.
if node.inputs[1].data is None:
axis = None
else:
try:
axis = get_scalar_constant_value(node.inputs[1])
except NotScalarConstantError:
axis = node.inputs[1]
if not isinstance(axis, T.TensorConstant):
return False
axis = axis.data
new = CAReduce(scal.maximum, axis)(node.inputs[0])
return [new, None]
if len(node.outputs[0].clients) == 0:
return [None, T._argmax(node.inputs[0], node.inputs[1])]
return [None, T._argmax(node.inputs[0], axis)]
@register_uncanonicalize
......
......@@ -7619,23 +7619,23 @@ class TestInferShape(utt.InferShapeTester):
# MaxAndArgmax,
adtens3_val = rand(4, 5, 3)
self._compile_and_check([adtens3],
MaxAndArgmax()(adtens3, None),
max_and_argmax(adtens3, None),
[adtens3_val], MaxAndArgmax)
self._compile_and_check([adtens3],
MaxAndArgmax()(adtens3, 0),
max_and_argmax(adtens3, 0),
[adtens3_val], MaxAndArgmax)
self._compile_and_check([adtens3],
MaxAndArgmax()(adtens3, 1),
max_and_argmax(adtens3, 1),
[adtens3_val], MaxAndArgmax)
self._compile_and_check([adtens3],
MaxAndArgmax()(adtens3, 2),
max_and_argmax(adtens3, 2),
[adtens3_val], MaxAndArgmax)
self._compile_and_check([adtens3],
MaxAndArgmax()(adtens3, [0, 1, 2]),
max_and_argmax(adtens3, [0, 1, 2]),
[adtens3_val], MaxAndArgmax)
# ARange
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论