Commit 66277226 authored by Frédéric Bastien, committed by GitHub

Merge pull request #6088 from nouiz/float16

Fix opt crash in float16 and enable C code for MaxAndArgmax and Argmax
......@@ -1911,12 +1911,11 @@ def local_gpu_elemwise_careduce(node):
# operation with some reduction pattern will probably results
# in slow down.
isinstance(node.inputs[0].owner.op.scalar_op, scalar.basic.Sqr)):
op = node.op
inp = node.inputs[0].owner.inputs[0]
return [GpuCAReduceCuda(scalar_op=op.scalar_op,
axis=op.axis,
reduce_mask=op.reduce_mask,
pre_scalar_op=scalar.basic.sqr)(inp)]
props = node.op._props_dict()
props["pre_scalar_op"] = scalar.basic.sqr
out = GpuCAReduceCuda(**props)(inp)
return [out]
@local_optimizer(None)
......
......@@ -1219,6 +1219,7 @@ class MaxAndArgmax(Op):
E_axis = 'invalid axis'
params_type = Generic()
__props__ = ('axis',)
_f16_ok = True
def __init__(self, axis):
assert isinstance(axis, list)
......@@ -1427,6 +1428,7 @@ class Argmax(Op):
nout = 1
E_axis = 'invalid axis'
__props__ = ()
_f16_ok = True
def make_node(self, x, axis=None):
x = _as_tensor_variable(x)
......
......@@ -41,7 +41,7 @@ from theano.tensor import (
inplace, iscalar, matrix, minimum, matrices, maximum, mul, neq,
Reshape, row, scalar, scalars, second, smallest, stack, sub, Tensor,
tensor_copy, tensordot, TensorType, Tri, tri, tril, triu, unbroadcast,
var, Join, shape, MaxAndArgmax, lscalar, zvector, exp,
var, Argmax, Join, shape, MaxAndArgmax, lscalar, zvector, exp,
get_scalar_constant_value, ivector, reshape, scalar_from_tensor, scal,
iscalars, arange, dscalars, fvector, imatrix, numeric_grad,
opt, lvector, true_div, max, min, Split, roll,
......@@ -106,8 +106,11 @@ def inplace_func(inputs, outputs, mode=None, allow_input_downcast=False,
name=name)
def eval_outputs(outputs):
variables = inplace_func([], outputs)()
def eval_outputs(outputs, ops=(), mode=None):
    """Compile *outputs* (a graph with no inputs), run it once and return
    the computed values.

    If *ops* is a non-empty tuple of Op classes, assert that the compiled
    graph contains at least one apply node whose op is an instance of one
    of them.  A one-element list/tuple result is unwrapped to the element.
    """
    fn = inplace_func([], outputs, mode=mode)
    results = fn()
    if ops:
        # Make sure the optimization/lowering we are testing actually
        # produced one of the expected ops in the final graph.
        matching = [node for node in fn.maker.fgraph.apply_nodes
                    if isinstance(node.op, ops)]
        assert matching
    if isinstance(results, (list, tuple)) and len(results) == 1:
        return results[0]
    return results
......@@ -3106,6 +3109,21 @@ class T_max_and_argmax(unittest.TestCase):
v_shape = eval_outputs(max_and_argmax(n, axis)[0].shape)
assert tuple(v_shape) == np.max(data, np_axis).shape
def test2_float16(self):
    # Use negative values and a wide range so that a buggy backend that
    # reinterprets float16 data as uint16 would compute a wrong argmax.
    data = (rand(20, 30).astype("float16") - 0.5) * 20
    n = shared(data)
    cases = [(-1, -1), (0, 0), (1, 1), (None, None),
             ([0, 1], None), ([1, 0], None),
             (NoneConst.clone(), None),
             (constant(0), 0)]
    for axis, np_axis in cases:
        # Require that a MaxAndArgmax node survives in the compiled graph.
        v, i = eval_outputs(max_and_argmax(n, axis), (MaxAndArgmax,))
        assert i.dtype == 'int64'
        self.assertTrue(np.all(v == np.max(data, np_axis)))
        self.assertTrue(np.all(i == np.argmax(data, np_axis)))
        # The shape-only graph must agree with numpy as well.
        v_shape = eval_outputs(max_and_argmax(n, axis)[0].shape)
        assert tuple(v_shape) == np.max(data, np_axis).shape
def test2_invalid(self):
n = as_tensor_variable(rand(2, 3))
# Silence expected error messages
......@@ -3321,6 +3339,19 @@ class T_argmin_argmax(unittest.TestCase):
v_shape = eval_outputs(fct(n, axis).shape)
assert tuple(v_shape) == nfct(data, np_axis).shape
def test2_float16(self):
    # Use negative values and a wide range so that a buggy backend that
    # reinterprets float16 data as uint16 would compute a wrong argmax.
    data = (rand(20, 30).astype("float16") - 0.5) * 20
    n = shared(data)
    mode = get_default_mode().including("local_max_and_argmax",
                                        "uncanonicalize")
    for fct, nfct in [(argmax, np.argmax), (argmin, np.argmin)]:
        for axis, np_axis in [(-1, -1), (0, 0), (1, 1), (None, None),
                              ([0, 1], None), ([1, 0], None)]:
            # Require that an Argmax node survives in the optimized graph.
            out = eval_outputs(fct(n, axis), (Argmax,), mode=mode)
            self.assertTrue(np.all(out == nfct(data, np_axis)))
            # The shape-only graph must agree with numpy as well.
            out_shape = eval_outputs(fct(n, axis).shape, mode=mode)
            assert tuple(out_shape) == nfct(data, np_axis).shape
def test2_invalid(self):
for fct, nfct in [(argmax, np.argmax), (argmin, np.argmin)]:
n = as_tensor_variable(rand(2, 3))
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论