Commit 91bc16c3 authored by notoraptor

Optimize MAX(ABS(X)).

Parent c2c28793
...@@ -3765,6 +3765,9 @@ def local_dnn_reduction(node): ...@@ -3765,6 +3765,9 @@ def local_dnn_reduction(node):
scal = 'norm1' scal = 'norm1'
else: else:
return return
elif (isinstance(node.op.scalar_op, theano.scalar.basic.Maximum) and
isinstance(node.op.pre_scalar_op, theano.scalar.basic.Abs)):
scal = 'absmax'
else: else:
return return
......
...@@ -1207,7 +1207,7 @@ def local_gpua_careduce(op, context_name, inputs, outputs): ...@@ -1207,7 +1207,7 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
return False return False
x, = inputs x, = inputs
idtype = x.dtype idtype = x.dtype
adtype = getattr(op, 'acc_dtype', None) adtype = getattr(op, 'acc_dtype', idtype)
odtype = getattr(op, 'dtype', outputs[0].dtype) odtype = getattr(op, 'dtype', outputs[0].dtype)
# Force accumulator to float32 for float32 inputs since tree # Force accumulator to float32 for float32 inputs since tree
......
...@@ -1597,6 +1597,20 @@ def test_dnn_reduction_sum_abs(): ...@@ -1597,6 +1597,20 @@ def test_dnn_reduction_sum_abs():
utt.assert_allclose(np.abs(M_val).sum(axis=axis), f(M_val)) utt.assert_allclose(np.abs(M_val).sum(axis=axis), f(M_val))
def test_dnn_reduction_absmax():
    """Check that abs(M).max(axis=...) is rewritten into a single cuDNN
    GpuDnnReduction node with red_op == 'absmax', and that it computes
    the same values as NumPy.

    NOTE(review): indentation reconstructed from a diff view — the loop
    body placement (function build, graph assert, value check all inside
    the per-axis loop) is the presumed structure; confirm against the
    original file.
    """
    # The 'absmax' reduction requires cuDNN; version >= 6000 is presumably
    # when cudnnReduceTensor with AMAX became available — skip otherwise.
    if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 6000:
        raise SkipTest(dnn.dnn_available.msg)
    M = T.matrix()
    # Exercise the full reduction (axis=None) and each single axis.
    for axis in (None, 0, 1):
        out = abs(M).max(axis=axis)
        f = theano.function([M], out, mode=mode_with_gpu)
        # The optimizer must have fused max(abs(.)) into one cuDNN
        # reduction node tagged 'absmax'.
        assert any(isinstance(node.op, dnn.GpuDnnReduction) and node.op.red_op == 'absmax'
                   for node in f.maker.fgraph.apply_nodes)
        # Numerical check against the NumPy reference result.
        M_val = np.random.random((4, 5)).astype(theano.config.floatX)
        utt.assert_allclose(np.max(np.abs(M_val), axis=axis), f(M_val))
def dnn_reduction_strides(shp, shuffle, slice): def dnn_reduction_strides(shp, shuffle, slice):
utt.fetch_seed() utt.fetch_seed()
inp = GpuArrayType('float32', (False,) * len(shp), inp = GpuArrayType('float32', (False,) * len(shp),
......
...@@ -363,44 +363,28 @@ def test_local_gpu_elemwise_careduce(): ...@@ -363,44 +363,28 @@ def test_local_gpu_elemwise_careduce():
mode_with_gpu_no_cudnn = mode_with_gpu.excluding('cudnn') mode_with_gpu_no_cudnn = mode_with_gpu.excluding('cudnn')
x = theano.tensor.matrix() x = theano.tensor.matrix()
o = (x * x).sum() def fn_sum_square(x, axis):
f = theano.function([x], o, mode=mode_with_gpu_no_cudnn) return (x * x).sum(axis=axis)
topo = f.maker.fgraph.toposort()
assert len(topo) == 3 def fn_sum_abs(x, axis):
assert isinstance(topo[1].op, GpuCAReduceCuda) return abs(x).sum(axis=axis)
assert topo[1].op.pre_scalar_op == theano.scalar.sqr
assert _check_stack_trace(f) def fn_max_abs(x, axis):
data = np.random.rand(3, 4).astype(theano.config.floatX) return abs(x).max(axis=axis)
utt.assert_allclose(f(data), (data * data).sum())
for fn, pre_scalar_op in ((fn_sum_square, theano.scalar.sqr),
o = (x * x).sum(axis=1) (fn_sum_abs, theano.scalar.abs_),
f = theano.function([x], o, mode=mode_with_gpu_no_cudnn) (fn_max_abs, theano.scalar.abs_)):
topo = f.maker.fgraph.toposort() for axis in (None, 0, 1):
assert len(topo) == 3 o = fn(x, axis)
assert isinstance(topo[1].op, GpuCAReduceCuda) f = theano.function([x], o, mode=mode_with_gpu_no_cudnn)
assert topo[1].op.pre_scalar_op == theano.scalar.sqr topo = f.maker.fgraph.toposort()
assert _check_stack_trace(f) assert len(topo) == 3
utt.assert_allclose(f(data), (data * data).sum(axis=1)) assert isinstance(topo[1].op, GpuCAReduceCuda)
assert topo[1].op.pre_scalar_op == pre_scalar_op
# assert _check_stack_trace(f)
o = abs(x).sum() data = np.random.rand(3, 4).astype(theano.config.floatX)
f = theano.function([x], o, mode=mode_with_gpu_no_cudnn) utt.assert_allclose(fn(data, axis), f(data))
topo = f.maker.fgraph.toposort()
assert len(topo) == 3
assert isinstance(topo[1].op, GpuCAReduceCuda)
assert topo[1].op.pre_scalar_op == theano.scalar.abs_
assert _check_stack_trace(f)
data = np.random.rand(3, 4).astype(theano.config.floatX)
utt.assert_allclose(f(data), np.abs(data).sum())
o = abs(x).sum(axis=1)
f = theano.function([x], o, mode=mode_with_gpu_no_cudnn)
topo = f.maker.fgraph.toposort()
assert len(topo) == 3
assert isinstance(topo[1].op, GpuCAReduceCuda)
assert topo[1].op.pre_scalar_op == theano.scalar.abs_
assert _check_stack_trace(f)
utt.assert_allclose(f(data), np.abs(data).sum(axis=1))
def test_local_lift_dot22scalar(): def test_local_lift_dot22scalar():
......
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to comment