Commit c2c28793, authored by notoraptor

Add optimizations for SUM(ABS(X)).

Parent commit: 53bd748f
...@@ -3755,13 +3755,18 @@ def local_dnn_reduction(node): ...@@ -3755,13 +3755,18 @@ def local_dnn_reduction(node):
scal = node.op.scalar_op.name scal = node.op.scalar_op.name
post = _identity post = _identity
if (isinstance(node.op.pre_scalar_op, theano.scalar.basic.Sqr) and if node.op.pre_scalar_op is not None:
isinstance(node.op.scalar_op, theano.scalar.basic.Add)): # Might want to handle absmax, avg, and other cases for (norm1, norm2) here
scal = 'norm2' if isinstance(node.op.scalar_op, theano.scalar.basic.Add):
post = _square if isinstance(node.op.pre_scalar_op, theano.scalar.basic.Sqr):
elif node.op.pre_scalar_op is not None: scal = 'norm2'
# Might want to handle absmax, avg, norm1, and other cases of norm2 here post = _square
return elif isinstance(node.op.pre_scalar_op, theano.scalar.basic.Abs):
scal = 'norm1'
else:
return
else:
return
if not cudnn.cudnnReduceTensorOp_t.has_alias(scal): if not cudnn.cudnnReduceTensorOp_t.has_alias(scal):
return return
......
...@@ -2396,6 +2396,9 @@ def local_gpu_max_pool_rop(op, ctx_name, inputs, outputs): ...@@ -2396,6 +2396,9 @@ def local_gpu_max_pool_rop(op, ctx_name, inputs, outputs):
def local_gpu_elemwise_careduce(node): def local_gpu_elemwise_careduce(node):
""" """
Merge some GpuCAReduceCuda and GPUElemwise. Merge some GpuCAReduceCuda and GPUElemwise.
Currently merged:
- SUM(X^2)
- SUM(ABS(X))
""" """
if (isinstance(node.op, GpuCAReduceCuda) and if (isinstance(node.op, GpuCAReduceCuda) and
...@@ -2406,10 +2409,11 @@ def local_gpu_elemwise_careduce(node): ...@@ -2406,10 +2409,11 @@ def local_gpu_elemwise_careduce(node):
# automatically add more case, as some like trigonometic # automatically add more case, as some like trigonometic
# operation with some reduction pattern will probably results # operation with some reduction pattern will probably results
# in slow down. # in slow down.
isinstance(node.inputs[0].owner.op.scalar_op, scalar.basic.Sqr)): isinstance(node.inputs[0].owner.op.scalar_op, (scalar.basic.Sqr,
scalar.basic.Abs))):
inp = node.inputs[0].owner.inputs[0] inp = node.inputs[0].owner.inputs[0]
props = node.op._props_dict() props = node.op._props_dict()
props["pre_scalar_op"] = scalar.basic.sqr props["pre_scalar_op"] = node.inputs[0].owner.op.scalar_op
with inherit_stack_trace(node.outputs): with inherit_stack_trace(node.outputs):
out = GpuCAReduceCuda(**props)(inp) out = GpuCAReduceCuda(**props)(inp)
return [out] return [out]
......
...@@ -1583,6 +1583,20 @@ def test_dnn_reduction_sum_squares(): ...@@ -1583,6 +1583,20 @@ def test_dnn_reduction_sum_squares():
utt.assert_allclose((M_val**2).sum(axis=axis), f(M_val)) utt.assert_allclose((M_val**2).sum(axis=axis), f(M_val))
def test_dnn_reduction_sum_abs():
    """Check that sum(|x|) is lowered to a single cuDNN 'norm1' reduction.

    Covers the full reduction and both axis-wise reductions of a matrix,
    and verifies numerical agreement with NumPy.
    """
    # The norm1 reduce op requires cuDNN v6 or later.
    if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 6000:
        raise SkipTest(dnn.dnn_available.msg)
    m = T.matrix()
    for ax in (None, 0, 1):
        fn = theano.function([m], abs(m).sum(axis=ax), mode=mode_with_gpu)
        # The compiled graph must contain a GpuDnnReduction node using norm1.
        norm1_nodes = [
            node for node in fn.maker.fgraph.apply_nodes
            if isinstance(node.op, dnn.GpuDnnReduction)
            and node.op.red_op == 'norm1'
        ]
        assert norm1_nodes
        data = np.random.random((4, 5)).astype(theano.config.floatX)
        utt.assert_allclose(np.abs(data).sum(axis=ax), fn(data))
def dnn_reduction_strides(shp, shuffle, slice): def dnn_reduction_strides(shp, shuffle, slice):
utt.fetch_seed() utt.fetch_seed()
inp = GpuArrayType('float32', (False,) * len(shp), inp = GpuArrayType('float32', (False,) * len(shp),
......
...@@ -362,6 +362,7 @@ def test_pdbbreakpoint_op(): ...@@ -362,6 +362,7 @@ def test_pdbbreakpoint_op():
def test_local_gpu_elemwise_careduce(): def test_local_gpu_elemwise_careduce():
mode_with_gpu_no_cudnn = mode_with_gpu.excluding('cudnn') mode_with_gpu_no_cudnn = mode_with_gpu.excluding('cudnn')
x = theano.tensor.matrix() x = theano.tensor.matrix()
o = (x * x).sum() o = (x * x).sum()
f = theano.function([x], o, mode=mode_with_gpu_no_cudnn) f = theano.function([x], o, mode=mode_with_gpu_no_cudnn)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
...@@ -381,6 +382,26 @@ def test_local_gpu_elemwise_careduce(): ...@@ -381,6 +382,26 @@ def test_local_gpu_elemwise_careduce():
assert _check_stack_trace(f) assert _check_stack_trace(f)
utt.assert_allclose(f(data), (data * data).sum(axis=1)) utt.assert_allclose(f(data), (data * data).sum(axis=1))
#
o = abs(x).sum()
f = theano.function([x], o, mode=mode_with_gpu_no_cudnn)
topo = f.maker.fgraph.toposort()
assert len(topo) == 3
assert isinstance(topo[1].op, GpuCAReduceCuda)
assert topo[1].op.pre_scalar_op == theano.scalar.abs_
assert _check_stack_trace(f)
data = np.random.rand(3, 4).astype(theano.config.floatX)
utt.assert_allclose(f(data), np.abs(data).sum())
o = abs(x).sum(axis=1)
f = theano.function([x], o, mode=mode_with_gpu_no_cudnn)
topo = f.maker.fgraph.toposort()
assert len(topo) == 3
assert isinstance(topo[1].op, GpuCAReduceCuda)
assert topo[1].op.pre_scalar_op == theano.scalar.abs_
assert _check_stack_trace(f)
utt.assert_allclose(f(data), np.abs(data).sum(axis=1))
def test_local_lift_dot22scalar(): def test_local_lift_dot22scalar():
x = tensor.matrix() x = tensor.matrix()
......
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Register or sign in to comment