Merge pull request #6496 from notoraptor/optimize-sum-squares-to-cudnn-2

Optimize SUM(X^2), SUM(ABS(X)) and MAX(ABS(X)) to cuDNN reductions.

Merge pull request #6496 from notoraptor/optimize-sum-squares-to-cudnn-2
184216ae · abergeron · GitHub · e6acc109 · 0d5f6041 · 184216ae
--- a/theano/gpuarray/dnn.py
+++ b/theano/gpuarray/dnn.py
@@ -2313,7 +2313,7 @@ class _RNNSplitParams(DnnBase):
  assert(dims[2] == 1);
  assert(dims[1] == 1);
  %(b)s = pygpu_view(%(w)s, Py_None);
-  %(b)s->ga.offset = off;
+  %(b)s->ga.offset += off;
  %(b)s->ga.dimensions[0] = dims[0];
  GpuArray_fix_flags(&%(b)s->ga);
  bshp = dims[0];
@@ -2343,7 +2343,7 @@ class _RNNSplitParams(DnnBase):
  assert(dims[2] == 1);
  // We assume that the typecode matches
  %(m)s = pygpu_reshape(%(w)s, 2, nshp, GA_F_ORDER, 1, -1);
-  %(m)s->ga.offset = off;
+  %(m)s->ga.offset += off;
  assert(dims[0] %% bshp == 0);
  %(m)s->ga.dimensions[0] = dims[0] / bshp;
  %(m)s->ga.dimensions[1] = bshp;
@@ -2362,7 +2362,7 @@ class _RNNSplitParams(DnnBase):
        return code

    def c_code_cache_version(self):
-        return (3, version())
+        return (4, version())


 def _split_rnn_params(w, desc, layer, input_size, dtype, rnn_mode):
@@ -3746,19 +3746,41 @@ def local_dnn_reduction(node):
            node.op.acc_dtype == 'float64'):
        return

+    def _identity(a):
+        return a
+
+    def _square(a):
+        return GpuElemwise(theano.scalar.basic.sqr)(a)
+
+    scal = node.op.scalar_op.name
+    post = _identity
+
    if node.op.pre_scalar_op is not None:
-        # Might want to handle absmax, avg, norm1, norm2 here
-        return
+        # Handles norm1, norm2 and absmax; avg and others could be added here
+        if isinstance(node.op.scalar_op, theano.scalar.basic.Add):
+            if isinstance(node.op.pre_scalar_op, theano.scalar.basic.Sqr):
+                scal = 'norm2'
+                post = _square
+            elif isinstance(node.op.pre_scalar_op, theano.scalar.basic.Abs):
+                scal = 'norm1'
+            else:
+                return
+        elif (isinstance(node.op.scalar_op, theano.scalar.basic.Maximum) and
+                isinstance(node.op.pre_scalar_op, theano.scalar.basic.Abs)):
+            scal = 'absmax'
+        else:
+            return

-    if not cudnn.cudnnReduceTensorOp_t.has_alias(node.op.scalar_op.name):
+    if not cudnn.cudnnReduceTensorOp_t.has_alias(scal):
        return

    with inherit_stack_trace(node.outputs):
-        return (GpuDnnReduction(node.op.scalar_op.name,
-                                node.op.axis,
-                                node.op.acc_dtype,
-                                node.op.dtype,
-                                False)(node.inputs[0]),)
+        ret = GpuDnnReduction(scal,
+                              node.op.axis,
+                              node.op.acc_dtype,
+                              node.op.dtype,
+                              False)(node.inputs[0])
+        return [post(ret)]


 @register_opt('cudnn')

--- a/theano/gpuarray/elemwise.py
+++ b/theano/gpuarray/elemwise.py
--- a/theano/gpuarray/opt.py
+++ b/theano/gpuarray/opt.py
@@ -1207,7 +1207,7 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
            return False
        x, = inputs
        idtype = x.dtype
-        adtype = getattr(op, 'acc_dtype', None)
+        adtype = getattr(op, 'acc_dtype', idtype)
        odtype = getattr(op, 'dtype', outputs[0].dtype)

        # Force accumulator to float32 for float32 inputs since tree
@@ -2396,6 +2396,9 @@ def local_gpu_max_pool_rop(op, ctx_name, inputs, outputs):
 def local_gpu_elemwise_careduce(node):
    """
    Merge some GpuCAReduceCuda and GPUElemwise.
+    Currently merged:
+     - SUM(X^2)
+     - SUM(ABS(X)) and MAX(ABS(X))

    """
    if (isinstance(node.op, GpuCAReduceCuda) and
@@ -2406,10 +2409,11 @@ def local_gpu_elemwise_careduce(node):
            # automatically add more case, as some like trigonometic
            # operation with some reduction pattern will probably results
            # in slow down.
-            isinstance(node.inputs[0].owner.op.scalar_op, scalar.basic.Sqr)):
+            isinstance(node.inputs[0].owner.op.scalar_op, (scalar.basic.Sqr,
+                                                           scalar.basic.Abs))):
        inp = node.inputs[0].owner.inputs[0]
        props = node.op._props_dict()
-        props["pre_scalar_op"] = scalar.basic.sqr
+        props["pre_scalar_op"] = node.inputs[0].owner.op.scalar_op
        with inherit_stack_trace(node.outputs):
            out = GpuCAReduceCuda(**props)(inp)
            return [out]

--- a/theano/gpuarray/tests/test_dnn.py
+++ b/theano/gpuarray/tests/test_dnn.py
@@ -1569,6 +1569,48 @@ def test_dnn_reduction_opt():
        yield dnn_reduction, 2, idtype, adtype, odtype


+def test_dnn_reduction_sum_squares():
+    if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 6000:
+        raise SkipTest(dnn.dnn_available.msg)
+
+    M = T.matrix()
+    for axis in (None, 0, 1):
+        out = (M**2).sum(axis=axis)
+        f = theano.function([M], out, mode=mode_with_gpu)
+        assert any(isinstance(node.op, dnn.GpuDnnReduction) and node.op.red_op == 'norm2'
+                   for node in f.maker.fgraph.apply_nodes)
+        M_val = np.random.random((4, 5)).astype(theano.config.floatX)
+        utt.assert_allclose((M_val**2).sum(axis=axis), f(M_val))
+
+
+def test_dnn_reduction_sum_abs():
+    if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 6000:
+        raise SkipTest(dnn.dnn_available.msg)
+
+    M = T.matrix()
+    for axis in (None, 0, 1):
+        out = abs(M).sum(axis=axis)
+        f = theano.function([M], out, mode=mode_with_gpu)
+        assert any(isinstance(node.op, dnn.GpuDnnReduction) and node.op.red_op == 'norm1'
+                   for node in f.maker.fgraph.apply_nodes)
+        M_val = np.random.random((4, 5)).astype(theano.config.floatX)
+        utt.assert_allclose(np.abs(M_val).sum(axis=axis), f(M_val))
+
+
+def test_dnn_reduction_absmax():
+    if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 6000:
+        raise SkipTest(dnn.dnn_available.msg)
+
+    M = T.matrix()
+    for axis in (None, 0, 1):
+        out = abs(M).max(axis=axis)
+        f = theano.function([M], out, mode=mode_with_gpu)
+        assert any(isinstance(node.op, dnn.GpuDnnReduction) and node.op.red_op == 'absmax'
+                   for node in f.maker.fgraph.apply_nodes)
+        M_val = np.random.random((4, 5)).astype(theano.config.floatX)
+        utt.assert_allclose(np.max(np.abs(M_val), axis=axis), f(M_val))
+
+
 def dnn_reduction_strides(shp, shuffle, slice):
    utt.fetch_seed()
    inp = GpuArrayType('float32', (False,) * len(shp),

--- a/theano/gpuarray/tests/test_opt.py
+++ b/theano/gpuarray/tests/test_opt.py
@@ -360,23 +360,31 @@ def test_pdbbreakpoint_op():


 def test_local_gpu_elemwise_careduce():
+    mode_with_gpu_no_cudnn = mode_with_gpu.excluding('cudnn')
    x = theano.tensor.matrix()
-    o = (x * x).sum()
-    f = theano.function([x], o, mode=mode_with_gpu)
-    topo = f.maker.fgraph.toposort()
-    assert len(topo) == 3
-    assert topo[1].op.pre_scalar_op == theano.scalar.sqr
-    assert _check_stack_trace(f)
-    data = np.random.rand(3, 4).astype(theano.config.floatX)
-    utt.assert_allclose(f(data), (data * data).sum())

-    o = (x * x).sum(axis=1)
-    f = theano.function([x], o, mode=mode_with_gpu)
-    topo = f.maker.fgraph.toposort()
-    assert len(topo) == 3
-    assert topo[1].op.pre_scalar_op == theano.scalar.sqr
-    assert _check_stack_trace(f)
-    utt.assert_allclose(f(data), (data * data).sum(axis=1))
+    def fn_sum_square(x, axis):
+        return (x * x).sum(axis=axis)
+
+    def fn_sum_abs(x, axis):
+        return abs(x).sum(axis=axis)
+
+    def fn_max_abs(x, axis):
+        return abs(x).max(axis=axis)
+
+    for fn, pre_scalar_op in ((fn_sum_square, theano.scalar.sqr),
+                              (fn_sum_abs, theano.scalar.abs_),
+                              (fn_max_abs, theano.scalar.abs_)):
+        for axis in (None, 0, 1):
+            o = fn(x, axis)
+            f = theano.function([x], o, mode=mode_with_gpu_no_cudnn)
+            topo = f.maker.fgraph.toposort()
+            assert len(topo) == 3
+            assert isinstance(topo[1].op, GpuCAReduceCuda)
+            assert topo[1].op.pre_scalar_op == pre_scalar_op
+            assert _check_stack_trace(f)
+            data = np.random.rand(3, 4).astype(theano.config.floatX)
+            utt.assert_allclose(fn(data, axis), f(data))


 def test_local_lift_dot22scalar():

--- a/theano/tensor/tests/test_elemwise.py
+++ b/theano/tensor/tests/test_elemwise.py
@@ -20,6 +20,7 @@ from theano.tensor.elemwise import (CAReduce, Elemwise, DimShuffle,
                                    Prod, ProdWithoutZeros)
 from theano.tests import unittest_tools
 from theano.tests.unittest_tools import attr
+import theano.tests.unittest_tools as utt


 def FunctionGraph(i, o):
@@ -482,8 +483,7 @@ class test_CAReduce(unittest_tools.InferShapeTester):
                    try:
                        f_xv = f(xv)
                        self.assertTrue((f_xv.shape == zv.shape), (f_xv, zv))
-                        self.assertTrue(np.allclose(f_xv, zv),
-                                        (f_xv, zv, xsh, tosum))
+                        utt.assert_allclose(zv, f_xv)
                    except NotImplementedError:
                        # GpuCAReduce don't implement all cases when size is 0
                        assert xv.size == 0