提交 8655069d authored 作者: notoraptor's avatar notoraptor

Optimize SUM(x^2) to cuDNN reduction.

上级 e6acc109
......@@ -3746,19 +3746,33 @@ def local_dnn_reduction(node):
node.op.acc_dtype == 'float64'):
return
if node.op.pre_scalar_op is not None:
# Might want to handle absmax, avg, norm1, norm2 here
def _identity(a):
return a
def _square(a):
return GpuElemwise(theano.scalar.basic.sqr)(a)
scal = node.op.scalar_op.name
post = _identity
if (isinstance(node.op.pre_scalar_op, theano.scalar.basic.Sqr) and
isinstance(node.op.scalar_op, theano.scalar.basic.Add)):
scal = 'norm2'
post = _square
elif node.op.pre_scalar_op is not None:
# Might want to handle absmax, avg, norm1, and other cases of norm2 here
return
if not cudnn.cudnnReduceTensorOp_t.has_alias(node.op.scalar_op.name):
if not cudnn.cudnnReduceTensorOp_t.has_alias(scal):
return
with inherit_stack_trace(node.outputs):
return (GpuDnnReduction(node.op.scalar_op.name,
node.op.axis,
node.op.acc_dtype,
node.op.dtype,
False)(node.inputs[0]),)
ret = GpuDnnReduction(scal,
node.op.axis,
node.op.acc_dtype,
node.op.dtype,
False)(node.inputs[0])
return [post(ret)]
@register_opt('cudnn')
......
......@@ -1569,6 +1569,19 @@ def test_dnn_reduction_opt():
yield dnn_reduction, 2, idtype, adtype, odtype
def test_dnn_reduction_sum_squares():
    """Check that sum(x**2) is optimized into a cuDNN 'norm2' reduction.

    The optimizer rewrites SUM(SQR(x)) as square(norm2(x)), since
    norm2(x)**2 == sum(x**2).  The 'norm2' reduction op requires
    cuDNN version >= 6000.
    """
    # Fix: raise distinct skip messages — previously a too-old cuDNN
    # reported "dnn not available" even though it *was* available.
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    if dnn.version(raises=False) < 6000:
        raise SkipTest("cuDNN version >= 6000 is required for norm2 reduction")
    M = T.matrix()
    out = (M ** 2).sum()
    f = theano.function([M], out, mode=mode_with_gpu)
    # The compiled graph must contain a GpuDnnReduction node using 'norm2',
    # proving the local_dnn_reduction optimization fired.
    assert any(isinstance(node.op, dnn.GpuDnnReduction) and
               node.op.red_op == 'norm2'
               for node in f.maker.fgraph.apply_nodes)
    M_val = np.random.random((4, 5)).astype(theano.config.floatX)
    # Numerical result must match the plain NumPy sum of squares.
    utt.assert_allclose((M_val ** 2).sum(), f(M_val))
def dnn_reduction_strides(shp, shuffle, slice):
utt.fetch_seed()
inp = GpuArrayType('float32', (False,) * len(shp),
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论