Merge pull request #6496 from notoraptor/optimize-sum-squares-to-cudnn-2

Optimize SUM(x^2), SUM(ABS(X)) and MAX(ABS(X)) to cuDNN reduction.

Merge pull request #6496 from notoraptor/optimize-sum-squares-to-cudnn-2
184216ae · abergeron · GitHub · e6acc109 · 0d5f6041 · 184216ae
--- a/theano/gpuarray/dnn.py
+++ b/theano/gpuarray/dnn.py
@@ -2313,7 +2313,7 @@ class _RNNSplitParams(DnnBase):
  assert(dims[2] == 1);
  assert(dims[1] == 1);
  %(b)s = pygpu_view(%(w)s, Py_None);
-  %(b)s->ga.offset = off;
+  %(b)s->ga.offset += off;
  %(b)s->ga.dimensions[0] = dims[0];
  GpuArray_fix_flags(&%(b)s->ga);
  bshp = dims[0];
@@ -2343,7 +2343,7 @@ class _RNNSplitParams(DnnBase):
  assert(dims[2] == 1);
  // We assume that the typecode matches
  %(m)s = pygpu_reshape(%(w)s, 2, nshp, GA_F_ORDER, 1, -1);
-  %(m)s->ga.offset = off;
+  %(m)s->ga.offset += off;
  assert(dims[0] %% bshp == 0);
  %(m)s->ga.dimensions[0] = dims[0] / bshp;
  %(m)s->ga.dimensions[1] = bshp;
@@ -2362,7 +2362,7 @@ class _RNNSplitParams(DnnBase):
        return code
    def c_code_cache_version(self):
-        return (3, version())
+        return (4, version())
 def _split_rnn_params(w, desc, layer, input_size, dtype, rnn_mode):
@@ -3746,19 +3746,41 @@ def local_dnn_reduction(node):
            node.op.acc_dtype == 'float64'):
        return
+    def _identity(a):
+        return a
+    def _square(a):
+        return GpuElemwise(theano.scalar.basic.sqr)(a)
+    scal = node.op.scalar_op.name
+    post = _identity
    if node.op.pre_scalar_op is not None:
-        # Might want to handle absmax, avg, norm1, norm2 here
+        # Might want to handle absmax, avg, and other cases for (norm1, norm2) here
+        if isinstance(node.op.scalar_op, theano.scalar.basic.Add):
+            if isinstance(node.op.pre_scalar_op, theano.scalar.basic.Sqr):
+                scal = 'norm2'
+                post = _square
+            elif isinstance(node.op.pre_scalar_op, theano.scalar.basic.Abs):
+                scal = 'norm1'
+            else:
+                return
+        elif (isinstance(node.op.scalar_op, theano.scalar.basic.Maximum) and
+                isinstance(node.op.pre_scalar_op, theano.scalar.basic.Abs)):
+            scal = 'absmax'
+        else:
            return
-    if not cudnn.cudnnReduceTensorOp_t.has_alias(node.op.scalar_op.name):
+    if not cudnn.cudnnReduceTensorOp_t.has_alias(scal):
        return
    with inherit_stack_trace(node.outputs):
-        return (GpuDnnReduction(node.op.scalar_op.name,
+        ret = GpuDnnReduction(scal,
                              node.op.axis,
                              node.op.acc_dtype,
                              node.op.dtype,
-                                False)(node.inputs[0]),)
+                              False)(node.inputs[0])
+        return [post(ret)]
 @register_opt('cudnn')

--- a/theano/gpuarray/elemwise.py
+++ b/theano/gpuarray/elemwise.py
@@ -492,6 +492,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
    __props__ = ('axis', 'reduce_mask', 'dtype', 'acc_dtype', 'scalar_op',
                 'pre_scalar_op')
    _f16_ok = True
+    verbose = 0
    def __init__(self, scalar_op, axis=None,
                 reduce_mask=None, dtype=None, acc_dtype=None,
@@ -927,12 +928,12 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                const int threadNum = threadIdx.z * blockDim.x * blockDim.y
                + threadIdx.y * blockDim.x + threadIdx.x;
                extern __shared__ %(acc_type)s buf[];
-                %(acc_type)s myresult = 0;
                A = (const %(in_type)s *)(((char *)A)+offset_A);
                Z = (%(out_type)s *)(((char *)Z)+offset_Z);
+                %(acc_type)s myresult = 0;
        """ % locals()
-    def _assign_init(self, first_item):
+    def _assign_init(self, first_item, dtype):
        """
        This return the initial value for myresult.
        If the scalar op have an identity value, return it.
@@ -949,7 +950,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                                               scalar.Minimum))
            if self.pre_scalar_op:  # TODO: multiple dtypes
                # dtype = node.inputs[0].dtype
-                dtype = 'float32'
                dummy_var = scalar.Scalar(dtype=dtype)()
@@ -1093,6 +1093,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        """ % locals()
    def c_code_reduce_ccontig(self, sio, node, name, x, z, fail):
+        verbose = self.verbose
        in_dtype = "npy_" + node.inputs[0].dtype
        out_dtype = "npy_" + node.outputs[0].dtype
        if getattr(self.scalar_op, 'identity', None) == 0:
@@ -1121,7 +1122,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
          if(PyGpuArray_SIZE(%(x)s)==0){
            %(zero_shp)s;
          }else{
-            int verbose = 0;
+            int verbose = %(verbose)s;
            size_t numEls = PyGpuArray_SIZE(%(x)s);
            size_t n_threads = std::min(numEls, (size_t) 256);
            size_t n_blocks = 1;
@@ -1142,10 +1143,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        """ % locals(), file=sio)
    def c_code_reduce_1(self, sio, node, name, x, z, fail):
+        verbose = self.verbose
        makecall = self._makecall(node, name, x, z, fail)
        print("""
        {
-            int verbose = 0;
+            int verbose = %(verbose)s;
            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1};
            size_t n_blocks[3] = {1, 1, 1};
            %(makecall)s
@@ -1153,10 +1155,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        """ % locals(), file=sio)
    def c_code_reduce_11(self, sio, node, name, x, z, fail):
+        verbose = self.verbose
        makecall = self._makecall(node, name, x, z, fail)
        print("""
        {
-            int verbose = 0;
+            int verbose = %(verbose)s;
            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 256), 1, 1};
            while (n_threads[1] * n_threads[0] <= 256) ++n_threads[1];
@@ -1181,6 +1184,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        """
        assert N in [1, 2, 3]
+        verbose = self.verbose
        in_dtype = "npy_" + node.inputs[0].dtype
        out_dtype = "npy_" + node.outputs[0].dtype
        makecall = self._makecall(node, name, x, z, fail)
@@ -1222,7 +1226,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        print("""
        {
-            int verbose = 0;
+            int verbose = %(verbose)s;
            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[%(N)s], (size_t) 256), 1, 1};
            %(threads_y)s
            %(threads_z)s
@@ -1241,6 +1245,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        self.c_code_reduce_01X(sio, node, name, x, z, fail, 3)
    def c_code_reduce_10(self, sio, node, name, x, z, fail):
+        verbose = self.verbose
        in_dtype = "npy_" + node.inputs[0].dtype
        out_dtype = "npy_" + node.outputs[0].dtype
        acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
@@ -1256,7 +1261,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        print("""
    {
-        int verbose = 0;
+        int verbose = %(verbose)s;
        if(PyGpuArray_STRIDES(%(x)s)[0]>
           PyGpuArray_STRIDES(%(x)s)[1]){
                // If there are a lot of summations to do, then we can use simple parallelization -
@@ -1334,6 +1339,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        """ % locals(), file=sio)
    def c_code_reduce_010(self, sio, node, name, x, z, fail):
+        verbose = self.verbose
        makecall = self._makecall(node, name, x, z, fail)
        makecall_inner = self._makecall(node, name, x, z, fail,
                                        pattern="010_inner")
@@ -1400,7 +1406,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            }
            else
            {
-                int verbose = 2;
+                int verbose = %(verbose)s;
                  size_t n_threads[3] = {std::min((size_t) 32, PyGpuArray_DIMS(%(x)s)[2]), 1, 1};
                  while(    (n_threads[0]*(n_threads[1]+1)<=256)
@@ -1442,10 +1448,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        """ % locals(), file=sio)
    def c_code_reduce_0101(self, sio, node, name, x, z, fail):
+        verbose = self.verbose
        makecall = self._makecall(node, name, x, z, fail)
        print("""
        {
-            int verbose = 0;
+            int verbose = %(verbose)s;
            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3], (size_t) 256), 1, 1};
            while (n_threads[0] * n_threads[1] <= 256)
            {
@@ -1459,6 +1466,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        """ % locals(), file=sio)
    def c_code_reduce_100(self, sio, node, name, x, z, fail):
+        verbose = self.verbose
        makecall = self._makecall(node, name, x, z, fail)
        in_dtype = "npy_" + node.inputs[0].dtype
        out_dtype = "npy_" + node.outputs[0].dtype
@@ -1477,7 +1485,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        # use blockIdx.y for i2
        print("""
        {
-            int verbose = 0;
+            int verbose = %(verbose)s;
            if (PyGpuArray_STRIDES(%(x)s)[2] != sizeof(%(in_dtype)s)){
                size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1};
                size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t)4096), 1, 1};
@@ -1527,10 +1535,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        """ % locals(), file=sio)
    def c_code_reduce_110(self, sio, node, name, x, z, fail):
+        verbose = self.verbose
        makecall = self._makecall(node, name, x, z, fail)
        print("""
        {
-            int verbose = 0;
+            int verbose = %(verbose)s;
            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[1], (size_t) 256), 1, 1};
            while (n_threads[0]*n_threads[1] <= 256)
            {
@@ -1546,10 +1555,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        """ % locals(), file=sio)
    def c_code_reduce_001(self, sio, node, name, x, z, fail):
+        verbose = self.verbose
        makecall = self._makecall(node, name, x, z, fail)
        print("""
        {
-            int verbose = 0;
+            int verbose = %(verbose)s;
            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1};
            size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 4096), 1, 1};
            while (n_blocks[0] * n_blocks[1] <= 4096)
@@ -1564,13 +1574,14 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        """ % locals(), file=sio)
    def c_code_reduce_101(self, sio, node, name, x, z, fail):
+        verbose = self.verbose
        makecall = self._makecall(node, name, x, z, fail,
                                  extra_dims=[("size_t one = 1;", "(void *) &one")],
                                  extra_strides=[("ssize_t sone = 1;", "(void *) &sone")],
                                  pattern="1011")
        print("""
        {
-            int verbose = 0;
+            int verbose = %(verbose)s;
 //            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3],
 //                                            (size_t) 256), 1, 1};
            size_t n_threads[3] = {1, 1, 1};
@@ -1592,10 +1603,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        """ % locals(), file=sio)
    def c_code_reduce_111(self, sio, node, name, x, z, fail):
+        verbose = self.verbose
        makecall = self._makecall(node, name, x, z, fail)
        print("""
        {
-            int verbose = 0;
+            int verbose = %(verbose)s;
            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1};
            //get as many y threads as we can fit
@@ -1624,13 +1636,14 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        """ % locals(), file=sio)
    def c_code_reduce_0011(self, sio, node, name, x, z, fail):
+        verbose = self.verbose
        makecall = self._makecall(node, name, x, z, fail)
        in_dtype = "npy_" + node.inputs[0].dtype
        out_dtype = "npy_" + node.outputs[0].dtype
        acc_dtype = "npy_" + self._acc_dtype(node.inputs[0].dtype)
        print("""
        {
-            int verbose = 0;
+            int verbose = %(verbose)s;
            size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 4096), 1, 1};
@@ -1653,10 +1666,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        """ % locals(), file=sio)
    def c_code_reduce_1111(self, sio, node, name, x, z, fail):
+        verbose = self.verbose
        makecall = self._makecall(node, name, x, z, fail)
        print("""
        {
-            int verbose = 0;
+            int verbose = %(verbose)s;
            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[2], (size_t) 256), 1, 1};
            //get as many y threads as we can fit
@@ -1686,10 +1700,11 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        """ % locals(), file=sio)
    def c_code_reduce_1011(self, sio, node, name, x, z, fail):
+        verbose = self.verbose
        makecall = self._makecall(node, name, x, z, fail)
        print("""
        {
-            int verbose = 0;
+            int verbose = %(verbose)s;
            size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[3], (size_t) 256), 1, 1};
            while (n_threads[0] * (n_threads[1]+1) <= 256) ++n_threads[1];
@@ -1708,7 +1723,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        """ % locals(), file=sio)
    def c_code_cache_version_apply(self, node):
-        version = [21]  # the version corresponding to the c code in this Op
+        version = [24, self.verbose]  # the version corresponding to the c code in this Op
        # now we insert versions for the ops on which we depend...
        scalar_node = Apply(
@@ -1729,6 +1744,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
        in_dtype = node.inputs[0].dtype
        out_dtype = node.outputs[0].dtype
        acc_dtype = self._acc_dtype(node.inputs[0].dtype)
+        assign_dtype = in_dtype
        flags = Kernel.get_flags(in_dtype, acc_dtype, out_dtype)
        in_type = gpuarray.dtype_to_ctype(in_dtype)
        out_type = gpuarray.dtype_to_ctype(out_dtype)
@@ -1744,7 +1760,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                             load_in + "(A[i0])",
                                             {}, True)
-            reduce_init = self._assign_init(load_in + "(A[0])")
+            reduce_init = self._assign_init(load_in + "(A[0])", assign_dtype)
            kname = "kernel_reduce_ccontig"
            k_var = "kernel_reduce_ccontig_" + nodename
            sio = StringIO()
@@ -1758,9 +1774,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                const int threadCount = blockDim.x;
                const int threadNum = threadIdx.x;
                extern __shared__ %(acc_type)s buf[];
-                %(acc_type)s myresult = %(reduce_init)s;
                A = (const %(in_type)s *)(((char *)A)+offset_A);
                Z = (%(out_type)s *)(((char *)Z)+offset_Z);
+                %(acc_type)s myresult = %(reduce_init)s;
                for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
                {
@@ -1783,7 +1799,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                             load_in + "(A[i0 * sA0])",
                                             {}, True)
-            reduce_init = self._assign_init(load_in + "(A[0])")
+            reduce_init = self._assign_init(load_in + "(A[0])", assign_dtype)
            kname = "kernel_reduce_1"
            k_var = "kernel_reduce_1_" + nodename
            sio = StringIO()
@@ -1798,9 +1814,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                const int threadCount = blockDim.x;
                const int threadNum = threadIdx.x;
                extern __shared__ %(acc_type)s buf[];
-                %(acc_type)s myresult = %(reduce_init)s;
                A = (const %(in_type)s *)(((char *)A)+offset_A);
                Z = (%(out_type)s *)(((char *)Z)+offset_Z);
+                %(acc_type)s myresult = %(reduce_init)s;
                for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
                {
@@ -1824,7 +1840,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                             load_in + "(A[i0 * sA0 + i1 * sA1])",
                                             {}, True)
-            reduce_init = self._assign_init(load_in + "(A[0])")
+            reduce_init = self._assign_init(load_in + "(A[0])", assign_dtype)
            kname = "kernel_reduce_11"
            k_var = "kernel_reduce_11_" + nodename
            sio = StringIO()
@@ -1839,9 +1855,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                const int threadCount = blockDim.x * blockDim.y;
                const int threadNum = threadIdx.y*blockDim.x + threadIdx.x;
                extern __shared__ %(acc_type)s buf[];
-                %(acc_type)s myresult = %(reduce_init)s;
                A = (const %(in_type)s *)(((char *)A)+offset_A);
                Z = (%(out_type)s *)(((char *)Z)+offset_Z);
+                %(acc_type)s myresult = %(reduce_init)s;
                for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y)
                {
@@ -1911,7 +1927,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                                      for i in xrange(nd_in)])
            decl, kname, params, k_var = self._k_decl(node, nodename)
            init = self._k_init(node, nodename)
-            reduce_init = self._assign_init(load_in + "(A[%(first_i3)s * %(sA3)s + %(first_i2)s * %(sA2)s + %(first_i1)s * %(sA1)s + i0 * sA0])" % locals())
+            reduce_init = self._assign_init(load_in + "(A[%(first_i3)s * %(sA3)s + %(first_i2)s * %(sA2)s + %(first_i1)s * %(sA1)s + i0 * sA0])" % locals(), assign_dtype)
            reduce_fct = self._assign_reduce(
                node, nodename, "myresult",
                load_in + "(A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0])",
@@ -1948,7 +1964,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                             load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
                                             {}, True)
-            reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + threadIdx.x * sA1 + i2 * sA2])")
+            reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + threadIdx.x * sA1 + i2 * sA2])", assign_dtype)
            kname = "kernel_reduce_010"
            k_var = "kernel_reduce_010_" + nodename
            sio = StringIO()
@@ -1995,7 +2011,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                             load_in + "(X[a * sX0 + b * sX1 + c * sX2])",
                                             {}, True)
-            reduce_init = self._assign_init(load_in + "(X[a * sX0 + 0 * sX1 + c * sX2])")
+            reduce_init = self._assign_init(load_in + "(X[a * sX0 + 0 * sX1 + c * sX2])", assign_dtype)
            kname = "kernel_reduce_010_AD"
            k_var = "kernel_reduce_010_AD_" + nodename
            sio = StringIO()
@@ -2010,9 +2026,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            {
                const int threadCount = blockDim.x;
                const int threadNum = threadIdx.x;
-                %(acc_type)s myresult = 0;
                X = (const %(in_type)s *)(((char *)X)+offset_X);
                Z = (%(out_type)s *)(((char *)Z)+offset_Z);
+                %(acc_type)s myresult = 0;
                for (int a = blockIdx.x; a < A; a += gridDim.x)
                {
@@ -2062,7 +2078,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                             load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
                                             {}, True)
-            reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + 0 * sA1 + i2 * sA2])")
+            reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + 0 * sA1 + i2 * sA2])", assign_dtype)
            sio = StringIO()
            print("""#include "cluda.h"
@@ -2096,7 +2112,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                             load_in + "(A[i0 * sA0 + i1 * sA1 + blockIdx.x * sA2])",
                                             {}, True)
-            reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA2])")
+            reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA2])", assign_dtype)
            kname = "kernel_reduce_110"
            k_var = "kernel_reduce_110_" + nodename
            sio = StringIO()
@@ -2112,9 +2128,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                const int threadCount = blockDim.x * blockDim.y;
                const int threadNum = threadIdx.y * blockDim.x + threadIdx.x;
                extern __shared__ %(acc_type)s buf[];
-                %(acc_type)s myresult = %(reduce_init)s;
                A = (const %(in_type)s *)(((char *)A)+offset_A);
                Z = (%(out_type)s *)(((char *)Z)+offset_Z);
+                %(acc_type)s myresult = %(reduce_init)s;
                for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y)
                {
@@ -2144,7 +2160,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                             load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
                                             {}, True)
-            reduce_init = self._assign_init(load_in + "(A[i1 * sA1 + i2 * sA2])")
+            reduce_init = self._assign_init(load_in + "(A[i1 * sA1 + i2 * sA2])", assign_dtype)
            sio = StringIO()
            print("""#include "cluda.h"
@@ -2175,7 +2191,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                             load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
                                             {}, True)
-            reduce_init = self._assign_init(load_in + "(A[0])")
+            reduce_init = self._assign_init(load_in + "(A[0])", assign_dtype)
            sio = StringIO()
            print("""#include "cluda.h"
@@ -2206,7 +2222,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                             load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2])",
                                             {}, True)
-            reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])")
+            reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])", assign_dtype)
            kname = "kernel_reduce_001"
            k_var = "kernel_reduce_001_" + nodename
            sio = StringIO()
@@ -2257,7 +2273,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                             load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])",
                                             {}, True)
-            reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])")
+            reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i1 * sA1])", assign_dtype)
            sio = StringIO()
            print("""#include "cluda.h"
@@ -2294,7 +2310,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                             load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])",
                                             {}, True)
-            reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i2 * sA2])")
+            reduce_init = self._assign_init(load_in + "(A[i0 * sA0 + i2 * sA2])", assign_dtype)
            sio = StringIO()
            print("""#include "cluda.h"
@@ -2329,7 +2345,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                             load_in + "(A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3])",
                                             {}, True)
-            reduce_init = self._assign_init(load_in + "(A[0])")
+            reduce_init = self._assign_init(load_in + "(A[0])", assign_dtype)
            sio = StringIO()
            print("""#include "cluda.h"
@@ -2359,7 +2375,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
            reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                             load_in + "(A[i0 * sA0 + blockIdx.x * sA1 + i2 * sA2 + i3 * sA3])",
                                             {}, True)
-            reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA1])")
+            reduce_init = self._assign_init(load_in + "(A[blockIdx.x * sA1])", assign_dtype)
            kname = "kernel_reduce_1011"
            k_var = "kernel_reduce_1011_" + nodename
            sio = StringIO()
@@ -2375,9 +2391,9 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
                const int threadCount = blockDim.x * blockDim.y * blockDim.z;
                const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
                extern __shared__ %(acc_type)s buf[];
-                %(acc_type)s myresult = %(reduce_init)s;
                A = (const %(in_type)s *)(((char *)A)+offset_A);
                Z = (%(out_type)s *)(((char *)Z)+offset_Z);
+                %(acc_type)s myresult = %(reduce_init)s;
                for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z)
                {

--- a/theano/gpuarray/opt.py
+++ b/theano/gpuarray/opt.py
@@ -1207,7 +1207,7 @@ def local_gpua_careduce(op, context_name, inputs, outputs):
            return False
        x, = inputs
        idtype = x.dtype
-        adtype = getattr(op, 'acc_dtype', None)
+        adtype = getattr(op, 'acc_dtype', idtype)
        odtype = getattr(op, 'dtype', outputs[0].dtype)
        # Force accumulator to float32 for float32 inputs since tree
@@ -2396,6 +2396,9 @@ def local_gpu_max_pool_rop(op, ctx_name, inputs, outputs):
 def local_gpu_elemwise_careduce(node):
    """
    Merge some GpuCAReduceCuda and GPUElemwise.
+    Currently merged:
+     - SUM(X^2)
+     - SUM(ABS(X))
    """
    if (isinstance(node.op, GpuCAReduceCuda) and
@@ -2406,10 +2409,11 @@ def local_gpu_elemwise_careduce(node):
            # automatically add more case, as some like trigonometic
            # operation with some reduction pattern will probably results
            # in slow down.
-            isinstance(node.inputs[0].owner.op.scalar_op, scalar.basic.Sqr)):
+            isinstance(node.inputs[0].owner.op.scalar_op, (scalar.basic.Sqr,
+                                                           scalar.basic.Abs))):
        inp = node.inputs[0].owner.inputs[0]
        props = node.op._props_dict()
-        props["pre_scalar_op"] = scalar.basic.sqr
+        props["pre_scalar_op"] = node.inputs[0].owner.op.scalar_op
        with inherit_stack_trace(node.outputs):
            out = GpuCAReduceCuda(**props)(inp)
            return [out]

--- a/theano/gpuarray/tests/test_dnn.py
+++ b/theano/gpuarray/tests/test_dnn.py
@@ -1569,6 +1569,48 @@ def test_dnn_reduction_opt():
        yield dnn_reduction, 2, idtype, adtype, odtype
+def test_dnn_reduction_sum_squares():
+    if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 6000:
+        raise SkipTest(dnn.dnn_available.msg)
+    M = T.matrix()
+    for axis in (None, 0, 1):
+        out = (M**2).sum(axis=axis)
+        f = theano.function([M], out, mode=mode_with_gpu)
+        assert any(isinstance(node.op, dnn.GpuDnnReduction) and node.op.red_op == 'norm2'
+                   for node in f.maker.fgraph.apply_nodes)
+        M_val = np.random.random((4, 5)).astype(theano.config.floatX)
+        utt.assert_allclose((M_val**2).sum(axis=axis), f(M_val))
+def test_dnn_reduction_sum_abs():
+    if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 6000:
+        raise SkipTest(dnn.dnn_available.msg)
+    M = T.matrix()
+    for axis in (None, 0, 1):
+        out = abs(M).sum(axis=axis)
+        f = theano.function([M], out, mode=mode_with_gpu)
+        assert any(isinstance(node.op, dnn.GpuDnnReduction) and node.op.red_op == 'norm1'
+                   for node in f.maker.fgraph.apply_nodes)
+        M_val = np.random.random((4, 5)).astype(theano.config.floatX)
+        utt.assert_allclose(np.abs(M_val).sum(axis=axis), f(M_val))
+def test_dnn_reduction_absmax():
+    if not dnn.dnn_available(test_ctx_name) or dnn.version(raises=False) < 6000:
+        raise SkipTest(dnn.dnn_available.msg)
+    M = T.matrix()
+    for axis in (None, 0, 1):
+        out = abs(M).max(axis=axis)
+        f = theano.function([M], out, mode=mode_with_gpu)
+        assert any(isinstance(node.op, dnn.GpuDnnReduction) and node.op.red_op == 'absmax'
+                   for node in f.maker.fgraph.apply_nodes)
+        M_val = np.random.random((4, 5)).astype(theano.config.floatX)
+        utt.assert_allclose(np.max(np.abs(M_val), axis=axis), f(M_val))
 def dnn_reduction_strides(shp, shuffle, slice):
    utt.fetch_seed()
    inp = GpuArrayType('float32', (False,) * len(shp),

--- a/theano/gpuarray/tests/test_opt.py
+++ b/theano/gpuarray/tests/test_opt.py
@@ -360,23 +360,31 @@ def test_pdbbreakpoint_op():
 def test_local_gpu_elemwise_careduce():
+    mode_with_gpu_no_cudnn = mode_with_gpu.excluding('cudnn')
    x = theano.tensor.matrix()
-    o = (x * x).sum()
-    f = theano.function([x], o, mode=mode_with_gpu)
-    topo = f.maker.fgraph.toposort()
-    assert len(topo) == 3
-    assert topo[1].op.pre_scalar_op == theano.scalar.sqr
-    assert _check_stack_trace(f)
-    data = np.random.rand(3, 4).astype(theano.config.floatX)
-    utt.assert_allclose(f(data), (data * data).sum())
-    o = (x * x).sum(axis=1)
+    def fn_sum_square(x, axis):
-    f = theano.function([x], o, mode=mode_with_gpu)
+        return (x * x).sum(axis=axis)
+    def fn_sum_abs(x, axis):
+        return abs(x).sum(axis=axis)
+    def fn_max_abs(x, axis):
+        return abs(x).max(axis=axis)
+    for fn, pre_scalar_op in ((fn_sum_square, theano.scalar.sqr),
+                              (fn_sum_abs, theano.scalar.abs_),
+                              (fn_max_abs, theano.scalar.abs_)):
+        for axis in (None, 0, 1):
+            o = fn(x, axis)
+            f = theano.function([x], o, mode=mode_with_gpu_no_cudnn)
            topo = f.maker.fgraph.toposort()
            assert len(topo) == 3
-    assert topo[1].op.pre_scalar_op == theano.scalar.sqr
+            assert isinstance(topo[1].op, GpuCAReduceCuda)
+            assert topo[1].op.pre_scalar_op == pre_scalar_op
            assert _check_stack_trace(f)
-    utt.assert_allclose(f(data), (data * data).sum(axis=1))
+            data = np.random.rand(3, 4).astype(theano.config.floatX)
+            utt.assert_allclose(fn(data, axis), f(data))
 def test_local_lift_dot22scalar():

--- a/theano/tensor/tests/test_elemwise.py
+++ b/theano/tensor/tests/test_elemwise.py
@@ -20,6 +20,7 @@ from theano.tensor.elemwise import (CAReduce, Elemwise, DimShuffle,
                                    Prod, ProdWithoutZeros)
 from theano.tests import unittest_tools
 from theano.tests.unittest_tools import attr
+import theano.tests.unittest_tools as utt
 def FunctionGraph(i, o):
@@ -482,8 +483,7 @@ class test_CAReduce(unittest_tools.InferShapeTester):
                    try:
                        f_xv = f(xv)
                        self.assertTrue((f_xv.shape == zv.shape), (f_xv, zv))
-                        self.assertTrue(np.allclose(f_xv, zv),
+                        utt.assert_allclose(zv, f_xv)
-                                        (f_xv, zv, xsh, tosum))
                    except NotImplementedError:
                        # GpuCAReduce don't implement all cases when size is 0
                        assert xv.size == 0