Commit 0d3dffac, authored by abergeron

Merge pull request #1888 from nouiz/gpu_sqr_sum_ax0

Add GpuSqrSumAx0 to lower the memory usage on the GPU.
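The effect of this change: a pattern like (x * x).sum() on the GPU no longer materializes the squared temporary, because the reduction kernel applies the scalar sqr to each element as it reads it. A minimal sketch of the user-visible behaviour (assumes a GPU-enabled Theano build; the debugprint output named in the comment is illustrative):

    import numpy
    import theano
    import theano.tensor as tensor

    x = tensor.fmatrix('x')
    # Before this change: GpuElemwise{sqr} writes x*x to a temporary GPU
    # buffer, then GpuCAReduce{add} reduces it. After: a single fused
    # GpuCAReduce{pre=sqr,red=add} kernel, so no temporary is allocated.
    f = theano.function([x], (x * x).sum())
    theano.printing.debugprint(f)
    print f(numpy.random.rand(3, 4).astype('float32'))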
@@ -503,31 +503,49 @@ class GpuCAReduce(GpuOp):
     GPUs are not especially well-suited to reduction operations so it is
     quite possible that the GPU might be slower for some cases.

+    pre_scalar_op: if present, must be a scalar op with only 1
+    input. We will execute it on the input value before reduction.
     """
-    def __init__(self, reduce_mask, scalar_op):
+    def __init__(self, reduce_mask, scalar_op, pre_scalar_op=None):
         self.reduce_mask = tuple(reduce_mask)
         self.scalar_op = scalar_op
         # used to make sure that calls to scalar op
         # have unique name arguments
         self._n_scalar_op_calls = 0
+        self.pre_scalar_op = pre_scalar_op
+        if pre_scalar_op:
+            assert pre_scalar_op.nin == 1

     def __eq__(self, other):
         return (type(self) == type(other) and
                 self.reduce_mask == other.reduce_mask and
-                self.scalar_op == other.scalar_op)
+                self.scalar_op == other.scalar_op and
+                self.pre_scalar_op == other.pre_scalar_op)

     def __hash__(self):
         return (hash(type(self)) ^
                 hash(self.reduce_mask) ^
-                hash(type(self.scalar_op)))
+                hash(type(self.scalar_op)) ^
+                hash(type(self.pre_scalar_op)))

     def __str__(self):
-        return "GpuCAReduce{%s}{%s}" % (
+        pre = ""
+        if self.pre_scalar_op:
+            pre = "pre=%s,red=" % str(self.pre_scalar_op)
+        return "GpuCAReduce{%s%s}{%s}" % (
+            pre,
             str(self.scalar_op),
             ','.join(str(i) for i in self.reduce_mask)
         )

+    def __setstate__(self, d):
+        self.__dict__.update(d)
+        # For unpickling of old ops.
+        if not hasattr(self, "pre_scalar_op"):
+            self.pre_scalar_op = None
+
     def make_node(self, x):
         if (x.type.ndim != len(self.reduce_mask)):
             raise TypeError("x must have rank %i" % len(self.reduce_mask))
@@ -889,15 +907,33 @@ class GpuCAReduce(GpuOp):
         else:
             assert isinstance(self.scalar_op, (scal.Maximum,
                                                scal.Minimum))
+        if self.pre_scalar_op:
+            #dtype = node.inputs[0].dtype
+            dtype = 'float32'
+            dummy_var = scal.Scalar(dtype=dtype)()
+            dummy_node = self.pre_scalar_op.make_node(dummy_var)
+            dummy_name = 'assign_init_pre_scalar_op' + str(self._n_scalar_op_calls)
+            self._n_scalar_op_calls += 1
+            t = self.pre_scalar_op.c_code(dummy_node, dummy_name,
+                                          (first_item,), ("",), {})
+            assert t.startswith(' = ')
+            first_item = t[3:]
+            if first_item[-1] == ';':
+                first_item = first_item[:-1]
         return first_item

-    def _assign_reduce(self, node, name, left, right, sub):
+    def _assign_reduce(self, node, name, left, right, sub, pre):
         """
         node: the node argument to this op's c_code
         name: the name argument to this op's c_code
         left: a C code string identifying an lvalue
         right: a C code string identifying an expression
         sub: the sub argument to this op's c_code
+        pre: if True, we will add the pre_scalar_op.c_code

         returns C code to reduce left and right, assigning the
         result to left."""
@@ -913,7 +949,17 @@ class GpuCAReduce(GpuOp):
         dummy_name = name + '_scalar_op' + str(self._n_scalar_op_calls)
         self._n_scalar_op_calls += 1
+        if pre and self.pre_scalar_op:
+            assert left == "myresult"
+            dummy_node = self.pre_scalar_op.make_node(dummy_left)
+            dummy_name = name + '_scalar_op' + str(self._n_scalar_op_calls)
+            self._n_scalar_op_calls += 1
+            t = self.pre_scalar_op.c_code(dummy_node, dummy_name,
+                                          (right,), ("",), sub)
+            assert t.startswith(' = ')
+            right = t[3:]
+            if right[-1] == ';':
+                right = right[:-1]
         return self.scalar_op.c_code(dummy_node, dummy_name, (left, right),
                                      (left,), sub)
@@ -939,7 +985,8 @@ class GpuCAReduce(GpuOp):
            {
                int idx = threadNum - (threadCount >> 1) * 2;"""
-        new_version += self._assign_reduce(node, name, 'buf[idx]','buf[threadNum]', sub)
+        new_version += self._assign_reduce(node, name, 'buf[idx]',
+                                           'buf[threadNum]', sub, False)
         new_version += """
            }
@@ -958,8 +1005,8 @@ class GpuCAReduce(GpuOp):
                    float temp = buf[threadNum + halfPoint];
                    """
-        new_version += self._assign_reduce(node, name,
-                                           'buf[threadNum]', 'temp', sub)
+        new_version += self._assign_reduce(node, name, 'buf[threadNum]',
+                                           'temp', sub, False)
         new_version += """
                }
@@ -989,8 +1036,8 @@ class GpuCAReduce(GpuOp):
            for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
            {
            """
-        current_version += self._assign_reduce(node, name,
-                                               'myresult', 'buf[i]', sub) + """
+        current_version += self._assign_reduce(node, name, 'myresult',
+                                               'buf[i]', sub, False) + """
            }
            buf[threadNum] = myresult;
        /*Comment this optimization as it don't work on Fermi GPU.
@@ -1002,7 +1049,7 @@ class GpuCAReduce(GpuOp):
             current_version += self._assign_reduce(node, name,
                                                    'buf[threadNum]',
                                                    'buf[threadNum+%d]' % num,
-                                                   sub)
+                                                   sub, False)
             current_version += """
            if (threadNum == 0)
            {
@@ -1019,7 +1066,7 @@ class GpuCAReduce(GpuOp):
             this_if = "if (threadNum + %d < threadCount) " % num + \
                 self._assign_reduce(node, name,
                                     'buf[threadNum]', 'buf[threadNum+%d]' % num,
-                                    sub)
+                                    sub, False)
             current_version += this_if
             current_version += """
            if (threadNum == 0)
@@ -1037,7 +1084,8 @@ class GpuCAReduce(GpuOp):
     #Threads must be organized as: threadNum%nb_reduce correspond to the same sum
     #nb_reduce<=warpSize
     def _k_reduce_buf_multiple(self, z_pos, node, name, nb_reduce):
-        reduce_fct = self._assign_reduce(node, name, 'myresult', 'buf[i]', {})
+        reduce_fct = self._assign_reduce(node, name, 'myresult',
+                                         'buf[i]', {}, True)
         return """
         __syncthreads(); // some kernel do multiple reduction.
         buf[threadNum] = myresult;
@@ -1609,7 +1657,7 @@ class GpuCAReduce(GpuOp):
         reducebuf = self._k_reduce_buf('Z[0]', node, nodename, sub={})
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[0]")
         print >> sio, """
        static __global__ void kernel_reduce_ccontig_%(nodename)s(
@@ -1640,7 +1688,7 @@ class GpuCAReduce(GpuOp):
         reducebuf = self._k_reduce_buf('Z[0]', node, nodename, sub={})
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[0]")
         print >> sio, """
        static __global__ void kernel_reduce_1_%(nodename)s(
@@ -1671,7 +1719,7 @@ class GpuCAReduce(GpuOp):
         reducebuf = self._k_reduce_buf('Z[0]', node, nodename, sub={})
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[0]")
         print >> sio, """
@@ -1755,7 +1803,8 @@ class GpuCAReduce(GpuOp):
         reduce_fct = self._assign_reduce(
             node, nodename, "myresult",
             "A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0]",
-            {})
+            {}, True)
         print >> sio, """
        %(decl)s{
            %(init)s
@@ -1783,7 +1832,7 @@ class GpuCAReduce(GpuOp):
                                        node, nodename, sub={})
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[i0 * sA0 + threadIdx.x * sA1 + i2 * sA2]")
         print >> sio, """
        static __global__ void kernel_reduce_010_%(nodename)s(
@@ -1822,7 +1871,7 @@ class GpuCAReduce(GpuOp):
         if self.reduce_mask == (0, 1, 0):
             reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                              "X[a * sX0 + b * sX1 + c * sX2]",
-                                             {})
+                                             {}, True)
             reduce_init = self._assign_init("X[a * sX0 + 0 * sX1 + c * sX2]")
             print >> sio, """
        static __global__ void kernel_reduce_010_AD_%(nodename)s(
@@ -1882,7 +1931,7 @@ class GpuCAReduce(GpuOp):
                                                 'blockDim.x')
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[i0 * sA0 + 0 * sA1 + i2 * sA2]")
         print >> sio, """
        %(decl)s
@@ -1918,7 +1967,7 @@ class GpuCAReduce(GpuOp):
         reducebuf = self._k_reduce_buf('Z[blockIdx.x * sZ0]', node, nodename, sub = {})
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + blockIdx.x * sA2]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[blockIdx.x * sA2]")
         print >> sio, """
        static __global__ void kernel_reduce_110_%(nodename)s(
@@ -1959,7 +2008,7 @@ class GpuCAReduce(GpuOp):
         init = self._k_init(node, nodename)
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[i1 * sA1 + i2 * sA2]")
         print >> sio, """
        %(decl)s
@@ -1986,7 +2035,7 @@ class GpuCAReduce(GpuOp):
         init = self._k_init(node, nodename)
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[0]")
         print >> sio, """
        %(decl)s
@@ -2013,7 +2062,7 @@ class GpuCAReduce(GpuOp):
                                        node, nodename, sub={})
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[i0 * sA0 + i1 * sA1]")
         print >> sio, """
        static __global__ void kernel_reduce_001_%(nodename)s(
@@ -2056,7 +2105,7 @@ class GpuCAReduce(GpuOp):
         init = self._k_init(node, nodename)
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[i0 * sA0 + i1 * sA1]")
         print >> sio, """
        %(decl)s
@@ -2089,7 +2138,7 @@ class GpuCAReduce(GpuOp):
         init = self._k_init(node, nodename)
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[i0 * sA0 + i2 * sA2]")
         print >> sio, """
        %(decl)s
@@ -2120,7 +2169,7 @@ class GpuCAReduce(GpuOp):
         init = self._k_init(node, nodename)
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[0]")
         print >> sio, """
        %(decl)s
@@ -2146,7 +2195,7 @@ class GpuCAReduce(GpuOp):
                                        node, nodename, sub={})
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + blockIdx.x * sA1 + i2 * sA2 + i3 * sA3]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[blockIdx.x * sA1]")
         print >> sio, """
        static __global__ void kernel_reduce_1011_%(nodename)s(
...
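The pre_scalar_op code generation above leans on a detail of Theano's scalar Op interface: calling c_code with an empty output name makes the op emit a fragment of the form " = <expr>;", and stripping the leading " = " and the trailing ";" leaves a bare C expression that can be substituted as the reduction's right operand. A toy illustration of that string surgery (plain Python with sqr's output hard-coded, not the real Op API):

    def fuse_pre_scalar_op(right):
        # What sqr's c_code would emit for input `right` and an
        # empty output name:
        t = " = %s * %s;" % (right, right)
        assert t.startswith(' = ')
        expr = t[3:]
        if expr[-1] == ';':
            expr = expr[:-1]
        return expr

    print fuse_pre_scalar_op("A[i0]")   # prints: A[i0] * A[i0]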
 from theano import Op, Apply
 from theano.compat.six import StringIO
-from theano.sandbox.cuda import GpuOp
+from theano.sandbox.cuda import GpuOp, as_cuda_ndarray_variable
 from theano.sandbox.cuda.kernel_codegen import (nvcc_kernel,
                                                 inline_softmax,
                                                 inline_softmax_fixed_shared)

-class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
+class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuOp):
     """
     Implement CrossentropySoftmaxArgmax1HotWithBias on the gpu.
     """
@@ -216,7 +216,7 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias (GpuOp):
 gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()

-class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
+class GpuCrossentropySoftmax1HotWithBiasDx(GpuOp):
     """
     Implement CrossentropySoftmax1HotWithBiasDx on the gpu.
     """
@@ -364,7 +364,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx (GpuOp):
 gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx()

-class GpuSoftmax (GpuOp):
+class GpuSoftmax(GpuOp):
     """
     Implement Softmax on the gpu.
     """
@@ -483,8 +483,8 @@ class GpuSoftmax (GpuOp):
     def c_support_code_apply(self, node, nodename):
         ret1 = nvcc_kernel("kSoftmax_%s" % nodename,
                 params=['int M', 'int N',
-                    'const float * x', 'const int sx0', 'const int sx1',
-                    'float * sm', 'const int sm_s0', 'const int sm_s1'],
+                        'const float * x', 'const int sx0', 'const int sx1',
+                        'float * sm', 'const int sm_s0', 'const int sm_s1'],
                 body=[
                     "extern __shared__ float buf[]",
                     "float * buf2 = buf + N",
@@ -506,8 +506,8 @@ class GpuSoftmax (GpuOp):
                     ])
         ret2 = nvcc_kernel("kSoftmax_fixed_shared%s" % nodename,
                 params=['int M', 'int N',
-                    'const float * x', 'const int sx0', 'const int sx1',
-                    'float * sm', 'const int sm_s0', 'const int sm_s1'],
+                        'const float * x', 'const int sx0', 'const int sx1',
+                        'float * sm', 'const int sm_s0', 'const int sm_s1'],
                 body=[
                     "extern __shared__ float buf[]",
                     "for (int blockIDX = blockIdx.x; blockIDX < M;"
@@ -525,7 +525,7 @@ class GpuSoftmax (GpuOp):
 gpu_softmax = GpuSoftmax()

-class GpuSoftmaxWithBias (GpuOp):
+class GpuSoftmaxWithBias(GpuOp):
     """
     Implement SoftmaxWithBias on the gpu.
     """
@@ -545,7 +545,7 @@ class GpuSoftmaxWithBias (GpuOp):
         return Apply(self, [x, b], [x.type()])

     def infer_shape(self, node, shape):
         return [shape[0]]

     def c_code_cache_version(self):
         #return ()
@@ -660,12 +660,13 @@ class GpuSoftmaxWithBias (GpuOp):
         """ % locals()

     def c_support_code_apply(self, node, nodename):
-        ret1 = nvcc_kernel("kSoftmaxWithBias_%s" % nodename,
-                params=['int M', 'int N',
-                    'const float * x', 'const int sx0', 'const int sx1',
-                    'const float * b', 'const int sb0',
-                    'float * sm', 'const int sm_s0', 'const int sm_s1'],
-                body=[
+        ret1 = nvcc_kernel(
+            "kSoftmaxWithBias_%s" % nodename,
+            params=['int M', 'int N',
+                    'const float * x', 'const int sx0', 'const int sx1',
+                    'const float * b', 'const int sb0',
+                    'float * sm', 'const int sm_s0', 'const int sm_s1'],
+            body=[
                 "extern __shared__ float buf[]",
                 "float * buf2 = buf + N",
                 "for (int blockIDX = blockIdx.x; blockIDX < M;"
@@ -683,7 +684,7 @@ class GpuSoftmaxWithBias (GpuOp):
                 "}",
                 "__syncthreads()",
                 "}",
-                ])
+            ])
         ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename,
                 params=['int M', 'int N',
                         'const float * x',
...
@@ -684,6 +684,24 @@ def local_gpu_careduce(node):
     return False

+@register_opt("low_memory")
+@local_optimizer([GpuCAReduce])
+def local_gpu_elemwise_careduce(node):
+    if (isinstance(node.op, GpuCAReduce) and
+        node.op.pre_scalar_op is None and
+        node.inputs[0].owner and
+        isinstance(node.inputs[0].owner.op, GpuElemwise) and
+        # The op supports all scalar ops with 1 input. We don't
+        # automatically add more cases, as some, like a trigonometric
+        # operation combined with some reduction patterns, would
+        # probably cause a slowdown.
+        isinstance(node.inputs[0].owner.op.scalar_op, scal.basic.Sqr)
+       ):
+        op = node.op
+        inp = node.inputs[0].owner.inputs[0]
+        return [GpuCAReduce(op.reduce_mask, op.scalar_op, scal.basic.sqr)(inp)]
+
 @register_opt()
 @local_optimizer([gpu_from_host, tensor.Reshape])
 def local_gpu_reshape(node):
...
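The rewrite above replaces the two-node pattern GpuCAReduce(GpuElemwise{sqr}(x)) with a single GpuCAReduce carrying pre_scalar_op=sqr, so the intermediate buffer holding sqr(x) is never allocated. A hedged sketch of how one could observe it firing (mode_with_gpu is assumed to be the GPU-enabled mode used by the CUDA test suite; it is not defined here):

    import numpy
    import theano
    import theano.tensor as tensor

    x = tensor.fmatrix('x')
    f = theano.function([x], (x * x).sum(), mode=mode_with_gpu)
    # After local_gpu_elemwise_careduce fires, no GpuElemwise{sqr} node
    # remains; the reduction node itself carries the pre_scalar_op.
    assert any(getattr(n.op, 'pre_scalar_op', None) is not None
               for n in f.maker.fgraph.toposort())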
@@ -60,6 +60,10 @@ def test_careduce():
                  1110,1101,1011
    TODO: test with broadcast

+    We test with the pre_scalar_op sqr in all cases. This covers all
+    the code paths, with and without the pre_scalar_op.
    """
    for scalar_op, careduce_op in [
            (theano.scalar.mul, tensor.elemwise.CAReduceDtype),
@@ -132,7 +136,7 @@ def test_careduce():
        pat = tensor_pattern_to_gpu_pattern(shape, pattern)

        a = tensor.TensorType('float32', (False,) * len(shape))()
-        b = op(a)
+        b = op(a*a)
        val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
#        val = numpy.ones(shape)
#        val = numpy.arange(numpy.prod(shape)).reshape(shape)
@@ -142,6 +146,10 @@ def test_careduce():
        assert tcn.GpuCAReduce in [x.op.__class__
                                   for x in f.maker.fgraph.toposort()], (
                                       scalar_op, shape, pattern)
+        if tcn.GpuElemwise in [x.op.__class__
+                               for x in f.maker.fgraph.toposort()]:
+            assert tcn.GpuReshape in [x.op.__class__
+                                      for x in f.maker.fgraph.toposort()]
        assert op.__class__ in [x.op.__class__
                                for x in f2.maker.fgraph.toposort()], (
                                    scalar_op, shape, pattern)
@@ -210,7 +218,7 @@ def test_careduce():
            dim_pattern[0] = 1
            dim_pattern[1] = 0
            a = a.dimshuffle(dim_pattern)
-            b = op(a)
+            b = op(a*a)
            val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
#            val = numpy.ones(shape)
#            val = numpy.arange(numpy.prod(shape)).reshape(shape)
@@ -220,6 +228,8 @@ def test_careduce():
            assert tcn.GpuCAReduce in [x.op.__class__
                                       for x in f.maker.fgraph.toposort()], (
                                           scalar_op, shape, pattern)
+            assert tcn.GpuElemwise not in [x.op.__class__
+                                           for x in f.maker.fgraph.toposort()]
            assert op.__class__ in [x.op.__class__
                                    for x in f2.maker.fgraph.toposort()], (
                                        scalar_op, shape, pattern)
@@ -242,8 +252,8 @@ def test_careduce():
            shape = numpy.asarray(shape) * 2
            a = tensor.TensorType('float32', (False,) * len(shape))()
            a2 = tcn.CudaNdarrayType((False,) * len(shape))()
-            b = op(a)
-            b2 = op(a2)
+            b = op(a*a)
+            b2 = op(a2*a2)
            val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
#            val = numpy.ones(shape)
#            val = numpy.arange(numpy.prod(shape)).reshape(shape)
@@ -266,6 +276,8 @@ def test_careduce():
            assert tcn.GpuCAReduce in [x.op.__class__
                                       for x in f2.maker.fgraph.toposort()], (
                                           scalar_op, shape, pattern)
+            assert tcn.GpuElemwise not in [x.op.__class__
+                                           for x in f.maker.fgraph.toposort()]
            assert op.__class__ in [x.op.__class__
                                    for x in f.maker.fgraph.toposort()], (
                                        scalar_op, shape, pattern)
...
@@ -22,6 +22,15 @@ from type import GpuArrayType

 def as_gpuarray_variable(x):
+    # This is needed to lower the number of useless transfers
+    # introduced during optimization. It speeds up optimization and
+    # "canonicalizes" the graph, which makes some optimizations
+    # easier to apply.
+    if (hasattr(x, 'fgraph') and
+        len(x.clients) == 1 and
+        x.owner and
+        isinstance(x.owner.op, HostFromGpu)):
+        return x.owner.inputs[0]
     if hasattr(x, '_as_GpuArrayVariable'):
         return x._as_GpuArrayVariable()
     # TODO we need to have the cuda -> gpu path taken care of.
...
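The as_gpuarray_variable hunk removes a host round trip: when a variable is a host_from_gpu copy whose only client is this transfer back to the GPU, the original GPU variable is reused directly. A toy model of the rule in plain Python (illustrative classes, not the Theano API):

    class Apply(object):
        def __init__(self, op, inputs):
            self.op, self.inputs = op, inputs

    class HostFromGpu(object):
        pass

    class Var(object):
        def __init__(self, owner=None, clients=None):
            self.owner = owner
            self.clients = clients if clients is not None else []

    def as_gpu(x):
        # Only elide when x was produced by a GPU->host copy and this
        # transfer is its sole client, so shared host uses are kept.
        if (x.owner is not None and
                isinstance(x.owner.op, HostFromGpu) and
                len(x.clients) == 1):
            return x.owner.inputs[0]
        return x

    gpu_y = Var()
    host_y = Var(owner=Apply(HostFromGpu(), [gpu_y]), clients=['one use'])
    assert as_gpu(host_y) is gpu_y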
@@ -570,10 +570,14 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
     GPUs are not especially well-suited to reduction operations so it is
     quite possible that the GPU might be slower for some cases.

+    pre_scalar_op: if present, must be a scalar op with only 1
+    input. We will execute it on the input value before reduction.
     """
     def __init__(self, scalar_op, axis=None,
-                 reduce_mask=None, dtype=None, acc_dtype=None):
+                 reduce_mask=None, dtype=None, acc_dtype=None,
+                 pre_scalar_op=None):
         if reduce_mask is not None:
             reduce_mask = tuple(reduce_mask)
         self.reduce_mask = reduce_mask
@@ -583,6 +587,9 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
         self._n_scalar_op_calls = 0
         CAReduceDtype.__init__(self, scalar_op, axis=axis,
                                dtype=dtype, acc_dtype=acc_dtype)
+        self.pre_scalar_op = pre_scalar_op
+        if pre_scalar_op:
+            assert pre_scalar_op.nin == 1

     def __eq__(self, other):
         return (type(self) == type(other) and
@@ -590,7 +597,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
                 self.reduce_mask == other.reduce_mask and
                 self.dtype == other.dtype and
                 self.acc_dtype == other.acc_dtype and
-                self.scalar_op == other.scalar_op)
+                self.scalar_op == other.scalar_op and
+                self.pre_scalar_op == other.pre_scalar_op)

     def __hash__(self):
         return (hash(type(self)) ^
@@ -598,19 +606,35 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
                 hash(self.reduce_mask) ^
                 hash(self.dtype) ^
                 hash(self.acc_dtype) ^
-                hash(type(self.scalar_op)))
+                hash(type(self.scalar_op)) ^
+                hash(type(self.pre_scalar_op)))

     def __str__(self):
+        pre = ""
+        if self.pre_scalar_op:
+            pre = "pre=%s,red=" % str(self.pre_scalar_op)
         ax = ''
         if self.axis is not None:
             ax = '{%s}' % (', '.join(str(x) for x in self.axis),)
-        return "GpuCAReduceCuda{%s}%s" % (str(self.scalar_op), ax)
+        return "GpuCAReduceCuda{%s%s}%s" % (pre, str(self.scalar_op), ax)

+    def __setstate__(self, d):
+        self.__dict__.update(d)
+        # For unpickling of old ops.
+        if not hasattr(self, "pre_scalar_op"):
+            self.pre_scalar_op = None
+
     def make_node(self, x):
         x = as_gpuarray_variable(x)
         ret = super(GpuCAReduceCuda, self).make_node(x)
         self = copy.copy(self)
         self.axis = ret.op.axis
+        if self.pre_scalar_op:
+            # Currently we have only tested pre_scalar_ops that don't
+            # cause an upcast.
+            d1 = self.__class__(scalar_op=self.scalar_op)(Elemwise(self.pre_scalar_op)(x))
+            assert d1.dtype == ret.outputs[0].dtype
+            assert Elemwise(self.pre_scalar_op)(x).dtype == x.dtype
         if self.reduce_mask is None:
             if self.axis is None:
                 reduce_mask = [1] * x.type.ndim
@@ -1010,15 +1034,33 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
         else:
             assert isinstance(self.scalar_op, (scal.Maximum,
                                                scal.Minimum))
+        if self.pre_scalar_op:  # TODO, multi_dtype!
+            #dtype = node.inputs[0].dtype
+            dtype = 'float32'
+            dummy_var = scal.Scalar(dtype=dtype)()
+            dummy_node = self.pre_scalar_op.make_node(dummy_var)
+            dummy_name = 'assign_init_pre_scalar_op' + str(self._n_scalar_op_calls)
+            self._n_scalar_op_calls += 1
+            t = self.pre_scalar_op.c_code(dummy_node, dummy_name,
+                                          (first_item,), ("",), {})
+            assert t.startswith(' = ')
+            first_item = t[3:]
+            if first_item[-1] == ';':
+                first_item = first_item[:-1]
         return first_item

-    def _assign_reduce(self, node, name, left, right, sub):
+    def _assign_reduce(self, node, name, left, right, sub, pre):
         """
         node: the node argument to this op's c_code
         name: the name argument to this op's c_code
         left: a C code string identifying an lvalue
         right: a C code string identifying an expression
         sub: the sub argument to this op's c_code
+        pre: if True, we will add the pre_scalar_op.c_code

         returns C code to reduce left and right, assigning the
         result to left."""
@@ -1035,6 +1077,18 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
         dummy_name = name + '_scalar_op' + str(self._n_scalar_op_calls)
         self._n_scalar_op_calls += 1
+        if pre and self.pre_scalar_op:
+            assert left == "myresult"
+            dummy_node = self.pre_scalar_op.make_node(dummy_left)
+            dummy_name = name + '_scalar_op' + str(self._n_scalar_op_calls)
+            self._n_scalar_op_calls += 1
+            t = self.pre_scalar_op.c_code(dummy_node, dummy_name,
+                                          (right,), ("",), sub)
+            assert t.startswith(' = ')
+            right = t[3:]
+            if right[-1] == ';':
+                right = right[:-1]
         return self.scalar_op.c_code(dummy_node, dummy_name, (left, right),
                                      (left,), sub)
@@ -1064,7 +1118,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
                int idx = threadNum - (threadCount >> 1) * 2;"""
         new_version += self._assign_reduce(node, name, 'buf[idx]',
-                                           'buf[threadNum]', sub)
+                                           'buf[threadNum]', sub, False)
         new_version += """
        }
@@ -1084,7 +1138,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
                    """
         new_version += self._assign_reduce(node, name,
-                                           'buf[threadNum]', 'temp', sub)
+                                           'buf[threadNum]', 'temp', sub, False)
         new_version += """
            }
@@ -1115,7 +1169,8 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
           {
           """
         current_version += self._assign_reduce(node, name,
-                                               'myresult', 'buf[i]', sub) + """
+                                               'myresult', 'buf[i]',
+                                               sub, False) + """
           }
           buf[threadNum] = myresult;
        /*Comment this optimization as it don't work on Fermi GPU.
@@ -1127,7 +1182,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
             current_version += self._assign_reduce(node, name,
                                                    'buf[threadNum]',
                                                    'buf[threadNum+%d]' % num,
-                                                   sub)
+                                                   sub, False)
             current_version += """
             """
             current_version += """
@@ -1146,7 +1201,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
             this_if = "if (threadNum + %d < threadCount) " % num + \
                 self._assign_reduce(node, name,
                                     'buf[threadNum]', 'buf[threadNum+%d]' % num,
-                                    sub)
+                                    sub, False)
             current_version += this_if
             current_version += """
             """
@@ -1166,7 +1221,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
     #Threads must be organized as: threadNum%nb_reduce correspond to the same sum
     #nb_reduce<=warpSize
     def _k_reduce_buf_multiple(self, z_pos, node, name, nb_reduce):
-        reduce_fct = self._assign_reduce(node, name, 'myresult', 'buf[i]', {})
+        reduce_fct = self._assign_reduce(node, name, 'myresult', 'buf[i]', {}, False)
         return """
         __syncthreads(); // some kernel do multiple reduction.
         buf[threadNum] = myresult;
@@ -1767,7 +1822,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
         reducebuf = self._k_reduce_buf('Z[0]', node, nodename, sub={})
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[0]")
         print >> sio, """
        static __global__ void kernel_reduce_ccontig_%(nodename)s(
@@ -1798,7 +1853,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
         reducebuf = self._k_reduce_buf('Z[0]', node, nodename, sub={})
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[0]")
         print >> sio, """
        static __global__ void kernel_reduce_1_%(nodename)s(
@@ -1829,7 +1884,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
         reducebuf = self._k_reduce_buf('Z[0]', node, nodename, sub={})
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[0]")
         print >> sio, """
@@ -1913,7 +1968,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
         reduce_fct = self._assign_reduce(
             node, nodename, "myresult",
             "A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0]",
-            {})
+            {}, True)
         print >> sio, """
        %(decl)s{
            %(init)s
@@ -1941,7 +1996,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
                                        node, nodename, sub={})
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[i0 * sA0 + threadIdx.x * sA1 + i2 * sA2]")
         print >> sio, """
        static __global__ void kernel_reduce_010_%(nodename)s(
@@ -1980,7 +2035,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
         if self.reduce_mask == (0, 1, 0):
             reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                              "X[a * sX0 + b * sX1 + c * sX2]",
-                                             {})
+                                             {}, True)
             reduce_init = self._assign_init("X[a * sX0 + 0 * sX1 + c * sX2]")
             print >> sio, """
        static __global__ void kernel_reduce_010_AD_%(nodename)s(
@@ -2040,7 +2095,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
                                                 'blockDim.x')
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[i0 * sA0 + 0 * sA1 + i2 * sA2]")
         print >> sio, """
        %(decl)s
@@ -2076,7 +2131,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
         reducebuf = self._k_reduce_buf('Z[blockIdx.x * sZ0]', node, nodename, sub = {})
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + blockIdx.x * sA2]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[blockIdx.x * sA2]")
         print >> sio, """
        static __global__ void kernel_reduce_110_%(nodename)s(
@@ -2117,7 +2172,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
         init = self._k_init(node, nodename)
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[i1 * sA1 + i2 * sA2]")
         print >> sio, """
        %(decl)s
@@ -2144,7 +2199,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
         init = self._k_init(node, nodename)
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[0]")
         print >> sio, """
        %(decl)s
@@ -2171,7 +2226,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
                                        node, nodename, sub={})
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[i0 * sA0 + i1 * sA1]")
         print >> sio, """
        static __global__ void kernel_reduce_001_%(nodename)s(
@@ -2214,7 +2269,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
         init = self._k_init(node, nodename)
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[i0 * sA0 + i1 * sA1]")
         print >> sio, """
        %(decl)s
@@ -2247,7 +2302,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
         init = self._k_init(node, nodename)
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[i0 * sA0 + i2 * sA2]")
         print >> sio, """
        %(decl)s
@@ -2278,7 +2333,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
         init = self._k_init(node, nodename)
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + i1 * sA1 + i2 * sA2 + i3 * sA3]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[0]")
         print >> sio, """
        %(decl)s
@@ -2304,7 +2359,7 @@ class GpuCAReduceCuda(HideC, CAReduceDtype):
                                        node, nodename, sub={})
         reduce_fct = self._assign_reduce(node, nodename, "myresult",
                                          "A[i0 * sA0 + blockIdx.x * sA1 + i2 * sA2 + i3 * sA3]",
-                                         {})
+                                         {}, True)
         reduce_init = self._assign_init("A[blockIdx.x * sA1]")
         print >> sio, """
        static __global__ void kernel_reduce_1011_%(nodename)s(
...
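Note that the make_node hunk above only asserts, rather than handles, dtype stability: a fused pre_scalar_op must leave the input dtype unchanged, which holds for sqr but not for ops that upcast. A minimal numpy illustration of the invariant being checked:

    import numpy
    x = numpy.random.rand(4).astype('float32')
    # sqr keeps float32, so fusing it ahead of the reduction is safe:
    assert (x * x).dtype == numpy.dtype('float32')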
@@ -563,6 +563,27 @@ def local_gpu_conv(node):
         return [out]

+@register_opt("low_memory")
+@local_optimizer([GpuCAReduceCuda])
+def local_gpu_elemwise_careduce(node):
+    """Merge some GpuCAReduceCuda and GpuElemwise nodes."""
+    if (isinstance(node.op, GpuCAReduceCuda) and
+        node.op.pre_scalar_op is None and
+        node.inputs[0].owner and
+        isinstance(node.inputs[0].owner.op, GpuElemwise) and
+        # The op supports all scalar ops with 1 input. We don't
+        # automatically add more cases, as some, like a trigonometric
+        # operation combined with some reduction patterns, would
+        # probably cause a slowdown.
+        isinstance(node.inputs[0].owner.op.scalar_op, scalar.basic.Sqr)
+       ):
+        op = node.op
+        inp = node.inputs[0].owner.inputs[0]
+        return [GpuCAReduceCuda(scalar_op=op.scalar_op,
+                                reduce_mask=op.reduce_mask,
+                                pre_scalar_op=scalar.basic.sqr)(inp)]
+
 def tensor_to_gpu(x):
     if isinstance(x.type, tensor.TensorType):
         y = GpuArrayType(broadcastable=x.type.broadcastable,
...
@@ -40,11 +40,13 @@ class test_GpuCAReduceCPY(test_CAReduce):
     bin_dtypes = ["uint8", "int8"]
     op = GpuCAReduceCPY
     reds = [scalar.add, scalar.mul]
+    pre_scalar_op = None

     def test_perform(self):
         for dtype in self.dtypes + self.bin_dtypes:
             for op in self.reds:
-                self.with_linker(gof.PerformLinker(), op, dtype=dtype)
+                self.with_linker(gof.PerformLinker(), op, dtype=dtype,
+                                 pre_scalar_op=self.pre_scalar_op)

     def test_perform_nan(self):
         for dtype in self.dtypes:
@@ -52,12 +54,14 @@ class test_GpuCAReduceCPY(test_CAReduce):
                 continue
             for op in self.reds:
                 self.with_linker(gof.PerformLinker(), op, dtype=dtype,
-                                 test_nan=True)
+                                 test_nan=True,
+                                 pre_scalar_op=self.pre_scalar_op)

     def test_c(self):
         for dtype in self.dtypes + self.bin_dtypes:
             for op in self.reds:
-                self.with_linker(gof.CLinker(), op, dtype=dtype)
+                self.with_linker(gof.CLinker(), op, dtype=dtype,
+                                 pre_scalar_op=self.pre_scalar_op)

     def test_c_nan(self):
         for dtype in self.dtypes:
@@ -65,7 +69,8 @@ class test_GpuCAReduceCPY(test_CAReduce):
                 continue
             for op in self.reds:
                 self.with_linker(gof.CLinker(), op, dtype=dtype,
-                                 test_nan=True)
+                                 test_nan=True,
+                                 pre_scalar_op=self.pre_scalar_op)

     def test_infer_shape(self):
         for dtype in self.dtypes:
@@ -148,6 +153,7 @@ class test_GpuCAReduceCuda(test_GpuCAReduceCPY):
     op = GpuCAReduceCuda
     reds = [scalar.add, scalar.mul,
             scalar.maximum, scalar.minimum]
+    pre_scalar_op = scalar.sqr

     def test_perform(self):
         return
...
@@ -133,3 +133,13 @@ def test_print_op():
     assert isinstance(topo[2].op, GpuElemwise)
     assert topo[3].op == host_from_gpu
     f(numpy.random.random((5, 5)).astype('float32'))
+
+
+def test_local_gpu_elemwise_careduce():
+    x = theano.tensor.matrix()
+    o = (x*x).sum()
+    f = theano.function([x], o, mode=mode_with_gpu)
+    topo = f.maker.fgraph.toposort()
+    assert len(topo) == 3
+    assert topo[1].op.pre_scalar_op == theano.scalar.sqr
+    f(numpy.random.rand(3, 4).astype(theano.config.floatX))
@@ -308,15 +308,19 @@ class test_CAReduce(unittest_tools.InferShapeTester):
         ]

     def with_linker(self, linker, scalar_op=scalar.add, dtype="floatX",
+                    pre_scalar_op=None,
                     test_nan=False, tensor_op=None):
         for xsh, tosum in self.cases:
             if dtype == "floatX":
                 dtype = theano.config.floatX
             x = TensorType(dtype, [(entry == 1) for entry in xsh])('x')
+            d = {}
+            if pre_scalar_op is not None:
+                d = {"pre_scalar_op": pre_scalar_op}
             if tensor_op is None:
-                e = as_tensor_variable(self.op(scalar_op, axis=tosum)(x))
+                e = as_tensor_variable(self.op(scalar_op, axis=tosum, **d)(x))
             else:
-                e = as_tensor_variable(tensor_op(x, axis=tosum))
+                e = as_tensor_variable(tensor_op(x, axis=tosum, **d))
             if tosum is None:
                 tosum = range(len(xsh))
@@ -337,6 +341,8 @@ class test_CAReduce(unittest_tools.InferShapeTester):
             else:
                 xv = numpy.asarray(numpy.nan, dtype=dtype)
             zv = xv
+            if pre_scalar_op is not None:
+                zv = Elemwise(scalar_op=pre_scalar_op)(x).eval({x: xv})
             numpy_raised = False
             if len(tosum) > 1 and any([a < 0 for a in tosum]):
                 #In that case, we need to use the good order of axis
@@ -505,16 +511,22 @@ class test_CAReduce(unittest_tools.InferShapeTester):
                 self.with_linker(gof.CLinker(), scalar.maximum, dtype=dtype,
                                  test_nan=True)

-    def test_infer_shape(self, dtype=None):
+    def test_infer_shape(self, dtype=None, pre_scalar_op=None):
         if dtype is None:
             dtype = theano.config.floatX
         for xsh, tosum in self.cases:
             x = TensorType(dtype, [(entry == 1) for entry in xsh])('x')
+            if pre_scalar_op is not None:
+                x = pre_scalar_op(x)
             if tosum is None:
                 tosum = range(len(xsh))
             xv = numpy.asarray(numpy.random.rand(*xsh), dtype=dtype)
+            d = {}
+            if pre_scalar_op is not None:
+                xv = x.eval({x.owner.inputs[0]: xv})
+                d = {"pre_scalar_op": pre_scalar_op}
             self._compile_and_check([x],
-                                    [self.op(scalar.add, axis=tosum)(x)],
+                                    [self.op(scalar.add, axis=tosum, **d)(x)],
                                     [xv], self.op,
                                     ["local_cut_useless_reduce"],
                                     warn=0 not in xsh)
...