Add an optimization that uses the pre_scalar_op of GpuCAReduceCuda with the sqr op.

c5bae0fb · Frederic · daf196e6 · c5bae0fb · c5bae0fb · c5bae0fb
--- a/theano/sandbox/gpuarray/basic_ops.py
+++ b/theano/sandbox/gpuarray/basic_ops.py
@@ -22,6 +22,15 @@ from type import GpuArrayType
 def as_gpuarray_variable(x):
+    # This is needed to lower the number of useless transfers
+    # introduced during optimization.  This speeds up optimization and
+    # "canonicalizes" the graph, which makes some optimizations
+    # easier to write.
+    if (hasattr(x, 'fgraph') and
+        len(x.clients) == 1 and
+        x.owner and
+        isinstance(x.owner.op, HostFromGpu)):
+        return x.owner.inputs[0]
    if hasattr(x, '_as_GpuArrayVariable'):
        return x._as_GpuArrayVariable()
    # TODO we need to have the cuda -> gpu path taken care of.

--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
@@ -563,6 +563,27 @@ def local_gpu_conv(node):
    return [out]
+@register_opt("low_memory")
+@local_optimizer([GpuCAReduceCuda])
+def local_gpu_elemwise_careduce(node):
+    """Fold a squaring GpuElemwise into the GpuCAReduceCuda that reads it.
+
+    The rewrite applies when `node` is a GpuCAReduceCuda without a
+    pre_scalar_op whose input is produced by a GpuElemwise with a Sqr
+    scalar op.  The replacement reduction reads the elemwise's input
+    directly and uses sqr as its pre_scalar_op; it is built from the
+    original op's scalar_op and reduce_mask only.
+
+    Returns a one-element list holding the new output, or None when the
+    pattern does not match.
+    """
+    reduction = node.op
+    if not isinstance(reduction, GpuCAReduceCuda):
+        return None
+    if reduction.pre_scalar_op is not None:
+        return None
+    producer = node.inputs[0].owner
+    if not producer or not isinstance(producer.op, GpuElemwise):
+        return None
+    # Only sqr is fused, although the reduction supports every scalar
+    # op with one input: other ops (e.g. trigonometric ones) combined
+    # with some reduction patterns would probably result in a slowdown.
+    if not isinstance(producer.op.scalar_op, scalar.basic.Sqr):
+        return None
+    fused = GpuCAReduceCuda(scalar_op=reduction.scalar_op,
+                            reduce_mask=reduction.reduce_mask,
+                            pre_scalar_op=scalar.basic.sqr)
+    return [fused(producer.inputs[0])]
 def tensor_to_gpu(x):
    if isinstance(x.type, tensor.TensorType):
        y = GpuArrayType(broadcastable=x.type.broadcastable,

--- a/theano/sandbox/gpuarray/tests/test_opt.py
+++ b/theano/sandbox/gpuarray/tests/test_opt.py
@@ -133,3 +133,13 @@ def test_print_op():
    assert isinstance(topo[2].op, GpuElemwise)
    assert topo[3].op == host_from_gpu
    f(numpy.random.random((5, 5)).astype('float32'))
+def test_local_gpu_elemwise_careduce():
+    """Check that squaring followed by a sum compiles to a fused reduction.
+
+    Compiles ``(x*x).sum()`` with the GPU mode and verifies that the
+    graph has three nodes, that the middle one carries sqr as its
+    pre_scalar_op, and that the compiled function returns the same value
+    as the equivalent NumPy computation.
+    """
+    x = theano.tensor.matrix()
+    o = (x*x).sum()
+    f = theano.function([x], o, mode=mode_with_gpu)
+    topo = f.maker.fgraph.toposort()
+    # NOTE(review): the three nodes are presumably transfer-in,
+    # reduction and transfer-out; only the count and the middle node's
+    # pre_scalar_op are actually checked here.
+    assert len(topo) == 3
+    assert topo[1].op.pre_scalar_op == theano.scalar.sqr
+    data = numpy.random.rand(3, 4).astype(theano.config.floatX)
+    # Running the function is not enough: compare against a NumPy
+    # reference so a fused kernel computing the wrong value is caught.
+    # The tolerance is loose because the reduction order may differ.
+    assert numpy.allclose(f(data), (data * data).sum(), rtol=1e-4)