提交 7e04466d authored 作者: Pascal Lamblin's avatar Pascal Lamblin

Merge pull request #4401 from nouiz/gpu_contiguous

Opt CpuContiguous -> GpuContiguous and add CpuContiguous.grad
...@@ -2280,6 +2280,22 @@ def local_gpu_contiguous_gpu_contiguous(node): ...@@ -2280,6 +2280,22 @@ def local_gpu_contiguous_gpu_contiguous(node):
return [inp] return [inp]
@register_opt('fast_compile')
@local_optimizer([GpuFromHost, tensor.extra_ops.CpuContiguous])
def local_gpu_contiguous(node):
    """Replace CpuContiguous with GpuContiguous around GPU transfers.

    Two patterns are rewritten:

    - ``cpu_contiguous(host_from_gpu(x))`` becomes
      ``gpu_contiguous(x)`` wrapped back into a host tensor variable;
    - ``gpu_from_host(cpu_contiguous(x))`` becomes ``gpu_contiguous(x)``.

    Returns the replacement list on a match, ``False`` otherwise.
    """
    op = node.op
    if isinstance(op, tensor.extra_ops.CpuContiguous):
        inp, = node.inputs
        prev = inp.owner
        if prev is not None and isinstance(prev.op, HostFromGpu):
            gpu_inp, = prev.inputs
            # Do the contiguity work on the GPU, then present the result
            # as a host tensor variable again so the graph type is kept.
            return [tensor.as_tensor_variable(gpu_contiguous(gpu_inp))]
    if isinstance(op, GpuFromHost):
        inp, = node.inputs
        prev = inp.owner
        if prev is not None and isinstance(prev.op,
                                           tensor.extra_ops.CpuContiguous):
            contig_inp, = prev.inputs
            # The transfer already moves the data: make it contiguous on
            # the GPU side instead of on the CPU first.
            return [gpu_contiguous(contig_inp)]
    return False
@register_opt() @register_opt()
@local_optimizer([gpu_from_host, tensor.Eye]) @local_optimizer([gpu_from_host, tensor.Eye])
def local_gpu_eye(node): def local_gpu_eye(node):
......
...@@ -108,6 +108,16 @@ def test_local_gpu_contiguous_gpu_contiguous(): ...@@ -108,6 +108,16 @@ def test_local_gpu_contiguous_gpu_contiguous():
if isinstance(node.op, basic_ops.GpuContiguous)]) if isinstance(node.op, basic_ops.GpuContiguous)])
def test_local_gpu_contiguous():
    """Check that cpu_contiguous compiles to exactly one GpuContiguous,
    both when the output stays on the host and when it is moved to the
    GPU explicitly."""
    a = tensor.fmatrix()
    out = tensor.extra_ops.cpu_contiguous(a)
    for graph_out in (out, cuda.gpu_from_host(out)):
        fn = theano.function([a], graph_out, mode=mode_with_gpu)
        topo = fn.maker.fgraph.toposort()
        n_contig = sum(isinstance(node.op, basic_ops.GpuContiguous)
                       for node in topo)
        assert n_contig == 1
        fn([[2.]])
def test_local_assert_no_cpu_op(): def test_local_assert_no_cpu_op():
numpy.random.seed(1) numpy.random.seed(1)
m = numpy.random.uniform(-1, 1, (10, 10)).astype("float32") m = numpy.random.uniform(-1, 1, (10, 10)).astype("float32")
......
...@@ -29,7 +29,7 @@ from .type import (GpuArrayType, GpuArrayConstant, get_context, ...@@ -29,7 +29,7 @@ from .type import (GpuArrayType, GpuArrayConstant, get_context,
from .basic_ops import (as_gpuarray_variable, infer_context_name, from .basic_ops import (as_gpuarray_variable, infer_context_name,
host_from_gpu, GpuToGpu, host_from_gpu, GpuToGpu,
HostFromGpu, GpuFromHost, HostFromGpu, GpuFromHost,
GpuSplit, GpuContiguous, GpuSplit, GpuContiguous, gpu_contiguous,
GpuAlloc, GpuAllocEmpty, GpuReshape, GpuAlloc, GpuAllocEmpty, GpuReshape,
GpuEye, gpu_join, GpuJoin) GpuEye, gpu_join, GpuJoin)
from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer, GpuGemmBatch, from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer, GpuGemmBatch,
...@@ -343,6 +343,12 @@ def local_gpu_contiguous_gpu_contiguous(node): ...@@ -343,6 +343,12 @@ def local_gpu_contiguous_gpu_contiguous(node):
return [inp] return [inp]
@register_opt('fast_compile')
@op_lifter([tensor.extra_ops.CpuContiguous])
def local_gpu_contiguous(node, context_name):
    """Lift CpuContiguous to its GPU counterpart (GpuContiguous)."""
    # op_lifter only needs the replacement op; it handles the transfers.
    return gpu_contiguous
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Reshape]) @op_lifter([tensor.Reshape])
def local_gpureshape(node, context_name): def local_gpureshape(node, context_name):
......
...@@ -64,6 +64,15 @@ def test_local_gpu_contiguous_gpu_contiguous(): ...@@ -64,6 +64,15 @@ def test_local_gpu_contiguous_gpu_contiguous():
if isinstance(node.op, basic_ops.GpuContiguous)]) if isinstance(node.op, basic_ops.GpuContiguous)])
def test_local_gpu_contiguous():
    """cpu_contiguous must be lifted to a single GpuContiguous node on
    the new gpuarray backend."""
    inp = tensor.fmatrix()
    contig = tensor.extra_ops.cpu_contiguous(inp)
    fn = theano.function([inp], contig, mode=mode_with_gpu)
    gpu_nodes = [node for node in fn.maker.fgraph.toposort()
                 if isinstance(node.op, basic_ops.GpuContiguous)]
    assert len(gpu_nodes) == 1
    fn([[2.]])
def test_flatten(): def test_flatten():
m = theano.tensor.fmatrix() m = theano.tensor.fmatrix()
f = theano.function([m], m.flatten(), mode=mode_with_gpu) f = theano.function([m], m.flatten(), mode=mode_with_gpu)
......
...@@ -35,6 +35,9 @@ class CpuContiguous(theano.Op): ...@@ -35,6 +35,9 @@ class CpuContiguous(theano.Op):
assert x.flags['C_CONTIGUOUS'] assert x.flags['C_CONTIGUOUS']
y[0] = x y[0] = x
def grad(self, inputs, dout):
    """Gradient of CpuContiguous: pass the output gradient through.

    Making an array C-contiguous copies values without changing them,
    so the gradient w.r.t. the input is the output gradient itself
    (coerced to a tensor variable).
    """
    dy, = dout
    return [theano.tensor.as_tensor_variable(dy)]
def c_code(self, node, name, inames, onames, sub): def c_code(self, node, name, inames, onames, sub):
x, = inames x, = inames
y, = onames y, = onames
......
...@@ -25,12 +25,16 @@ def test_cpu_contiguous(): ...@@ -25,12 +25,16 @@ def test_cpu_contiguous():
a = T.fmatrix('a') a = T.fmatrix('a')
i = T.iscalar('i') i = T.iscalar('i')
a_val = numpy.asarray(numpy.random.rand(4, 5), dtype='float32') a_val = numpy.asarray(numpy.random.rand(4, 5), dtype='float32')
f = theano.function([a, i], cpu_contiguous(a.reshape((5,4))[::i])) f = theano.function([a, i], cpu_contiguous(a.reshape((5, 4))[::i]))
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert any([isinstance(node.op, CpuContiguous) for node in topo]) assert any([isinstance(node.op, CpuContiguous) for node in topo])
assert f(a_val, 1).flags['C_CONTIGUOUS'] assert f(a_val, 1).flags['C_CONTIGUOUS']
assert f(a_val, 2).flags['C_CONTIGUOUS'] assert f(a_val, 2).flags['C_CONTIGUOUS']
assert f(a_val, 3).flags['C_CONTIGUOUS'] assert f(a_val, 3).flags['C_CONTIGUOUS']
# Test the grad:
theano.tests.unittest_tools.verify_grad(cpu_contiguous,
[numpy.random.rand(5, 7, 2)])
class TestCumsumOp(utt.InferShapeTester): class TestCumsumOp(utt.InferShapeTester):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论