提交 85f71330 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #2482 from craffel/master

Adding GpuSplit op to cuda submodule
......@@ -288,7 +288,7 @@ if cuda_available:
GpuDimShuffle, GpuCAReduce, GpuReshape, GpuContiguous,
GpuSubtensor, GpuIncSubtensor,
GpuAdvancedSubtensor1, GpuAdvancedIncSubtensor1,
GpuFlatten, GpuShape, GpuAlloc,
GpuFlatten, GpuShape, GpuAlloc, GpuSplit,
GpuJoin, fscalar, fvector, fmatrix, frow, fcol,
ftensor3, ftensor4,
scalar, vector, matrix, row, col,
......
......@@ -3229,6 +3229,16 @@ class GpuJoin(tensor.Join, GpuOp):
gpu_join = GpuJoin()
class GpuSplit(tensor.Split, GpuOp):
    """Split for CudaNdarray inputs.

    Same semantics as ``tensor.Split``, but the outputs are typed as
    CudaNdarray variables so the result stays on the GPU.
    """

    def make_node(self, x, axis, splits):
        # Reuse the CPU Split's argument checking/shape logic, then swap
        # every output for a GPU-typed variable with the same dtype and
        # broadcastable pattern.
        gpu_x = as_cuda_ndarray_variable(x)
        cpu_node = tensor.Split.make_node(self, gpu_x, axis, splits)
        gpu_outputs = []
        for out in cpu_node.outputs:
            out_type = CudaNdarrayType(dtype=out.dtype,
                                       broadcastable=out.type.broadcastable)
            gpu_outputs.append(out_type())
        return Apply(self, [gpu_x] + cpu_node.inputs[1:], gpu_outputs)
class GpuAlloc(GpuOp):
"""Implement Alloc on the gpu.
......
......@@ -24,7 +24,7 @@ from theano.sandbox.cuda.basic_ops import (
GpuElemwise, GpuDimShuffle, GpuReshape, GpuCAReduce, GpuFlatten,
GpuSubtensor, GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20,
GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape)
GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit)
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar,
gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv,
......@@ -299,6 +299,25 @@ def local_gpu_elemwise_1(node):
return False
@register_opt()
@local_optimizer([tensor.Split, gpu_from_host])
def local_gpu_split(node):
    """Lift Split onto the GPU.

    Two rewrite patterns are handled:

      split(host_from_gpu(x), ...)     -> host_from_gpu(gpu_split(x, ...))
      gpu_from_host(split(x, ...)[i])  -> gpu_split(gpu_from_host(x), ...)[i]

    Returns the list of replacement variables on a match, or False when
    neither pattern applies.
    """
    op = node.op
    if isinstance(op, tensor.Split):
        split_in = node.inputs[0]
        if split_in.owner and isinstance(split_in.owner.op, HostFromGpu):
            gpu_op = GpuSplit(op.len_splits)
            pieces = gpu_op(gpu_from_host(split_in), *node.inputs[1:])
            # Each GPU piece must be transferred back so the graph's
            # outputs keep their original (host) type.
            return [host_from_gpu(piece) for piece in pieces]
    if isinstance(op, GpuFromHost):
        host_in = node.inputs[0]
        if host_in.owner and isinstance(host_in.owner.op, tensor.Split):
            cpu_split = host_in.owner
            gpu_op = GpuSplit(cpu_split.op.len_splits)
            gpu_pieces = gpu_op(gpu_from_host(cpu_split.inputs[0]),
                                *cpu_split.inputs[1:])
            # Only the output feeding this GpuFromHost is replaced;
            # host_in.index identifies which split piece that is.
            return [gpu_pieces[host_in.index]]
    return False
@register_opt()
@local_optimizer([tensor.DimShuffle, gpu_from_host])
def local_gpu_dimshuffle_0(node):
......
......@@ -291,6 +291,27 @@ def test_local_gpu_subtensor():
assert any([isinstance(node.op, cuda.GpuElemwise) for node in topo])
def test_local_split():
    """Check that local_gpu_split rewrites tensor.Split into GpuSplit
    and that the GPU results match the CPU ones."""
    vec = tensor.fvector()
    sizes = tensor.lvector()
    outs = tensor.split(vec, sizes, n_splits=3, axis=0)
    data = [0, 1, 2, 3, 4, 5]
    lens = [3, 2, 1]

    # CPU compilation must keep the plain tensor.Split op.
    f_cpu = theano.function([vec, sizes], outs, mode=mode_without_gpu)
    cpu_res = f_cpu(data, lens)
    assert any(isinstance(n.op, theano.tensor.Split)
               for n in f_cpu.maker.fgraph.toposort())

    # GPU compilation must have substituted GpuSplit.
    f_gpu = theano.function([vec, sizes], outs, mode=mode_with_gpu)
    gpu_res = f_gpu(data, lens)
    assert any(isinstance(n.op, theano.sandbox.cuda.GpuSplit)
               for n in f_gpu.maker.fgraph.toposort())

    # The two backends must agree piece by piece.
    for cpu_piece, gpu_piece in zip(cpu_res, gpu_res):
        assert (cpu_piece == gpu_piece).all()
def test_print_op():
""" Test that print ops don't block gpu optimization"""
b = tensor.fmatrix()
......
......@@ -3213,7 +3213,7 @@ class Split(Op):
for i in xrange(self.len_splits):
upper_idx = lower_idx + splits[i]
general_key[axis] = slice(lower_idx, upper_idx, None)
outputs[i][0] = x.__getitem__(general_key).copy()
outputs[i][0] = x.__getitem__(tuple(general_key)).copy()
lower_idx = upper_idx
def infer_shape(self, node, in_shapes):
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论