Commit b740b55f authored by Arnaud Bergeron

Add GpuJoin and GpuSplit to gpuarray.

Parent commit: 409552f5
...@@ -6,7 +6,7 @@ import theano ...@@ -6,7 +6,7 @@ import theano
from theano import Op, Apply from theano import Op, Apply
from theano import tensor, scalar, config from theano import tensor, scalar, config
from theano.scalar import Scalar from theano.scalar import Scalar
from theano.tensor.basic import Alloc from theano.tensor.basic import Alloc, Join, Split
from theano.gof.python25 import any from theano.gof.python25 import any
from theano.gof.utils import MethodNotDefined from theano.gof.utils import MethodNotDefined
...@@ -725,6 +725,62 @@ class GpuReshape(HideC, tensor.Reshape): ...@@ -725,6 +725,62 @@ class GpuReshape(HideC, tensor.Reshape):
out[0] = x.reshape(tuple(shp)) out[0] = x.reshape(tuple(shp))
class GpuJoin(HideC, Join):
    """
    Concatenate tensors along a given axis, on the GPU.

    Inherits the CPU ``Join`` op's interface and validation; only
    ``make_node``, ``perform`` and the C implementation are overridden
    to operate on GpuArray data.
    """
    def make_node(self, axis, *tensors):
        # Let the CPU Join do the axis/shape checking and infer the output
        # dtype and broadcastable pattern, then rebuild the Apply with
        # GPU-typed data inputs and a GPU-typed output.  The axis input
        # (node.inputs[0]) is kept as-is on the CPU.
        node = Join.make_node(self, axis, *tensors)
        return Apply(self, [node.inputs[0]] + map(as_gpuarray_variable,
                                                  tensors),
                     [GpuArrayType(broadcastable=node.outputs[0].broadcastable,
                                   dtype=node.outputs[0].dtype)()])

    def perform(self, node, axis_and_tensors, out_):
        """Python fallback: inputs are ``[axis, tensor1, tensor2, ...]``."""
        out, = out_
        axis = axis_and_tensors[0]
        tensors = axis_and_tensors[1:]
        # astype() forces the result to the dtype make_node inferred
        # (the inputs may have been upcast there).
        out[0] = pygpu.concatenate(tensors, axis=axis).astype(
            node.outputs[0].dtype)

    def c_code_cache_version(self):
        # Bump this whenever the generated C code below changes.
        return (0,)

    def c_code(self, node, name, inputs, out_, sub):
        # inputs[0] is the axis scalar; inputs[1:] are the GpuArray inputs.
        # Build one "als[i] = &<input>->ga;" statement per data input so the
        # C code can hand pygpu_concatenate an array of GpuArray pointers.
        copy_to_list = []
        restype=pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
        for i, inp in enumerate(inputs[1:]):
            copy_to_list.append("als[%s] = &%s->ga;" % (i, inp))
        return """
        GpuArray **als = (GpuArray **)PyMem_Malloc(sizeof(GpuArray *) * %(n)s);
        if (als == NULL) {
            PyErr_NoMemory();
            %(fail)s
        }
        %(copy_inputs_to_list)s
        Py_XDECREF(%(out)s);
        %(out)s = pygpu_concatenate(als, %(n)s, PyInt_AsLong((PyObject *)%(axis)s),
                                    %(restype)s, (PyObject *)&PyGpuArrayType,
                                    pygpu_default_context());
        PyMem_Free(als);
        if (%(out)s == NULL)
            %(fail)s
        """ % dict(n=len(inputs[1:]), fail=sub['fail'], out=out_[0],
                   axis=inputs[0], copy_inputs_to_list='\n'.join(copy_to_list),
                   restype=restype)

# Singleton instance used by the graph optimizers.
gpu_join = GpuJoin()
class GpuSplit(HideC, Split):
    """
    Split a tensor into several subtensors along one axis, on the GPU.

    Only ``make_node`` is specialized: all validation and output-pattern
    inference is delegated to the CPU ``Split`` op, and its ``perform``
    is inherited unchanged (per the original author's note, it is
    presumed suitable for GpuArray inputs as well).
    """
    def make_node(self, x, axis, splits):
        # Build the CPU node first to reuse Split's checking/inference,
        # then swap in GPU-typed variables for the data input and outputs.
        cpu_node = Split.make_node(self, x, axis, splits)
        gpu_x = as_gpuarray_variable(x)
        gpu_outputs = []
        for cpu_out in cpu_node.outputs:
            gpu_outputs.append(
                GpuArrayType(dtype=cpu_out.dtype,
                             broadcastable=cpu_out.broadcastable)())
        return Apply(self, [gpu_x] + cpu_node.inputs[1:], gpu_outputs)
class GpuEye(GpuKernelBase, Op): class GpuEye(GpuKernelBase, Op):
def __init__(self, dtype=None): def __init__(self, dtype=None):
if dtype is None: if dtype is None:
......
...@@ -21,7 +21,7 @@ from theano.tensor.nnet.conv import ConvOp ...@@ -21,7 +21,7 @@ from theano.tensor.nnet.conv import ConvOp
from theano.sandbox.gpuarray.type import GpuArrayType from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.basic_ops import ( from theano.sandbox.gpuarray.basic_ops import (
host_from_gpu, gpu_from_host, HostFromGpu, host_from_gpu, gpu_from_host, HostFromGpu,
gpu_alloc, GpuAlloc, GpuReshape, GpuEye gpu_alloc, GpuAlloc, GpuReshape, GpuEye, gpu_join,
) )
from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm, GpuGer from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm, GpuGer
from theano.sandbox.gpuarray.conv import GpuConv from theano.sandbox.gpuarray.conv import GpuConv
...@@ -267,6 +267,18 @@ def local_gpua_specifyShape(node): ...@@ -267,6 +267,18 @@ def local_gpua_specifyShape(node):
return tensor.specify_shape return tensor.specify_shape
@register_opt()
@op_lifter([tensor.Join])
def local_gpua_join(node):
    """Lift a CPU Join node to the GPU by substituting the GpuJoin op.

    Join carries no per-node parameters, so the shared ``gpu_join``
    singleton can be returned for every lifted node.
    """
    return gpu_join
@register_opt()
@op_lifter([tensor.Split])
def local_gpua_split(node):
    """Lift a CPU Split node to the GPU.

    Unlike Join, Split is parameterized (by the number of splits), so a
    fresh GpuSplit matching the replaced node's ``len_splits`` is built.
    """
    return GpuSplit(node.op.len_splits)
@register_opt() @register_opt()
@op_lifter([tensor.Subtensor]) @op_lifter([tensor.Subtensor])
def local_gpua_subtensor(node): def local_gpua_subtensor(node):
......
...@@ -7,7 +7,9 @@ import theano ...@@ -7,7 +7,9 @@ import theano
import theano.tensor as T import theano.tensor as T
from theano.tensor import TensorType from theano.tensor import TensorType
from theano.tensor.basic import alloc from theano.tensor.basic import alloc
from theano.tensor.tests.test_basic import rand, safe_make_node, T_reshape from theano.tensor.tests.test_basic import (
rand, safe_make_node, T_reshape, T_Join_and_Split
)
from theano.tests.unittest_tools import SkipTest from theano.tests.unittest_tools import SkipTest
from numpy.testing.noseclasses import KnownFailureTest from numpy.testing.noseclasses import KnownFailureTest
...@@ -38,7 +40,7 @@ from theano.sandbox.gpuarray.basic_ops import ( ...@@ -38,7 +40,7 @@ from theano.sandbox.gpuarray.basic_ops import (
gpu_from_cuda, gpu_from_cuda,
cuda_from_gpu, HostFromGpu, cuda_from_gpu, HostFromGpu,
GpuFromHost, GpuReshape, GpuFromHost, GpuReshape,
GpuEye) GpuJoin, GpuSplit, GpuEye)
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
utt.seed_rng() utt.seed_rng()
...@@ -339,6 +341,20 @@ class G_reshape(T_reshape): ...@@ -339,6 +341,20 @@ class G_reshape(T_reshape):
assert self.op == GpuReshape assert self.op == GpuReshape
class G_Join_and_Split(T_Join_and_Split):
    """Run the CPU Join/Split test suite against the gpuarray backend."""

    def setUp(self):
        super(G_Join_and_Split, self).setUp()
        self.shared = gpuarray_shared_constructor
        self.mode = mode_with_gpu.excluding('constant_folding')
        # Stick to float32 to avoid errors on devices with limited
        # (e.g. no float64) support.
        self.floatX = 'float32'
        self.join_op = GpuJoin
        self.split_op = GpuSplit
        # There is no MakeVector op on the GPU, so GpuJoin stands in.
        self.make_vector_op = GpuJoin
        self.hide_error = theano.config.mode not in ['DebugMode',
                                                     'DEBUG_MODE']
def test_gpueye(): def test_gpueye():
def check(dtype, N, M_=None): def check(dtype, N, M_=None):
# Theano does not accept None as a tensor. # Theano does not accept None as a tensor.
......
Markdown is supported
0%
You are about to add 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to comment