Merge pull request #1831 from abergeron/gpuarray_joinsplit

GpuArray join/split

Merge pull request #1831 from abergeron/gpuarray_joinsplit
71014302 · Frédéric Bastien · 31f4377c · 9aaa972a · 71014302 · 71014302
--- a/theano/sandbox/gpuarray/basic_ops.py
+++ b/theano/sandbox/gpuarray/basic_ops.py
@@ -6,7 +6,7 @@ import theano
 from theano import Op, Apply
 from theano import tensor, scalar, config
 from theano.scalar import Scalar
-from theano.tensor.basic import Alloc
+from theano.tensor.basic import Alloc, Join, Split
 from theano.gof.python25 import any
 from theano.gof.utils import MethodNotDefined
@@ -725,6 +725,62 @@ class GpuReshape(HideC, tensor.Reshape):
        out[0] = x.reshape(tuple(shp))
+class GpuJoin(HideC, Join):
+    """
+    Join (concatenate) gpuarray variables along an axis.
+    The node inputs are the axis followed by the tensors to join; the
+    single output is a GpuArrayType variable.
+    """
+    def make_node(self, axis, *tensors):
+        """
+        Build the Apply node from `axis` and the `tensors` to join.
+        """
+        # Reuse the make_node of the CPU op to get the axis input and the
+        # dtype/broadcastable pattern of the output.
+        node = Join.make_node(self, axis, *tensors)
+        # Build a real list here: `list + map(...)` only works when map()
+        # returns a list, which is not the case on Python 3.
+        gpu_tensors = [as_gpuarray_variable(t) for t in tensors]
+        return Apply(self, [node.inputs[0]] + gpu_tensors,
+                     [GpuArrayType(broadcastable=node.outputs[0].broadcastable,
+                                   dtype=node.outputs[0].dtype)()])
+    def perform(self, node, axis_and_tensors, out_):
+        """
+        Python implementation: concatenate with pygpu and cast the result
+        to the dtype of the node output.
+        """
+        out, = out_
+        # first input is the axis, the remaining ones are the tensors
+        axis = int(axis_and_tensors[0])
+        tensors = axis_and_tensors[1:]
+        out[0] = pygpu.concatenate(tensors, axis=axis).astype(
+            node.outputs[0].dtype)
+    def c_code_cache_version(self):
+        return (1,)
+    def c_code(self, node, name, inputs, out_, sub):
+        """
+        Return the C code that calls pygpu_concatenate on the inputs.
+        """
+        restype = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
+        # One C statement per tensor input, storing the address of its
+        # GpuArray struct in the `als` array allocated by the template.
+        copy_to_list = ["als[%s] = &%s->ga;" % (i, inp)
+                        for i, inp in enumerate(inputs[1:])]
+        # The template allocates `als`, fills it, releases the previous
+        # output, calls pygpu_concatenate and frees `als` before checking
+        # the result.
+        return """
+GpuArray **als = (GpuArray **)PyMem_Malloc(sizeof(GpuArray *) * %(n)s);
+if (als == NULL) {
+  PyErr_NoMemory();
+  %(fail)s
+}
+%(copy_inputs_to_list)s
+Py_XDECREF(%(out)s);
+%(out)s = pygpu_concatenate(als, %(n)s, PyInt_AsLong((PyObject *)%(axis)s),
+                            %(restype)s, (PyObject *)&PyGpuArrayType,
+                            pygpu_default_context());
+PyMem_Free(als);
+if (%(out)s == NULL)
+  %(fail)s
+        """ % dict(n=len(inputs[1:]), fail=sub['fail'], out=out_[0],
+                   axis=inputs[0], copy_inputs_to_list='\n'.join(copy_to_list),
+                   restype=restype)
+gpu_join = GpuJoin()
+class GpuSplit(HideC, Split):
+    """
+    Split op whose input and outputs are gpuarray variables.
+    """
+    def make_node(self, x, axis, splits):
+        """
+        Build the Apply node: `x` is moved to the gpu, the other inputs
+        are taken unchanged from the node built by the CPU op.
+        """
+        cpu_node = Split.make_node(self, x, axis, splits)
+        gpu_x = as_gpuarray_variable(x)
+        # one gpu output per CPU output, same dtype and broadcastable pattern
+        gpu_outs = []
+        for cpu_out in cpu_node.outputs:
+            out_type = GpuArrayType(dtype=cpu_out.dtype,
+                                    broadcastable=cpu_out.broadcastable)
+            gpu_outs.append(out_type())
+        gpu_inputs = [gpu_x] + cpu_node.inputs[1:]
+        return Apply(self, gpu_inputs, gpu_outs)
+    # we reuse the perform of the CPU op, which is suitable
 class GpuEye(GpuKernelBase, Op):
    def __init__(self, dtype=None):
        if dtype is None:

--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
@@ -21,7 +21,7 @@ from theano.tensor.nnet.conv import ConvOp
 from theano.sandbox.gpuarray.type import GpuArrayType
 from theano.sandbox.gpuarray.basic_ops import (
    host_from_gpu, gpu_from_host, HostFromGpu,
-    gpu_alloc, GpuAlloc, GpuReshape, GpuEye
+    gpu_alloc, GpuAlloc, GpuReshape, GpuEye, gpu_join, GpuJoin,
    )
 from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm, GpuGer
 from theano.sandbox.gpuarray.conv import GpuConv
@@ -152,9 +152,27 @@ optdb['canonicalize'].register('local_cut_gpua_host_gpua',
                               local_cut_gpu_host_gpu, 'fast_run', 'gpuarray')
+@register_opt()
+@local_optimizer([tensor.Alloc])
+def local_gpuaalloc2(node):
+    """
+    Join(axis, Alloc, Alloc, ...) -> Join(axis, GpuAlloc, Alloc, ...)
+    Replace an Alloc by host_from_gpu(GpuAlloc) when every client of its
+    output is a join whose tensor inputs all come from host_from_gpu or
+    alloc.
+    """
+    if not isinstance(node.op, tensor.Alloc):
+        return None
+    accepted_ops = [host_from_gpu, tensor.alloc]
+    for client, _ in node.outputs[0].clients:
+        # a client is either an Apply node or the string 'output'
+        if not (client != 'output' and client.op == tensor.join):
+            return None
+        # inputs[0] of the join is the axis, the rest are the tensors
+        for joined in client.inputs[1:]:
+            if not (joined.owner and joined.owner.op in accepted_ops):
+                return None
+    return [host_from_gpu(gpu_alloc(*node.inputs))]
 @register_opt()
 @op_lifter([tensor.Alloc])
-def local_gpualloc(node):
+def local_gpuaalloc(node):
    new_out = gpu_alloc(*node.inputs)
    # We need to hide new broadcastable dimensions because
    # ReplaceValidate doesn't like when they change.
@@ -267,6 +285,26 @@ def local_gpua_specifyShape(node):
    return tensor.specify_shape
+@register_opt()
+@op_lifter([tensor.Join])
+def local_gpua_join(node):
+    """
+    Return the op that replaces a Join node: the `gpu_join` instance.
+    The replacement itself is presumably carried out by the `op_lifter`
+    decorator, which is defined outside this hunk.
+    """
+    return gpu_join
+@register_opt()
+@local_optimizer([GpuJoin])
+def local_gpuajoin_1(node):
+    """
+    GpuJoin(axis, x) -> x
+    A join of a single element is replaced by that element.
+    """
+    # the inputs are the axis followed by the tensors, so exactly two
+    # inputs means a single tensor is being joined
+    single_element_join = (isinstance(node.op, GpuJoin) and
+                           len(node.inputs) == 2)
+    if not single_element_join:
+        return None
+    return [node.inputs[1]]
+@register_opt()
+@op_lifter([tensor.Split])
+def local_gpua_split(node):
+    """
+    Return the op that replaces a Split node: a GpuSplit built with the
+    same `len_splits` as the original op.
+    """
+    # GpuSplit is not among the names this module imports from basic_ops,
+    # so import it here; otherwise this optimizer fails with a NameError
+    # as soon as it is applied to a Split node.
+    from theano.sandbox.gpuarray.basic_ops import GpuSplit
+    return GpuSplit(node.op.len_splits)
 @register_opt()
 @op_lifter([tensor.Subtensor])
 def local_gpua_subtensor(node):

--- a/theano/sandbox/gpuarray/tests/test_basic_ops.py
+++ b/theano/sandbox/gpuarray/tests/test_basic_ops.py
@@ -7,7 +7,9 @@ import theano
 import theano.tensor as T
 from theano.tensor import TensorType
 from theano.tensor.basic import alloc
-from theano.tensor.tests.test_basic import rand, safe_make_node, T_reshape
+from theano.tensor.tests.test_basic import (
+    rand, safe_make_node, T_reshape, T_Join_and_Split
+    )
 from theano.tests.unittest_tools import SkipTest
 from numpy.testing.noseclasses import KnownFailureTest
@@ -16,6 +18,8 @@ import theano.sandbox.gpuarray
 if theano.sandbox.gpuarray.pygpu is None:
    raise SkipTest("pygpu not installed")
+# If you are writing a new test file, don't copy this code, but rather
+# import stuff from this file (like mode_with_gpu) to reuse it.
 import theano.sandbox.cuda as cuda_ndarray
 if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
    if not cuda_ndarray.use.device_number:
@@ -38,7 +42,7 @@ from theano.sandbox.gpuarray.basic_ops import (
    gpu_from_cuda,
    cuda_from_gpu, HostFromGpu,
    GpuFromHost, GpuReshape,
-    GpuEye)
+    gpu_join, GpuJoin, GpuSplit, GpuEye)
 from theano.tests import unittest_tools as utt
 utt.seed_rng()
@@ -339,6 +343,46 @@ class G_reshape(T_reshape):
        assert self.op == GpuReshape
+class G_Join_and_Split(T_Join_and_Split):
+    """
+    T_Join_and_Split configured with the gpuarray join/split ops, the gpu
+    mode, float32 data and the gpuarray shared constructor.
+    """
+    def setUp(self):
+        super(G_Join_and_Split, self).setUp()
+        self.join_op = GpuJoin
+        self.split_op = GpuSplit
+        # Use join instead of MakeVector since there is no MakeVector on GPU
+        self.make_vector_op = GpuJoin
+        self.shared = gpuarray_shared_constructor
+        # this is to avoid errors with limited devices
+        self.floatX = 'float32'
+        self.mode = mode_with_gpu.excluding('constant_folding')
+        debug_mode_names = ['DebugMode', 'DEBUG_MODE']
+        self.hide_error = theano.config.mode not in debug_mode_names
+def test_gpujoin_gpualloc():
+    """
+    Check that join(zeros_like(a), ones_like(b)) compiles to two allocs and
+    one join on the CPU, and to two GpuAlloc and one gpu_join with the gpu
+    mode, and that CPU and gpu results agree.
+    """
+    def count_nodes(fn, matches):
+        # number of nodes of the compiled graph accepted by `matches`
+        return sum([matches(n) for n in fn.maker.fgraph.toposort()])
+    a = T.fmatrix('a')
+    b = T.fmatrix('b')
+    a_val = numpy.asarray(numpy.random.rand(4, 5), dtype='float32')
+    b_val = numpy.asarray(numpy.random.rand(3, 5), dtype='float32')
+    def joined():
+        # a fresh graph is built for each compiled function
+        return T.join(0, T.zeros_like(a), T.ones_like(b))
+    f = theano.function([a, b], joined() + 4, mode=mode_without_gpu)
+    f_gpu = theano.function([a, b], joined(), mode=mode_with_gpu)
+    f_gpu2 = theano.function([a, b], joined() + 4, mode=mode_with_gpu)
+    assert count_nodes(f, lambda n: n.op == T.alloc) == 2
+    assert count_nodes(f, lambda n: n.op == T.join) == 1
+    for gpu_fn in (f_gpu, f_gpu2):
+        assert count_nodes(gpu_fn, lambda n: isinstance(n.op, GpuAlloc)) == 2
+        assert count_nodes(gpu_fn, lambda n: n.op == gpu_join) == 1
+    assert numpy.allclose(f(a_val, b_val), f_gpu2(a_val, b_val))
 def test_gpueye():
    def check(dtype, N, M_=None):
        # Theano does not accept None as a tensor.

--- a/theano/tensor/tests/test_basic.py
+++ b/theano/tensor/tests/test_basic.py
@@ -3448,11 +3448,11 @@ class T_Join_and_Split(unittest.TestCase):
                        [a_val, b_val, c_val, d_val, e_val], rng=rng)
        # Should raise an error if length of dimension 0 is not 1
        bad_val = rng.rand(2, 1, 1, 1, 2, 1).astype(self.floatX)
-        self.assertRaises(TypeError, g, bad_val, b_val, c_val, d_val, e_val)
+        self.assertRaises(TypeError, f, bad_val, b_val, c_val, d_val, e_val)
-        self.assertRaises(TypeError, g, a_val, bad_val, c_val, d_val, e_val)
+        self.assertRaises(TypeError, f, a_val, bad_val, c_val, d_val, e_val)
-        self.assertRaises(TypeError, g, a_val, b_val, bad_val, d_val, e_val)
+        self.assertRaises(TypeError, f, a_val, b_val, bad_val, d_val, e_val)
-        self.assertRaises(TypeError, g, a_val, b_val, c_val, bad_val, e_val)
+        self.assertRaises(TypeError, f, a_val, b_val, c_val, bad_val, e_val)
-        self.assertRaises(TypeError, g, a_val, b_val, c_val, d_val, bad_val)
+        self.assertRaises(TypeError, f, a_val, b_val, c_val, d_val, bad_val)
        # Should raise an error if any dimension other than 4 has length != 1
        bad_a_val = rng.rand(1, 2, 1, 1, 2, 1).astype(self.floatX)
        bad_b_val = rng.rand(1, 1, 1, 1, 2, 2).astype(self.floatX)