Merge pull request #5321 from ReyhaneAskari/fix_5253

Unstable commit: fixed perform on GpuJoin

Merge pull request #5321 from ReyhaneAskari/fix_5253
2fa3cecb · Frédéric Bastien · GitHub · 09318e0d · 28c3249d · 2fa3cecb
--- a/theano/gpuarray/basic_ops.py
+++ b/theano/gpuarray/basic_ops.py
@@ -1245,8 +1245,19 @@ class GpuJoin(HideC, Join):
    """
    _f16_ok = True
+    __props__ = ("view",)
    params_type = gpu_context_type
+    def __init__(self, view=-1):
+        self.view = view
+        if view != -1:
+            # since the first input is always the axis, the tensors
+            # start from index 1.
+            self.view_map = {0: [1 + view]}
+    def __str__(self):
+        return Join.__str__(self)
    def make_node(self, axis, *tensors):
        node = Join.make_node(self, axis, *tensors)
@@ -1265,42 +1276,64 @@ class GpuJoin(HideC, Join):
    def perform(self, node, axis_and_tensors, out_, ctx):
        out, = out_
+        view = self.view
        axis = int(axis_and_tensors[0])
+        tensors = axis_and_tensors[1:]
        if axis < -axis_and_tensors[1].ndim:
            raise IndexError
        if axis < 0:
            axis += axis_and_tensors[1].ndim
-        tensors = axis_and_tensors[1:]
+        # Check that every tensor except the view one is empty along axis.
+        if (view != -1) and numpy.all(
+                [tensor.shape[axis] == 0 for tensor in
+                 tensors[0:view] + tensors[view + 1:]]):
+            out[0] = tensors[view]
+        else:
            out[0] = pygpu.concatenate(tensors, axis=axis, context=ctx).astype(
                node.outputs[0].dtype)
    def c_code_cache_version(self):
-        return (2,)
+        return (3,)
    def c_support_code(self):
        return """
-#if PY_MAJOR_VERSION >= 3
+        #if PY_MAJOR_VERSION >= 3
-#define PyInt_AsLong PyLong_AsLong
+        #define PyInt_AsLong PyLong_AsLong
-#endif
+        #endif
-"""
+        """
+    def c_headers(self):
+        return ['<numpy_compat.h>']
    def c_code(self, node, name, inputs, out_, sub):
+        axis, tensors = inputs[0], inputs[1:]
        copy_to_list = []
        restype = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
-        for i, inp in enumerate(inputs[1:]):
+        view = self.view
+        non_empty_tensor = tensors[view]
+        for i, inp in enumerate(tensors):
            copy_to_list.append("als[%s] = &%s->ga;" % (i, inp))
-        return """
-const GpuArray **als = (const GpuArray **)PyMem_Malloc(sizeof(GpuArray *) *
+        n = len(tensors)
+        fail = sub['fail']
+        out = out_[0]
+        copy_inputs_to_list = '\n'.join(copy_to_list)
+        restype = restype
+        ctx = sub['params']
+        code = """
+        const GpuArray **als = (const GpuArray **)PyMem_Malloc(sizeof(GpuArray *) *
                                                       %(n)s);
-if (als == NULL) {
+        if (als == NULL) {
            PyErr_NoMemory();
            %(fail)s
-}
+        }
-%(copy_inputs_to_list)s
+        %(copy_inputs_to_list)s
-Py_XDECREF(%(out)s);
+        Py_XDECREF(%(out)s);
-{
+        {
-int axis = PyInt_AsLong((PyObject *)%(axis)s);
+            int axis = PyInt_AsLong((PyObject *)%(axis)s);
-if (axis < 0) {
+            if (axis < 0) {
                if (axis == -1 && PyErr_Occurred()) {
                    %(fail)s
                }
@@ -1309,17 +1342,32 @@ if (axis < 0) {
                    PyErr_SetString(PyExc_IndexError, "invalid axis");
                    %(fail)s
                }
-}
+            }
-%(out)s = pygpu_concatenate(als, %(n)s, axis,
+            int tensors_lens_sum;
+            if(%(view)s != -1) {
+                tensors_lens_sum = 0;
+                for(int i=0; i < %(n)s; i++){
+                    tensors_lens_sum += als[i]->dimensions[axis];
+                }
+                tensors_lens_sum -= PyGpuArray_DIM(%(non_empty_tensor)s, axis);
+            }
+            if(%(view)s != -1 && tensors_lens_sum == 0) {
+                Py_INCREF(%(non_empty_tensor)s);
+                %(out)s = %(non_empty_tensor)s;
+            }else{
+                %(out)s = pygpu_concatenate(als, %(n)s, axis,
                                            %(restype)s, (PyObject *)&PyGpuArrayType,
                                            %(ctx)s);
-}
+            }
-PyMem_Free(als);
+        }
-if (%(out)s == NULL)
+        PyMem_Free(als);
+        if (%(out)s == NULL)
            %(fail)s
-        """ % dict(n=len(inputs[1:]), fail=sub['fail'], out=out_[0],
-                   axis=inputs[0], copy_inputs_to_list='\n'.join(copy_to_list),
+        """ % locals()
-                   restype=restype, ctx=sub['params'])
+        return code
 gpu_join = GpuJoin()

--- a/theano/gpuarray/tests/test_basic_ops.py
+++ b/theano/gpuarray/tests/test_basic_ops.py
@@ -453,3 +453,24 @@ def test_hostfromgpu_shape_i():
    assert isinstance(topo[1].op, theano.compile.Shape_i)
    assert isinstance(topo[2].op, theano.tensor.opt.MakeVector)
    assert tuple(f(cv)) == (5, 4)
+def test_Gpujoin_inplace():
+    """Test that GpuJoin works inplace.
+    This function tests the case when several tensors are passed to the
+    GpuJoin op but all except one of them are empty. In this case
+    GpuJoin should work inplace and the output should be a view of the
+    non-empty tensor.
+    """
+    s = T.lscalar()
+    data = numpy.array([3, 4, 5], dtype=theano.config.floatX)
+    x = gpuarray_shared_constructor(data, borrow=True)
+    z = T.zeros((s,))
+    join = GpuJoin(view=0)
+    c = join(0, x, z)
+    f = theano.function([s], theano.Out(c, borrow=True))
+    assert x.get_value(borrow=True, return_internal_type=True) is f(0)
+    assert numpy.allclose(f(0), [3, 4, 5])
--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -3896,9 +3896,12 @@ class Join(Op):
    def __str__(self):
        if self.view == -1:
-            return "Join"
+            return self.__class__.__name__
        else:
-            return super(Join, self).__str__()
+            return "%s{%s}" % (
+                self.__class__.__name__,
+                ", ".join("%s=%r" % (p, getattr(self, p))
+                          for p in self.__props__))
    def __setstate__(self, d):
        self.__dict__.update(d)
@@ -4044,28 +4047,37 @@ class Join(Op):
        out, = outputs
        fail = sub['fail']
        adtype = node.inputs[0].type.dtype_specs()[1]
+        copy_to_list = []
+        for i, inp in enumerate(tensors):
+            copy_to_list.append(
+                """Py_INCREF(%s);
+                   PyList_SetItem(list, %s, (PyObject*)%s);"""
+                % (inp, i, inp))
+        copy_inputs_to_list = '\n'.join(copy_to_list)
+        n = len(tensors)
+        khar = "printf(\"tensors_lens_sum: %d\", tensors_lens_sum);"
        code = """
        int axis = ((%(adtype)s *)PyArray_DATA(%(axis)s))[0];
-        int tensors_lens_sum = 0""" % locals()
+        PyObject* list = PyList_New(%(l)s);
-        for i, inp in enumerate(tensors):
+        %(copy_inputs_to_list)s
-            code += """ + PyArray_DIM(%(inp)s, axis) """ % locals()
+        int tensors_lens_sum;
-        code += """;\n
+        if(%(view)s != -1) {
-        tensors_lens_sum -= PyArray_DIM(%(non_empty_tensor)s, axis);
+            tensors_lens_sum = 0;
-        if(%(view)s != -1 && tensors_lens_sum == 0){
+            for(int i=0; i < %(n)s; i++){
+                tensors_lens_sum += PyArray_DIM((PyArrayObject *)(PyList_GetItem(list, i)), axis);
+            }
+            %(khar)s
+            tensors_lens_sum -= PyArray_DIM(%(non_empty_tensor)s, axis);
+        }
+        if(%(view)s != -1 && tensors_lens_sum == 0) {
            Py_XDECREF(%(out)s);
            Py_INCREF(%(non_empty_tensor)s);
            %(out)s = %(non_empty_tensor)s;
-        }
+        }else{
-        else{
-            PyObject* list = PyList_New(%(l)s);
-        """ % locals()
-        for i, inp in enumerate(tensors):
-            code += """
-            Py_INCREF(%(inp)s);
-            PyList_SetItem(list, %(i)s, (PyObject*)%(inp)s);
-            """ % locals()
-        code += """
            //PyObject* PyArray_Concatenate(PyObject* obj, int axis)
            int ndim = PyArray_NDIM(%(input_1)s);
            if( axis < -ndim ){