Commit f293a0f6 authored by Frédéric Bastien

Merge pull request #1469 from shackenberg/adding_GpuJoin.c_code

[WIP] Adding GpuJoin.c_code to cuda/basic_ops.py
@@ -2666,10 +2666,10 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
            int x_row = d_indices_arr[i];
            int y_row = i;
            atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]);
        }
    }
    return;
}

void CudaNdarray_vector_add_fast(CudaNdarray* py_self, CudaNdarray* py_other, PyArrayObject *indices_arr)
{
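For context, the kernel fragment above performs a scatter-add: each row of Y is accumulated into the row of X selected by an index vector, with atomicAdd resolving races when the same index appears more than once. A minimal NumPy sketch of the same semantics (CPU-side analogy only, not the CUDA implementation):

```python
import numpy as np

# Sketch of what the atomicAdd kernel computes:
# X[indices[i], :] += Y[i, :] for every row i, with repeated
# indices accumulating rather than overwriting.
X = np.zeros((5, 3), dtype=np.float32)
Y = np.ones((4, 3), dtype=np.float32)
indices = np.array([0, 2, 2, 4])

# np.add.at applies the addition unbuffered, so duplicate indices
# (like the two 2s here) each contribute, mirroring atomicAdd.
np.add.at(X, indices, Y)
assert (X[2] == 2).all()
```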
@@ -2945,6 +2945,135 @@ class GpuJoin(tensor.Join, GpuOp):
        out[0] = rval
    def c_code(self, node, name, inputs, out_, sub):
        nd = node.inputs[1].ndim
        if not all(i.ndim == nd for i in node.inputs[2:]):
            # all input ndarrays need to have the same number of dimensions
            raise NotImplementedError()
        axis = inputs[0]
        n_cndas = len(inputs[1:])
        input_1 = inputs[1]
        fail = sub['fail']
        out = out_[0]
        # getting the shapes of all the involved tensors (inputs[1] + out)
str = """
int axis = PyInt_AsLong((PyObject*)%(axis)s);
int nd = %(nd)s;
int shape_%(input_1)s[nd];
int shape_out[nd];
int width_sum = 0;
int errorcode;
int sum;
for(int i = 0; i<nd; i+=1)
{
shape_%(input_1)s[i] = CudaNdarray_HOST_DIMS(%(input_1)s)[i];
shape_out[i] = shape_%(input_1)s[i];
}
""" % locals()
        # getting the shapes of the remaining tensors (inputs[2:])
        # + check: all input tensors have the same shape as the final out,
        # except for the "axis" dimension
        # shape_%(cdna)s[nd] is declared beforehand, to prevent the following
        # error: jump to label __label_9 crosses initialization of
        # shape_%(cdna)s[nd]
        for i, cdna in enumerate(inputs[2:]):
            str += """
        int shape_%(cdna)s[nd];
        """ % locals()
        for i, cdna in enumerate(inputs[2:]):
            str += """
        for(int i = 0; i<nd; i+=1)
        {
            shape_%(cdna)s[i] = CudaNdarray_HOST_DIMS(%(cdna)s)[i];
            if((i!=axis) && (shape_%(cdna)s[i]!=shape_out[i]))
            {
                PyErr_Format(
                    PyExc_ValueError,
                    "GpuJoin: non-join dimension %%d of an input"
                    " does not match input 0.",
                    i);
                %(fail)s;
            }
        }
        """ % locals()
        # computing the new shape for the out tensor
        for i, cdna in enumerate(inputs[1:]):
            str += "\t\twidth_sum += CudaNdarray_HOST_DIMS(%(cdna)s)[axis];\n" % locals()
        str += "\t\tshape_out[axis] = width_sum;\n"

        # preparing the output array + init of the necessary variables
        # for the data transfer
        str += """
        if (CudaNdarray_prep_output(&%(out)s, nd, shape_out))
        {
            %(fail)s;
        }
        PyObject *slice_tuple;
        PyObject *section_slice;
        PyObject *full_slice;
        full_slice = PySlice_New(NULL, NULL, NULL);
        PyObject *out_sub;
        PyObject *start, *stop, *step;
        start = NULL;
        stop = NULL;
        step = NULL;
        sum = 0;
        """ % locals()
        # start copying the data into the new out tensor
        for i, cdna in enumerate(inputs[1:]):
            str += """
        sum += shape_%(cdna)s[axis];
        Py_XDECREF(stop);
        stop = PyInt_FromLong(sum);
        slice_tuple = PyTuple_New(nd);
        section_slice = PySlice_New(start, stop, step);
        for(int i=0; i<nd; i++)
        {
            if(i!=axis)
            {
                Py_INCREF(full_slice);
                PyTuple_SetItem(slice_tuple, i, full_slice);
            }
            else
            {
                Py_INCREF(section_slice);
                PyTuple_SetItem(slice_tuple, i, section_slice);
            }
        }
        out_sub = CudaNdarray_Subscript((PyObject*)%(out)s, slice_tuple);
        errorcode = CudaNdarray_CopyFromCudaNdarray((CudaNdarray*)out_sub, %(cdna)s);
        if((full_slice == NULL) || (section_slice == NULL) || (out_sub == NULL) || (errorcode != 0))
        {
            Py_XDECREF(start);
            Py_XDECREF(stop);
            Py_XDECREF(step);
            Py_XDECREF(slice_tuple);
            Py_XDECREF(out_sub);
            Py_XDECREF(%(out)s);
            %(fail)s;
        }
        Py_XDECREF(out_sub);
        Py_XDECREF(slice_tuple);
        Py_XDECREF(start);
        start = stop;
        """ % locals()
str+="""
Py_XDECREF(start);
Py_XDECREF(stop);
Py_XDECREF(step);"""
return str
def c_code_cache_version(self):
return (1,)
gpu_join = GpuJoin()
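To see the strategy the generated C implements, here is a rough NumPy sketch (an illustration, not the actual implementation; `join_sketch` is a hypothetical helper): allocate the output with the summed width along the join axis, then copy each input into the matching slice, which is what the PySlice_New / CudaNdarray_Subscript loop above does on the GPU.

```python
import numpy as np

def join_sketch(axis, *tensors):
    """NumPy sketch of the generated C code's slice-and-copy join."""
    nd = tensors[0].ndim
    shape_out = list(tensors[0].shape)
    # Check: every input matches the first one on all non-axis dims,
    # mirroring the shape_%(cdna)s / shape_out comparison above.
    for t in tensors[1:]:
        for i in range(nd):
            if i != axis and t.shape[i] != shape_out[i]:
                raise ValueError("wrong shape on dimension %d" % i)
    # width_sum: total extent along the join axis.
    shape_out[axis] = sum(t.shape[axis] for t in tensors)
    out = np.empty(shape_out, dtype=tensors[0].dtype)
    # Copy each input into out[..., start:stop, ...] along `axis`.
    start = 0
    for t in tensors:
        stop = start + t.shape[axis]
        index = [slice(None)] * nd
        index[axis] = slice(start, stop)
        out[tuple(index)] = t
        start = stop
    return out

a = np.ones((2, 3))
b = np.zeros((2, 2))
assert join_sketch(1, a, b).shape == (2, 5)
```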
...
@@ -3200,7 +3200,7 @@ class T_Join_and_Split(unittest.TestCase):
    def test_broadcastable_flags_many_dims_and_inputs(self):
        """
        Test that the right broadcastable flags get set for a join
        with many inputs and many input dimensions.
        """
        a = TensorType(dtype=self.floatX, broadcastable=[1, 0, 1, 0, 0, 0])()
...
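For reference, a hedged sketch of the flag rule this test exercises, assuming Theano's documented behavior for Join (the join axis is never broadcastable in the result; any other dimension is broadcastable only if it is broadcastable in every input). `expected_join_broadcastable` is a hypothetical helper, not part of the test suite:

```python
# Assumed rule (sketch, not the actual Join.make_node code):
# the join axis is never broadcastable; other dims are broadcastable
# only when all inputs are broadcastable there.
def expected_join_broadcastable(axis, *bcast_patterns):
    nd = len(bcast_patterns[0])
    return tuple(False if i == axis
                 else all(p[i] for p in bcast_patterns)
                 for i in range(nd))

# With patterns like those in the test above ([1, 0, 1, 0, 0, 0] etc.),
# joining on axis 1 keeps dims 0 and 2 broadcastable only if every
# input has them broadcastable.
assert expected_join_broadcastable(
    1,
    (True, False, True, False, False, False),
    (True, True, True, False, False, False),
) == (True, False, True, False, False, False)
```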