GpuJoin.c_code now also works for vector, tensor3 and tensor4 inputs and for more than 2 inputs; also corrects memory bugs

e4242a33 · Ludwig Schmidt-Hackenberg · 403ab069 · e4242a33
--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -2942,181 +2942,116 @@ class GpuJoin(tensor.Join, GpuOp):
        out[0] = rval
    def c_code(self, node, name, inputs, out_, sub):
-        if node.inputs[0].data not in [0, 1]:
-            raise NotImplementedError()
-            # only works for the first two axis
-        if len(inputs) != 3:
-            # only works for two arrays
-            raise NotImplementedError()
-        if any([i.ndim != 2 for i in node.inputs[1:]]):
-            # only works for type T.matrix
-            raise NotImplementedError()
        axis = inputs[0]
+        n_cndas = len(inputs[1:])
        input_1 = inputs[1]
-        input_2 = inputs[2]
        axis = inputs[0]
        fail = sub['fail']
        out = out_[0]
+        # getting the shapes of the first input tensor (inputs[1]) and of out
        str = """
        int axis = PyInt_AsLong((PyObject*)%(axis)s);        
        int nd = CudaNdarray_NDIM(%(input_1)s);
+        int shape_%(input_1)s[nd];
+        int shape_out[nd];
-        int dims_array1[nd];
+        for(int i = 0; i<nd; i+=1)
-        int errorcode;
+        {
-        for(int i = 0; i<nd; i+=1){
+            shape_%(input_1)s[i] = CudaNdarray_HOST_DIMS(%(input_1)s)[i];
-            dims_array1[i] = CudaNdarray_HOST_DIMS(%(input_1)s)[i];
+            shape_out[i] = shape_%(input_1)s[i];
-        }
-        nd = CudaNdarray_NDIM(%(input_2)s);
-        int dims_array2[nd];
-        for(int i = 0; i<nd; i+=1){
-            dims_array2[i] = CudaNdarray_HOST_DIMS(%(input_2)s)[i];
        }
+        """ % locals()        
-        int dims_out[nd];
+        # getting the shapes of the remaining input tensors (inputs[2:])
-        if(axis==0)
+        # + check: all input tensors have the same shape as the final out
+        # except for the "axis" dimension
+        for i, cdna in enumerate(inputs[2:]):
+            str += """
+        nd = CudaNdarray_NDIM(%(cdna)s);
+        int shape_%(cdna)s[nd];
+        for(int i = 0; i<nd; i+=1)
        {
-            dims_out[0] = dims_array1[0] + dims_array2[0];
+            shape_%(cdna)s[i] = CudaNdarray_HOST_DIMS(%(cdna)s)[i];
-            dims_out[1] = dims_array1[1];
+            if((i!=axis) && (shape_%(cdna)s[i]!=shape_out[i]))
-        }
-        if(axis==1)
            {
-            dims_out[0] = dims_array1[0];
+                //(fail)s; //deactivated, because this causes segfault
-            dims_out[1] = dims_array1[1] + dims_array2[1];
            }
-        if (CudaNdarray_prep_output(& %(out)s, 2, dims_out))
-        {
-            %(fail)s;
        }
+            """ % locals()
-        PyObject *slice;
+        # computing the new shape for the out tensors             
-        PyObject *out_sub;
+        str += """
-        PyObject *start, *stop, *step;
+        int width_sum = 0;\n""" % locals()
-        step = NULL;
-        if(axis==0)
-        {
-            start = PyInt_FromLong(0);
-            stop = PyInt_FromLong(dims_array1[0]);
-            slice = PySlice_New(start, stop, step);
-            out_sub = CudaNdarray_Subscript((PyObject*)%(out)s, slice);
-            errorcode = CudaNdarray_CopyFromCudaNdarray((CudaNdarray*)out_sub, %(input_1)s);
-            if((slice == NULL) || (out_sub == NULL) || (errorcode != 0))
-            {
-                Py_XDECREF(slice);
-                Py_XDECREF(out_sub);
-                Py_XDECREF(start);
-                Py_XDECREF(stop);
-                Py_XDECREF(step);
-                Py_XDECREF(%(out)s);
-                %(fail)s;
-            }
-            Py_XDECREF(start);
-            Py_XDECREF(slice);
-            Py_XDECREF(out_sub);
-            start = stop;
+        for i, cdna in enumerate(inputs[1:]):
-            stop = PyInt_FromLong(PyInt_AsLong(start) + dims_array2[0]);
+            str += "\t\twidth_sum += CudaNdarray_HOST_DIMS(%(cdna)s)[axis];\n" % locals()
-            slice = PySlice_New(start, stop, step);
+        str += "\t\tshape_out[axis] = width_sum;\n"
-            out_sub = CudaNdarray_Subscript((PyObject*)%(out)s, slice);
-            errorcode = CudaNdarray_CopyFromCudaNdarray((CudaNdarray*)out_sub, %(input_2)s);
+        str += """
-            if((slice == NULL) || (out_sub == NULL) || (errorcode != 0))
+        if (CudaNdarray_prep_output(&%(out)s, nd, shape_out))
        {
-                Py_XDECREF(slice);
-                Py_XDECREF(out_sub);
-                Py_XDECREF(start);
-                Py_XDECREF(stop);
-                Py_XDECREF(step);
-                Py_XDECREF(%(out)s);
            %(fail)s;
        }
-            Py_XDECREF(slice);
+        PyObject *out_sub;
-            Py_XDECREF(out_sub);
+        PyObject *start, *stop, *step;
-            Py_XDECREF(start);
+        step = NULL;
-            Py_XDECREF(stop);
+        int errorcode;
-            Py_XDECREF(step);
+        int sum;
-        }
+        sum =0;
-        if(axis==1)
-        {
        PyObject *slice_tuple;
        PyObject *full_slice;
        PyObject *section_slice;
-            PyObject *start_axis2, *stop_axis2;
-            start = PyInt_FromLong(0);
+        """ % locals()
-            stop = PyInt_FromLong(dims_out[0]);
-            stop_axis2 = PyInt_FromLong(dims_array1[1]);
-            slice_tuple = PyTuple_New(2);
-            full_slice = PySlice_New(start, stop, step);
-            section_slice = PySlice_New(start, stop_axis2, step);
-            PyTuple_SetItem(slice_tuple, 0, full_slice);
-            PyTuple_SetItem(slice_tuple, 1, section_slice);
-            out_sub = CudaNdarray_Subscript((PyObject*)%(out)s, slice_tuple);
+        # start copying the data into the new out tensors
-            errorcode = CudaNdarray_CopyFromCudaNdarray((CudaNdarray*)out_sub, %(input_1)s);
+        for i, cdna in enumerate(inputs[1:]):
-            if((full_slice == NULL) || (section_slice == NULL) || (out_sub == NULL) || (errorcode != 0))
+            str += """
+        sum += shape_%(cdna)s[axis];
+        stop = PyInt_FromLong(sum);
+        slice_tuple = PyTuple_New(nd);
+        full_slice = PySlice_New(NULL, NULL, NULL);
+        section_slice = PySlice_New(start, stop, step);
+        for(int i=0; i<nd; i++)
        {
-                Py_XDECREF(full_slice);
+            if(i!=axis)
-                Py_XDECREF(section_slice);
+            {
-                Py_XDECREF(slice_tuple);
+                Py_INCREF(full_slice);
-                Py_XDECREF(out_sub);
+                PyTuple_SetItem(slice_tuple, i, full_slice);
-                Py_XDECREF(start);
+            }
-                Py_XDECREF(stop);
+            else if(i==axis)
-                Py_XDECREF(step);
+            {
-                Py_XDECREF(start_axis2);
+                Py_INCREF(section_slice);  
-                Py_XDECREF(stop_axis2);
+                PyTuple_SetItem(slice_tuple, i, section_slice);
-                Py_XDECREF(%(out)s);
+            }
-                %(fail)s;
        }
-            Py_XDECREF(stop);
-            Py_XDECREF(full_slice);
-            Py_XDECREF(section_slice);
-            Py_XDECREF(out_sub);
-            Py_XDECREF(slice_tuple);            
-            start_axis2 = stop_axis2;
-            stop = PyInt_FromLong(dims_out[0]);
-            stop_axis2 = PyInt_FromLong(dims_array2[1] + dims_array1[1]);
-            slice_tuple = PyTuple_New(2);
-            full_slice = PySlice_New(start, stop, step);
-            section_slice = PySlice_New(start_axis2, stop_axis2, step);
-            PyTuple_SetItem(slice_tuple, 0, full_slice);
-            PyTuple_SetItem(slice_tuple, 1, section_slice);
        out_sub = CudaNdarray_Subscript((PyObject*)%(out)s, slice_tuple);
-            errorcode = CudaNdarray_CopyFromCudaNdarray((CudaNdarray*)out_sub, %(input_2)s);
+        errorcode = CudaNdarray_CopyFromCudaNdarray((CudaNdarray*)out_sub, %(cdna)s);
        if((full_slice == NULL) || (section_slice == NULL) || (out_sub == NULL) || (errorcode != 0))
        {
            Py_XDECREF(full_slice);
            Py_XDECREF(section_slice);
            Py_XDECREF(slice_tuple);
            Py_XDECREF(out_sub);
-                Py_XDECREF(start);
-                Py_XDECREF(stop);
-                Py_XDECREF(step);
-                Py_XDECREF(start_axis2);
-                Py_XDECREF(stop_axis2);
            Py_XDECREF(%(out)s);
            %(fail)s;
        }
        Py_XDECREF(full_slice);
        Py_XDECREF(section_slice);
-            Py_XDECREF(slice_tuple);
        Py_XDECREF(out_sub);
+        Py_XDECREF(slice_tuple);
+        start = stop;
+            """ % locals()
+            str+="""
        Py_XDECREF(start);
        Py_XDECREF(stop);
-            Py_XDECREF(step);
+        Py_XDECREF(step);"""
-            Py_XDECREF(start_axis2);
-            Py_XDECREF(stop_axis2);
-        }
-        """% locals()
        return str
 gpu_join = GpuJoin()