提交 1392f523 authored 作者: Frederic's avatar Frederic

Make GpuSubtensor and GpuIncSubtensor use the new NumPy C API.

This simplifies the *IncSubtensor code at the same time.
上级 07068846
......@@ -2223,12 +2223,6 @@ class GpuReshape(tensor.Reshape, GpuOp):
out[0] = x.reshape(tuple(shp))
# C Code shared by GpuSubtensor and GpuIncSubtensor
_define_set_data = """
#define CudaNdarray_set_device_data2(obj, ptr, base) \
CudaNdarray_set_device_data(obj, (float *)ptr, base)
"""
class GpuSubtensor(GpuOp, tensor.Subtensor):
"""
Implement subtensor on the gpu.
......@@ -2276,16 +2270,27 @@ class GpuSubtensor(GpuOp, tensor.Subtensor):
view_ndim = node.outputs[0].ndim
fail = sub['fail']
decl = "CudaNdarray* xview = NULL;"
get_xview = self.helper_c_code(node, name, inputs, outputs, sub,
self.idx_list,
view_ndim=view_ndim,
c_prefix='CudaNdarray',
strides_mul=4,
)
build_view = """
//TODO: give this Op a second output so that this view can be cached
//TODO: alternatively, fix the memory leak on failure
CudaNdarray* xview = (CudaNdarray*) CudaNdarray_New(%(view_ndim)s);
xview = (CudaNdarray*) CudaNdarray_New(%(view_ndim)s);
if (!xview)
{
%(fail)s;
}
if (CudaNdarray_set_device_data(xview, CudaNdarray_DEV_DATA(%(x)s),
(PyObject*) NULL))
if (CudaNdarray_set_device_data(
xview,
CudaNdarray_DEV_DATA(%(x)s) + xview_offset/4,
(PyObject*) %(x)s))
{
PyErr_Format(PyExc_RuntimeError,
"GpuSubtensor is not able to set the"
......@@ -2294,43 +2299,24 @@ class GpuSubtensor(GpuOp, tensor.Subtensor):
%(fail)s;
}
cnda_mark_dev_structure_dirty(xview);
""" % locals()
get_xview = _define_set_data + \
self.helper_c_code(node, name, inputs, outputs, sub,
self.idx_list,
c_prefix='CudaNdarray',
set_data='CudaNdarray_set_device_data2',
set_dim='CudaNdarray_set_dim',
set_stride='CudaNdarray_set_stride',
update_flags="", strides_mul=4)
finish_view = ""
#For broadcasted dimensions, set the strides to 0
#We can't do that only for broadcasted dimensions as this can happen for dimensions of size 0,
#That are rebroadcated later.
for idx in range(node.outputs[0].ndim):
finish_view += """
if(CudaNdarray_HOST_DIMS(xview)[%(idx)s]==1)
CudaNdarray_set_stride(xview, %(idx)s, 0);
""" % locals()
finish_view += """
//Set the base only now
if(CudaNdarray_set_device_data(xview, CudaNdarray_DEV_DATA(xview),
%(x)s)){
PyErr_Format(PyExc_RuntimeError,
"GpuSubtensor is not able to set"
" the base of the view array");
Py_XDECREF(xview);
%(fail)s;
for(int idx=0;idx <%(view_ndim)s; idx++){
//For broadcasted dimensions, set the strides to 0
//We can't do that only for broadcasted dimensions as this can happen
//for dimensions of size 0. That are rebroadcated later.
if(xview_dims[idx]==1)
CudaNdarray_set_stride(xview, idx, 0);
else
CudaNdarray_set_stride(xview, idx, xview_strides[idx]);
CudaNdarray_set_dim(xview, idx, xview_dims[idx]);
}
""" % locals()
finish_view = """
Py_XDECREF(%(z)s);
%(z)s = xview;
""" % locals()
return build_view + "{" + get_xview + "}" + finish_view
return decl + get_xview + build_view + finish_view
def c_code_cache_version(self):
hv = self.helper_c_code_cache_version()
......@@ -2719,6 +2705,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
""" %locals()
class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
"""
Implement IncSubtensor on the gpu.
......@@ -2756,6 +2743,9 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
"""
return """(CudaNdarray*) CudaNdarray_Copy(%(x)s)""" % locals()
def decl_view(self):
return "CudaNdarray* zview = NULL;"
def make_view_array(self, x, view_ndim):
"""
:param x: a string identifying an array to be viewed
......@@ -2765,17 +2755,32 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
This doesn't need to actually set up the view with the
right indexing; we'll do that manually later.
"""
return """CudaNdarray* zview = (CudaNdarray*)
CudaNdarray_New(%(view_ndim)s)""" % locals()
ret = """zview = (CudaNdarray*) CudaNdarray_New(%(view_ndim)s);
if (CudaNdarray_set_device_data(
zview,
CudaNdarray_DEV_DATA(%(x)s) + xview_offset/4,
(PyObject*) %(x)s))
{
zview = NULL;
PyErr_Format(PyExc_RuntimeError,
"GpuSubtensor is not able to set the"
" devdata field of the view");
}else{
cnda_mark_dev_structure_dirty(zview);
for(int idx=0;idx <%(view_ndim)s; idx++){
if(xview_dims[idx]==1)
CudaNdarray_set_stride(zview, idx, 0);
else
CudaNdarray_set_stride(zview, idx, xview_strides[idx]);
CudaNdarray_set_dim(zview, idx, xview_dims[idx]);
}
}
""" % locals()
return ret
def get_helper_c_code_args(self):
""" Return a dictionary of arguments to use with helper_c_code"""
return { 'update_flags' : "",
'c_prefix' : 'CudaNdarray',
'set_data' :'CudaNdarray_set_device_data2',
'set_dim' : 'CudaNdarray_set_dim',
'set_stride' : 'CudaNdarray_set_stride',
'update_flags' : "",
return {'c_prefix': 'CudaNdarray',
'strides_mul': 4
}
......@@ -2789,24 +2794,6 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
"""
return """CudaNdarray_CopyFromCudaNdarray(%(view)s, %(source)s)""" % locals()
def define_set_data(self):
return _define_set_data
def link_view_array(self, x, fail):
return """
if (CudaNdarray_set_device_data(zview, CudaNdarray_DEV_DATA(%(x)s),
(PyObject*) NULL))
{
PyErr_Format(PyExc_RuntimeError,
"GpuSubtensor is not able to set the"
" devdata field of the view");
Py_XDECREF(zview);
%(fail)s;
}
cnda_mark_dev_structure_dirty(zview);
""" % locals()
def set_view_base(self, x, fail):
return """
//Set the base only now
......@@ -2823,9 +2810,8 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
def add_to_zview(self, x, fail):
return """
PyObject * add_result = CudaNdarray_inplace_add((PyObject *) zview,
(PyObject *) py_%(x)s);
PyObject * add_result = CudaNdarray_inplace_add((PyObject *) zview,
(PyObject *) py_%(x)s);
if (! add_result )
{
......@@ -2839,7 +2825,6 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
""" % locals()
def c_code_cache_version(self):
parent_version = super(GpuIncSubtensor, self).c_code_cache_version()
if parent_version:
return parent_version + (0,)
......
......@@ -1098,6 +1098,9 @@ class IncSubtensor(Op):
(x, y) + inputs,
[x.type()])
def decl_view(self):
return "PyArrayObject * zview = NULL;"
def perform(self, node, inputs, out_):
out, = out_
x, y = inputs[:2]
......@@ -1171,7 +1174,6 @@ class IncSubtensor(Op):
numpy.sum([not isinstance(idx, slice)
for idx in self.idx_list]))
decl = "PyArrayObject * zview = NULL;"
copy_of_x = self.copy_of_x(x)
copy_input_if_necessary = """
......@@ -1186,15 +1188,11 @@ class IncSubtensor(Op):
}
else
{
if (%(z)s) Py_DECREF(%(z)s);
Py_XDECREF(%(z)s);
%(z)s = %(copy_of_x)s;
}
""" % locals()
alloc_zview = self.make_view_array(z, view_ndim)
# On GPU, it takes two steps to make a view
link_zview = self.link_view_array(z, fail)
# get info needed to make zview: a view of %(z)s
helper_args = self.get_helper_c_code_args()
......@@ -1210,6 +1208,8 @@ class IncSubtensor(Op):
)
#Make a view on the output, as we will write into it.
alloc_zview = self.make_view_array(z, view_ndim)
build_view = """
//TODO: give this Op a second output so that this view can be cached
//TODO: alternatively, fix the memory leak on failure
......@@ -1218,7 +1218,6 @@ class IncSubtensor(Op):
{
%(fail)s;
}
%(link_zview)s;
""" % locals()
copy_into = self.copy_into("zview", y)
......@@ -1239,8 +1238,7 @@ class IncSubtensor(Op):
%(add_to_zview)s
}
""" % locals()
return (decl +
return (self.decl_view() +
copy_input_if_necessary +
get_zview +
build_view +
......@@ -1322,19 +1320,6 @@ class IncSubtensor(Op):
"""
return """PyArray_CopyInto(%(view)s, %(source)s)""" % locals()
def link_view_array(self, x, fail):
""" Returns code to complete making zview a view of x"""
# On CPU there is nothing to do, make_view_array already did this
return ""
def set_view_base(self, x, fail):
""" Returns code to make zview be a correct view of x,
after helper_c_code is done messing with x"""
# On CPU there is nothing to do
return ""
def add_to_zview(self, x, fail):
""" Return C code to add x to zview. Should DECREF zview if the
add fails."""
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论