提交 346f651f authored 作者: nouiz's avatar nouiz

Merge pull request #1309 from viveksck/RebasedBranch

REBASED: C code for GpuAdvancedIncSubtensor1. Handles one case to make it faster
...@@ -2442,6 +2442,100 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp): ...@@ -2442,6 +2442,100 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
x[i] += y x[i] += y
out[0] = x out[0] = x
def c_code_cache_version(self):
    """Return the cache version of the C code generated by :meth:`c_code`.

    Theano keys its compiled-module cache on this tuple; it MUST be bumped
    whenever the C text emitted by ``c_code`` changes, otherwise stale
    compiled modules are silently reused.
    """
    # (2,): NULL checks on the output copy and on the CudaNdarray_View
    # results were added/reordered in the generated C.
    return (2,)
def c_code(self, node, name, inputs, outputs, sub):
    """Return C code computing ``out = x; out[ind] += y`` row by row.

    Only the increment case with matching ``x``/``y`` ndim is implemented;
    ``set_instead_of_inc`` and broadcasted-``y`` cases raise
    NotImplementedError so Theano falls back to the Python implementation.

    :param inputs: C variable names ``(x, y, ind)``.
    :param outputs: C variable name ``(out,)``.
    :param sub: dict providing the ``fail`` snippet.
    :raises NotImplementedError: for the unsupported cases above.
    """
    if (self.set_instead_of_inc) or \
            (node.inputs[0].ndim != node.inputs[1].ndim):
        raise NotImplementedError("This case does not have C code yet.")
    x = inputs[0]
    y = inputs[1]
    ind = inputs[2]
    out = outputs[0]
    fail = sub['fail']
    inplace = int(self.inplace)
    # NOTE(review): any edit to the C text below must be paired with a bump
    # of c_code_cache_version(), or stale compiled modules will be reused.
    return """
    PyObject *x_obj, *y_obj, *row_x, *row_y;
    PyObject *x_rowind_obj, *y_rowind_obj;
    dtype_%(ind)s *p_index;
    int num_indices, j;
    int ret;
    num_indices = PyArray_SIZE(%(ind)s);
    if ((num_indices - 1) > LONG_MAX) {
        PyErr_Format(PyExc_AssertionError, "num_indices %%d exceeds LONG_MAX + 1", num_indices);
        %(fail)s;
    }
    Py_XDECREF(%(out)s);
    if (!%(inplace)s) {
        %(out)s = (CudaNdarray*)CudaNdarray_Copy(%(x)s);
    } else {
        %(out)s = %(x)s;
        Py_XINCREF(%(out)s);
    }
    // BUG FIX: test for a failed copy *before* dereferencing %(out)s.
    // The original code only checked it at the very end, after it had
    // already been passed to CudaNdarray_View below.
    if (!%(out)s) {
        %(fail)s;
    }
    x_obj = (PyObject*)CudaNdarray_View(%(out)s);
    y_obj = (PyObject*)CudaNdarray_View(%(y)s);
    // BUG FIX: the views can fail; do not enter the loop holding NULLs.
    if ((x_obj == NULL) || (y_obj == NULL)) {
        Py_XDECREF(y_obj);
        Py_XDECREF(x_obj);
        %(fail)s;
    }
    for (j = 0;j < num_indices; j++) {
        p_index = (dtype_%(ind)s *)PyArray_GETPTR1(%(ind)s, j);
        x_rowind_obj = PyInt_FromLong(*p_index);
        if (PyInt_AsLong(x_rowind_obj) != (*p_index)) {
            PyErr_Format(PyExc_AssertionError, "Error in converting row index to integer from long");
            // Dec Ref what ever we have increfed or allocated so far
            // We deallocate objects exactly in the reverse order they were allocated.
            Py_XDECREF(x_rowind_obj);
            Py_XDECREF(y_obj);
            Py_XDECREF(x_obj);
            %(fail)s;
        }
        y_rowind_obj = PyInt_FromLong(j);
        row_x = CudaNdarray_Subscript(x_obj, x_rowind_obj);
        row_y = CudaNdarray_Subscript(y_obj, y_rowind_obj);
        if ((row_x == NULL) || (row_y == NULL)) {
            Py_XDECREF(row_y);
            Py_XDECREF(row_x);
            Py_XDECREF(y_rowind_obj);
            Py_XDECREF(x_rowind_obj);
            Py_XDECREF(y_obj);
            Py_XDECREF(x_obj);
            %(fail)s;
        }
        ret = CudaNdarray_inplace_elemwise(row_x, row_y, IADD);
        if (ret != 0) {
            Py_XDECREF(row_y);
            Py_XDECREF(row_x);
            Py_XDECREF(y_rowind_obj);
            Py_XDECREF(x_rowind_obj);
            Py_XDECREF(y_obj);
            Py_XDECREF(x_obj);
            %(fail)s;
        }
        Py_XDECREF(row_y);
        Py_XDECREF(row_x);
        Py_XDECREF(y_rowind_obj);
        Py_XDECREF(x_rowind_obj);
    }
    Py_XDECREF(y_obj);
    Py_XDECREF(x_obj);
    """ % locals()
class GpuIncSubtensor(tensor.IncSubtensor, GpuOp): class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
""" """
......
...@@ -746,15 +746,6 @@ PyObject * CudaNdarray_View(const CudaNdarray * self) ...@@ -746,15 +746,6 @@ PyObject * CudaNdarray_View(const CudaNdarray * self)
return (PyObject*)rval; return (PyObject*)rval;
} }
// Selector for the element-wise in-place helpers (e.g.
// CudaNdarray_inplace_elemwise takes one of these as its last argument).
enum operator_t
{
IADD=0, // in-place addition: lhs += rhs
IDIV, // in-place division: lhs /= rhs
CPY, // copy: lhs = rhs
N_ELEMWISE_OPS // This is to know the number of operation
};
/* /*
* d0,... are the output dims * d0,... are the output dims
* indices are a list of index to operate on * indices are a list of index to operate on
...@@ -1889,7 +1880,7 @@ CudaNdarray_len(PyObject * py_self) ...@@ -1889,7 +1880,7 @@ CudaNdarray_len(PyObject * py_self)
} }
// Will be called by __getitem__ in Python
static PyObject * PyObject *
CudaNdarray_Subscript(PyObject * py_self, PyObject * key) CudaNdarray_Subscript(PyObject * py_self, PyObject * key)
{ {
int verbose = 0; int verbose = 0;
......
...@@ -95,6 +95,15 @@ struct CudaNdarray ...@@ -95,6 +95,15 @@ struct CudaNdarray
real* devdata; //pointer to data element [0,..,0]. real* devdata; //pointer to data element [0,..,0].
}; };
// Selector for the element-wise in-place helpers exported by this header
// (passed as the fct_nb argument of CudaNdarray_inplace_elemwise).
enum operator_t
{
IADD=0, // in-place addition: lhs += rhs
IDIV, // in-place division: lhs /= rhs
CPY, // copy: lhs = rhs
N_ELEMWISE_OPS // This is to know the number of operation
};
/* /*
* Return a CudaNdarray whose 'nd' dimensions are all 0. * Return a CudaNdarray whose 'nd' dimensions are all 0.
* if nd==-1, it is not initialized. * if nd==-1, it is not initialized.
...@@ -479,7 +488,8 @@ int fprint_CudaNdarray(FILE * fd, const CudaNdarray *self); ...@@ -479,7 +488,8 @@ int fprint_CudaNdarray(FILE * fd, const CudaNdarray *self);
DllExport PyObject * CudaNdarray_View(const CudaNdarray * self); DllExport PyObject * CudaNdarray_View(const CudaNdarray * self);
DllExport PyObject * CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other); DllExport PyObject * CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other);
DllExport PyObject * CudaNdarray_Subscript(PyObject * py_self, PyObject * key);
DllExport int CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t fct_nb);
// Ensures that *arr is a pointer to a contiguous ndarray of the specified // Ensures that *arr is a pointer to a contiguous ndarray of the specified
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论