Commit 61ba61d5
Authored Mar 26, 2013 by Vivek Kulkarni
Check in works
Parent: 346f651f
Showing 4 changed files with 77 additions and 66 deletions (+77, −66):
- theano/sandbox/cuda/basic_ops.py (+1, −65)
- theano/sandbox/cuda/cuda_ndarray.cu (+62, −0)
- theano/sandbox/cuda/cuda_ndarray.cuh (+1, −1)
- theano/sandbox/cuda/opt.py (+13, −0)
theano/sandbox/cuda/basic_ops.py
```python
@@ -2458,18 +2458,6 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
        inplace = int(self.inplace)
        return """
        PyObject *x_obj, *y_obj, *row_x, *row_y;
        PyObject *x_rowind_obj, *y_rowind_obj;
        dtype_%(ind)s *p_index;
        int num_indices, j;
        int ret;
        num_indices = PyArray_SIZE(%(ind)s);
        if ((num_indices - 1) > LONG_MAX) {
            PyErr_Format(PyExc_AssertionError, "num_indices %%d exceeds LONG_MAX + 1", num_indices);
            %(fail)s;
        }
        Py_XDECREF(%(out)s);
        if (!%(inplace)s) {
            %(out)s = (CudaNdarray*)CudaNdarray_Copy(%(x)s);
...
@@ -2478,59 +2466,7 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
            Py_XINCREF(%(out)s);
        }
        x_obj = (PyObject*)CudaNdarray_View(%(out)s);
        y_obj = (PyObject*)CudaNdarray_View(%(y)s);
        for (j = 0; j < num_indices; j++) {
            p_index = (dtype_%(ind)s *)PyArray_GETPTR1(%(ind)s, j);
            x_rowind_obj = PyInt_FromLong(*p_index);
            if (PyInt_AsLong(x_rowind_obj) != (*p_index)) {
                PyErr_Format(PyExc_AssertionError, "Error in converting row index to integer from long");
                // Dec Ref what ever we have increfed or allocated so far
                // We deallocate objects exactly in the reverse order they were allocated.
                Py_XDECREF(x_rowind_obj);
                Py_XDECREF(y_obj);
                Py_XDECREF(x_obj);
                %(fail)s;
            }
            y_rowind_obj = PyInt_FromLong(j);
            row_x = CudaNdarray_Subscript(x_obj, x_rowind_obj);
            row_y = CudaNdarray_Subscript(y_obj, y_rowind_obj);
            if ((row_x == NULL) || (row_y == NULL)) {
                Py_XDECREF(row_y);
                Py_XDECREF(row_x);
                Py_XDECREF(y_rowind_obj);
                Py_XDECREF(x_rowind_obj);
                Py_XDECREF(y_obj);
                Py_XDECREF(x_obj);
                %(fail)s;
            }
            ret = CudaNdarray_inplace_elemwise(row_x, row_y, IADD);
            if (ret != 0) {
                Py_XDECREF(row_y);
                Py_XDECREF(row_x);
                Py_XDECREF(y_rowind_obj);
                Py_XDECREF(x_rowind_obj);
                Py_XDECREF(y_obj);
                Py_XDECREF(x_obj);
                %(fail)s;
            }
            Py_XDECREF(row_y);
            Py_XDECREF(row_x);
            Py_XDECREF(y_rowind_obj);
            Py_XDECREF(x_rowind_obj);
        }
        Py_XDECREF(y_obj);
        Py_XDECREF(x_obj);
        CudaNdarray_vector_add_fast(%(x)s, %(y)s, %(ind)s);
        if (!%(out)s) {
            %(fail)s
...
```
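The generated C code now delegates the whole row loop to the new CudaNdarray_vector_add_fast helper (added in cuda_ndarray.cu below) instead of subscripting and IADD-ing one row at a time. Semantically the op is an unbuffered scatter-add; a minimal NumPy sketch of that semantics (illustrative names only, not part of the diff):

```python
import numpy as np

x = np.zeros((6, 4), dtype=np.float32)    # destination
y = np.ones((3, 4), dtype=np.float32)     # one update row per index
ind = np.array([2, 5, 2])                 # target rows, duplicates allowed

# Unbuffered scatter-add: rows of y are accumulated into x[ind],
# with repeated indices adding up rather than overwriting.
np.add.at(x, ind, y)
```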
theano/sandbox/cuda/cuda_ndarray.cu
```cuda
@@ -1326,6 +1326,28 @@ CudaNdarray_add(PyObject* py_self, PyObject * py_other)
    return (PyObject *) rval;
}

__global__ void k_vector_add_fast(int numRowsX, int numColsX,
                                  int stridesX0, int stridesX1, float *X,
                                  int numRowsY, int numColsY,
                                  int stridesY0, int stridesY1, float *Y,
                                  long *d_indices_arr, int num)
{
    int i = (blockIdx.x);
    int j = (threadIdx.x);
    int x_row = d_indices_arr[i];
    int y_row = i;
    atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)],
              Y[(y_row * stridesY0) + (j * stridesY1)]);
    return;
}

template <int operator_num>
__global__ void k_ielem_3(const int d0, const int d1, const int d2,
                          float *a, const int sA0, const int sA1, const int sA2,
...
```
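In the kernel above, each CUDA block handles one entry of d_indices_arr (blockIdx.x selects the update row) and each thread handles one column (threadIdx.x), with atomicAdd keeping duplicate destination rows safe. A pure-Python reference of that mapping, as a sketch assuming 2-D X and Y with one Y row per index (names are illustrative):

```python
def vector_add_fast_reference(X, Y, indices):
    """CPU sketch of k_vector_add_fast: X[indices[i], j] += Y[i, j]."""
    num_rows = len(indices)     # one CUDA block per index     (i = blockIdx.x)
    num_cols = Y.shape[1]       # one thread per column of Y   (j = threadIdx.x)
    for i in range(num_rows):
        x_row = indices[i]
        for j in range(num_cols):
            X[x_row, j] += Y[i, j]   # done with atomicAdd on the device
    return X
```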
```cuda
@@ -1776,6 +1798,46 @@ CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t
    return 0;
}

void CudaNdarray_vector_add_fast(CudaNdarray * py_self, CudaNdarray * py_other,
                                 PyArrayObject * indices_arr)
{
    const int *shapeX = CudaNdarray_HOST_DIMS(py_self);
    const int *shapeY = CudaNdarray_HOST_DIMS(py_other);
    const int *strX = CudaNdarray_HOST_STRIDES(py_self);
    const int *strY = CudaNdarray_HOST_STRIDES(py_other);
    unsigned int size = (unsigned int) PyArray_SIZE(indices_arr);
    unsigned int num_threads_per_block = shapeY[1];
    unsigned int num_blocks = size;
    dim3 n_blocks(num_blocks);
    dim3 n_threads(num_threads_per_block);
    static long *d_indices_arr = NULL;
    if (!d_indices_arr) {
        d_indices_arr = (long *) device_malloc(sizeof(long) * PyArray_SIZE(indices_arr));
    }
    assert(d_indices_arr);
    cudaError_t err = cudaMemcpy(d_indices_arr, PyArray_DATA(indices_arr),
                                 sizeof(long) * PyArray_SIZE(indices_arr),
                                 cudaMemcpyHostToDevice);
    k_vector_add_fast<<<n_blocks, n_threads>>>(shapeX[0], shapeX[1],
                                               strX[0], strX[1],
                                               CudaNdarray_DEV_DATA(py_self),
                                               shapeY[0], shapeY[1],
                                               strY[0], strY[1],
                                               CudaNdarray_DEV_DATA(py_other),
                                               d_indices_arr,
                                               PyArray_SIZE(indices_arr));
    return;
}

/*
 * We need this inplace Add to support IncSubTensor
 * It returns py_self on success with an additional reference. Else NULL.
...
```
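The launch configuration above implies a shape contract between the operands: the grid has one block per index (num_blocks = PyArray_SIZE(indices_arr)) and one thread per column of py_other (num_threads_per_block = shapeY[1]). A short sketch of that contract, as read from the code rather than from any documented API:

```python
import numpy as np

x = np.empty((100, 32), dtype=np.float32)   # py_self: destination matrix
y = np.empty((5, 32), dtype=np.float32)     # py_other: one update row per index
indices = np.array([3, 7, 7, 0, 99])        # indices_arr: duplicates allowed

num_blocks = indices.size                   # dim3 n_blocks(num_blocks)
num_threads_per_block = y.shape[1]          # dim3 n_threads(num_threads_per_block)

# For the kernel to read each element of y exactly once and stay in bounds of x,
# the natural precondition is:
assert y.shape == (indices.size, x.shape[1])
assert indices.max() < x.shape[0]
```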
theano/sandbox/cuda/cuda_ndarray.cuh
```cpp
@@ -490,7 +490,7 @@ DllExport PyObject * CudaNdarray_View(const CudaNdarray * self);
DllExport PyObject * CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other);
DllExport PyObject * CudaNdarray_Subscript(PyObject * py_self, PyObject * key);
DllExport int CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t fct_nb);
DllExport void CudaNdarray_vector_add_fast(CudaNdarray * py_self, CudaNdarray * py_other, PyArrayObject * indices_arr);

// Ensures that *arr is a pointer to a contiguous ndarray of the specified
// dimensions.
...
```
theano/sandbox/cuda/opt.py
```python
@@ -1353,6 +1353,19 @@ def local_gpualloc_memset_0(node):
            new_out = GpuAlloc(memset_0=True)(*node.inputs)
            return [new_out]


@register_opt()
@local_optimizer([None])
def my_optimizer(node):
    if (isinstance(node.op, tensor.basic.AdvancedIncSubtensor1)
            and not node.op.inplace
            and str(node.op) == "GpuAdvancedIncSubtensor1{no_inplace,inc}"):
        x, y, z = node.inputs
        if (str(x) == "GpuAlloc{memset_0=True}.0"):
            new_x = GpuAlloc(memset_0=True)(*x.owner.inputs)
            new_op = node.op.__class__(inplace=True,
                                       set_instead_of_inc=node.op.set_instead_of_inc)
            new_node = new_op(new_x, y, z)
            return [new_node]
    return False


def safe_to_gpu(x):
    if (isinstance(x.type, tensor.TensorType) and
...
```
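For context, the new my_optimizer rewrite looks for a GpuAdvancedIncSubtensor1{no_inplace,inc} node whose destination is a freshly zero-filled GpuAlloc{memset_0=True}, and swaps in an inplace instance of the op. A user-level graph that can give rise to this pattern might look like the following sketch (it assumes the old CUDA backend is enabled via device=gpu and floatX=float32; variable names are illustrative):

```python
import theano
import theano.tensor as T

idx = T.lvector("idx")    # integer row indices, duplicates allowed
y = T.fmatrix("y")        # one update row per index

# A zero-filled buffer; on the GPU this becomes GpuAlloc{memset_0=True}.
x = T.zeros((128, y.shape[1]), dtype="float32")

# x[idx] += y builds AdvancedIncSubtensor1{inc}; on the GPU it is lifted to
# GpuAdvancedIncSubtensor1{no_inplace,inc}, which the rewrite above may then
# replace with an inplace instance.
out = T.inc_subtensor(x[idx], y)

f = theano.function([idx, y], out)
```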