Commit 4ba74e22 authored by Frédéric Bastien

Merge pull request #1316 from viveksck/try_nouiz

WIP: Speeding up GpuAdvancedIncSubtensor1 by writing fast CUDA code
@@ -2444,7 +2444,7 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
        out[0] = x

    def c_code_cache_version(self):
-        return (1,)
+        return (3,)

    def c_code(self, node, name, inputs, outputs, sub):
        if (self.set_instead_of_inc) or \
@@ -2467,7 +2467,8 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
        num_indices = PyArray_SIZE(%(ind)s);
        if ((num_indices - 1) > LONG_MAX) {
-            PyErr_Format(PyExc_AssertionError, "num_indices %%d exceeds LONG_MAX + 1", num_indices);
+            PyErr_Format(PyExc_AssertionError,
+                         "num_indices %%d exceeds LONG_MAX + 1", num_indices);
            %(fail)s;
        }
@@ -2489,7 +2490,8 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
            x_rowind_obj = PyInt_FromLong(*p_index);
            if (PyInt_AsLong(x_rowind_obj) != (*p_index)) {
-                PyErr_Format(PyExc_AssertionError, "Error in converting row index to integer from long");
+                PyErr_Format(PyExc_AssertionError,
+                             "Error in converting row index to integer from long");
                // Dec Ref what ever we have increfed or allocated so far
                // We deallocate objects exactly in the reverse order they were allocated.
                Py_XDECREF(x_rowind_obj);
@@ -2536,6 +2538,153 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
        if (!%(out)s) {
            %(fail)s
        }
        """ % locals()

class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
"""Implement AdvancedIncSubtensor1 on the gpu, but use function
only avail on compute capability 2.0 and more recent.
"""
def make_node(self, x, y, ilist):
"""It defer from GpuAdvancedIncSubtensor1 in that it make sure
the index are of type long.
"""
        x_ = as_cuda_ndarray_variable(x)
        y_ = as_cuda_ndarray_variable(y)
        ilist_ = tensor.as_tensor_variable(ilist)

        convert_map = {8: tensor.basic._convert_to_int8,
                       16: tensor.basic._convert_to_int16,
                       32: tensor.basic._convert_to_int32,
                       64: tensor.basic._convert_to_int64}
        intwidth = theano.gof.compiledir.python_int_bitwidth()
        ilist_ = convert_map[intwidth](ilist_)

        assert x_.type.dtype == y_.type.dtype
        assert x_.type.ndim >= y_.type.ndim

        if ilist_.type.dtype[:3] not in ('int', 'uin'):
            raise TypeError('index must be integers')
        if ilist_.type.broadcastable != (False,):
            raise TypeError('index must be vector')
        if x_.type.ndim == 0:
            raise TypeError('cannot index into a scalar')
        if x_.type.broadcastable[0]:
            # the caller should have made a copy of x len(ilist) times
            raise TypeError('cannot index into a broadcastable dimension')

        return Apply(self, [x_, y_, ilist_], [x_.type()])

    def c_code_cache_version(self):
        return (2,)

    def c_code(self, node, name, inputs, outputs, sub):
        active_device_no = theano.sandbox.cuda.active_device_number()
        compute_capability = device_properties(active_device_no)['major']
        if ((self.set_instead_of_inc) or
            (node.inputs[0].ndim != node.inputs[1].ndim) or
            (node.inputs[0].ndim != 2) or
            (compute_capability < 2)):
            raise NotImplementedError("This case does not have C code yet.")

        x = inputs[0]
        y = inputs[1]
        ind = inputs[2]
        out = outputs[0]
        fail = sub['fail']
        inplace = int(self.inplace)
        return """
        Py_XDECREF(%(out)s);
        if (!%(inplace)s) {
            %(out)s = (CudaNdarray*)CudaNdarray_Copy(%(x)s);
        } else {
            %(out)s = %(x)s;
            Py_XINCREF(%(out)s);
        }
        CudaNdarray_vector_add_fast(%(out)s, %(y)s, %(ind)s);
        if (!%(out)s) {
            %(fail)s
        }
        """ % locals()

    def c_support_code_apply(self, node, nodename):
        return """
        __global__ void k_vector_add_fast(int numRowsX,
                                          int numColsX,
                                          int stridesX0,
                                          int stridesX1,
                                          float *X,
                                          int numRowsY,
                                          int numColsY,
                                          int stridesY0,
                                          int stridesY1,
                                          float *Y,
                                          long *d_indices_arr,
                                          int num)
        {
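            // Parallelisation: blocks stride over the rows of Y (one row of Y
            // per block iteration), threads stride over the columns.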
            for (int i = (blockIdx.x); i < num; i += gridDim.x)
            {
                for (int j = (threadIdx.x); j < numColsX; j += blockDim.x)
                {
                    int x_row = d_indices_arr[i];
                    int y_row = i;
                    atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)],
                              Y[(y_row * stridesY0) + (j * stridesY1)]);
                }
            }
            return;
        }

        void CudaNdarray_vector_add_fast(CudaNdarray* py_self,
                                         CudaNdarray* py_other,
                                         PyArrayObject *indices_arr)
        {
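            // Scatter-add the rows of py_other into py_self at the row
            // positions listed in indices_arr.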
            const int *shapeX = CudaNdarray_HOST_DIMS(py_self);
            const int *shapeY = CudaNdarray_HOST_DIMS(py_other);
            const int *strX = CudaNdarray_HOST_STRIDES(py_self);
            const int *strY = CudaNdarray_HOST_STRIDES(py_other);
            unsigned int size = (unsigned int)PyArray_SIZE(indices_arr);
            unsigned int numcolsX = shapeX[1];
            unsigned int num_threads_per_block = std::min(
                numcolsX, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
            unsigned int num_blocks = std::min(
                size, (unsigned int)NUM_VECTOR_OP_BLOCKS);
            dim3 n_blocks(num_blocks);
            dim3 n_threads(num_threads_per_block);

            long *d_indices_arr = NULL;
            PyArrayObject *cpu_indices_arr = PyArray_GETCONTIGUOUS(indices_arr);
            d_indices_arr = (long*)device_malloc(PyArray_NBYTES(cpu_indices_arr));
            assert(d_indices_arr);
            cudaError_t err = cudaMemcpy(d_indices_arr,
                                         PyArray_DATA(cpu_indices_arr),
                                         PyArray_NBYTES(cpu_indices_arr),
                                         cudaMemcpyHostToDevice);
            assert(err == cudaSuccess);

            k_vector_add_fast<<<n_blocks, n_threads>>>(shapeX[0],
                                                       shapeX[1],
                                                       strX[0],
                                                       strX[1],
                                                       CudaNdarray_DEV_DATA(py_self),
                                                       shapeY[0],
                                                       shapeY[1],
                                                       strY[0],
                                                       strY[1],
                                                       CudaNdarray_DEV_DATA(py_other),
                                                       d_indices_arr,
                                                       PyArray_SIZE(indices_arr));

            device_free(d_indices_arr);
            Py_XDECREF(cpu_indices_arr);
            return;
        }
        """ % locals()
class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
...
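The kernel above implements a scatter-add: row i of Y is added into row d_indices_arr[i] of X, with blocks striding over the rows and threads over the columns. atomicAdd is what lets duplicate row indices accumulate correctly instead of racing, and single-precision atomicAdd on global memory only exists on compute capability 2.0 and newer, hence the _dev20 suffix. Below is a self-contained sketch of the same pattern; the names are hypothetical and it assumes C-contiguous storage instead of the explicit strides the real kernel takes:

    // scatter_add.cu -- standalone sketch of the k_vector_add_fast pattern
    // (hypothetical example, not part of this commit).
    // Build: nvcc -arch=sm_20 scatter_add.cu
    #include <cstdio>

    // X[idx[i]][j] += Y[i][j]; both matrices assumed C-contiguous here.
    __global__ void scatter_add_rows(float *X, const float *Y,
                                     const long *idx, int nrows, int ncols)
    {
        for (int i = blockIdx.x; i < nrows; i += gridDim.x)        // blocks stride over rows of Y
            for (int j = threadIdx.x; j < ncols; j += blockDim.x)  // threads stride over columns
                // Float atomicAdd (compute capability >= 2.0) makes
                // duplicate entries in idx accumulate instead of racing.
                atomicAdd(&X[idx[i] * ncols + j], Y[i * ncols + j]);
    }

    int main()
    {
        const int ncols = 3, nx = 3, ny = 2;
        float hX[nx * ncols] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
        float hY[ny * ncols] = {10, 10, 10, 10, 10, 10};
        long  hI[ny] = {0, 2};            // increment rows 0 and 2 of X

        float *dX, *dY;
        long *dI;
        cudaMalloc((void**)&dX, sizeof(hX));
        cudaMalloc((void**)&dY, sizeof(hY));
        cudaMalloc((void**)&dI, sizeof(hI));
        cudaMemcpy(dX, hX, sizeof(hX), cudaMemcpyHostToDevice);
        cudaMemcpy(dY, hY, sizeof(hY), cudaMemcpyHostToDevice);
        cudaMemcpy(dI, hI, sizeof(hI), cudaMemcpyHostToDevice);

        scatter_add_rows<<<ny, ncols>>>(dX, dY, dI, ny, ncols);
        cudaMemcpy(hX, dX, sizeof(hX), cudaMemcpyDeviceToHost);

        for (int r = 0; r < nx; ++r)      // expect rows 0 and 2 raised by 10
            printf("%g %g %g\n", hX[r*ncols], hX[r*ncols+1], hX[r*ncols+2]);

        cudaFree(dX); cudaFree(dY); cudaFree(dI);
        return 0;
    }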
@@ -522,7 +522,6 @@ DllExport PyObject * CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other);
DllExport PyObject * CudaNdarray_Subscript(PyObject * py_self, PyObject * key);
DllExport int CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t fct_nb);

// Ensures that *arr is a pointer to a contiguous ndarray of the specified
// dimensions.
// *arr may initially be NULL, a pointer to an ndarray of the wrong size,
...
@@ -781,9 +781,16 @@ def local_gpu_advanced_incsubtensor1(node):
                'either set the `warn.gpu_set_subtensor1` config '
                'option to False, or `warn.ignore_bug_before` to at '
                'least \'0.6\'.', stacklevel=1)
        active_device_no = theano.sandbox.cuda.active_device_number()
        compute_capability = device_properties(active_device_no)['major']
        if (compute_capability < 2 or
                x.ndim != 2 or
                y.ndim != 2):
            gpu_op = GpuAdvancedIncSubtensor1(
                set_instead_of_inc=set_instead_of_inc)
        else:
            gpu_op = GpuAdvancedIncSubtensor1_dev20(
                set_instead_of_inc=set_instead_of_inc)
        return [gpu_op(gpu_from_host(x), gpu_from_host(y), *coords)]
    # Should not execute for GpuAdvancedIncSubtensor1
@@ -814,8 +821,16 @@ def local_gpu_advanced_incsubtensor1(node):
                'option to False, or `warn.ignore_bug_before` to at '
                'least \'0.6\'.', stacklevel=1)
        active_device_no = theano.sandbox.cuda.active_device_number()
        compute_capability = device_properties(active_device_no)['major']
        if (compute_capability < 2 or
                x.ndim != 2 or
                y.ndim != 2):
            gpu_op = GpuAdvancedIncSubtensor1(
                set_instead_of_inc=set_instead_of_inc)
        else:
            gpu_op = GpuAdvancedIncSubtensor1_dev20(
                set_instead_of_inc=set_instead_of_inc)
        return [host_from_gpu(gpu_op(gpu_x, gpu_y, *coords))]
    return False
...
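With this change the optimizer picks the op per device: operands must be 2-D and the device's compute capability major number must be at least 2 for GpuAdvancedIncSubtensor1_dev20; everything else falls back to the generic GpuAdvancedIncSubtensor1. For reference, the same capability check written directly against the CUDA runtime API looks roughly like this (a sketch, not code from this commit):

    #include <cstdio>
    #include <cuda_runtime.h>

    int main()
    {
        int dev = 0;
        cudaGetDevice(&dev);                  // the active device, as in active_device_number()
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, dev);  // prop.major ~ device_properties(...)['major']
        if (prop.major < 2)
            printf("compute capability %d.%d: no float atomicAdd, use the generic op\n",
                   prop.major, prop.minor);
        else
            printf("compute capability %d.%d: fast atomic kernel available\n",
                   prop.major, prop.minor);
        return 0;
    }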
@@ -1005,20 +1005,23 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):

def test_advinc_subtensor1():
    """ Test the second case in the opt local_gpu_advanced_incsubtensor1 """
-    shared = cuda.shared_constructor
-    #shared = tensor.shared
-    xval = numpy.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
-                         dtype='float32')
-    yval = numpy.asarray([[10, 10, 10], [10, 10, 10]],
-                         dtype='float32')
-    x = shared(xval, name='x')
-    y = T.fmatrices('y')
-    expr = T.advanced_inc_subtensor1(x, y, [0, 2])
-    f = theano.function([y], expr, mode=mode_with_gpu)
-    assert sum([isinstance(node.op, cuda.GpuAdvancedIncSubtensor1)
-                for node in f.maker.fgraph.toposort()]) == 1
-    assert numpy.allclose(f(yval), [[11., 12., 13.], [4., 5., 6.],
-                                    [17., 18., 19.]])
+    for shp in [(3, 3), (3, 3, 3)]:
+        shared = cuda.shared_constructor
+        xval = numpy.arange(numpy.prod(shp), dtype='float32').reshape(shp) + 1
+        yval = numpy.empty((2,) + shp[1:], dtype='float32')
+        yval[:] = 10
+        x = shared(xval, name='x')
+        y = T.tensor(dtype='float32',
+                     broadcastable=(False,) * len(shp),
+                     name='y')
+        expr = T.advanced_inc_subtensor1(x, y, [0, 2])
+        f = theano.function([y], expr, mode=mode_with_gpu)
+        assert sum([isinstance(node.op, cuda.GpuAdvancedIncSubtensor1)
+                    for node in f.maker.fgraph.toposort()]) == 1
+        rval = f(yval)
+        rep = xval.copy()
+        rep[[0, 2]] += yval
+        assert numpy.allclose(rval, rep)


def test_inc_subtensor():
...