Mostly adapted Op and tests to new backend. TODO: Remove faulty python…

Mostly adapted Op and tests to new backend. TODO: Remove faulty python implementation from _dev20 version of op

Mostly adapted Op and tests to new backend. TODO: Remove faulty python…
fff9c1f7 · Pierre Luc Carrier · 6936dd28 · fff9c1f7 · fff9c1f7 · fff9c1f7
--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
@@ -10,6 +10,7 @@ from theano.gof import (local_optimizer, EquilibriumDB,
 from theano.gof.python25 import all, any
 from theano.tensor.nnet.conv import ConvOp
+from theano.sandbox.cuda.basic_ops import device_properties
 from theano.sandbox.gpuarray.type import GpuArrayType
 from theano.sandbox.gpuarray.basic_ops import (host_from_gpu,
                                               gpu_from_host,
@@ -25,7 +26,9 @@ from theano.sandbox.gpuarray.nnet import (GpuCrossentropySoftmaxArgmax1HotWithBi
                                          GpuSoftmax)
 from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
                                              GpuDimShuffle, GpuCAReduceCuda)
-from theano.sandbox.gpuarray.subtensor import GpuIncSubtensor, GpuSubtensor
+from theano.sandbox.gpuarray.subtensor import (GpuIncSubtensor, GpuSubtensor,
+                                               GpuAdvancedIncSubtensor1,
+                                               GpuAdvancedIncSubtensor1_dev20)
 from theano.sandbox.gpuarray.type import GpuArrayConstant
 gpu_optimizer = EquilibriumDB()
@@ -243,6 +246,23 @@ def local_gpua_incsubtensor(node):
                           node.op.destroyhandler_tolerate_aliased)
+@register_opt()
+@op_lifter([tensor.AdvancedIncSubtensor1])
+def local_gpua_advanced_incsubtensor(node):
+    x, y = node.inputs[0:2]
+    coords = node.inputs[2:]
+    set_instead_of_inc = node.op.set_instead_of_inc
+    active_device_no = theano.sandbox.cuda.active_device_number()
+    compute_capability = device_properties(active_device_no)['major']
+    if (compute_capability < 2 or x.ndim != 2 or y.ndim != 2):
+        return GpuAdvancedIncSubtensor1(
+                    set_instead_of_inc=set_instead_of_inc)
+    else:
+        return GpuAdvancedIncSubtensor1_dev20(
+                    set_instead_of_inc=set_instead_of_inc)
 @register_opt()
 @op_lifter([tensor.CAReduce, tensor.Sum])
 def local_gpua_careduce(node):

--- a/theano/sandbox/gpuarray/subtensor.py
+++ b/theano/sandbox/gpuarray/subtensor.py
@@ -4,9 +4,10 @@ import StringIO
 import numpy
 import theano
-from theano import tensor, gof
+from theano import tensor, gof, Op
 from theano.gof.python25 import all, any
 from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
+from theano.sandbox.cuda.basic_ops import device_properties
 from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
 try:
@@ -358,13 +359,13 @@ class GpuIncSubtensor(IncSubtensor):
        return parent_version + elemwise_version + (0,)
-class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
+class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, Op):
    """
    Implement AdvancedIncSubtensor1 on the gpu.
    """
    def make_node(self, x, y, ilist):
-        x_ = as_cuda_ndarray_variable(x)
+        x_ = as_gpuarray_variable(x)
-        y_ = as_cuda_ndarray_variable(y)
+        y_ = as_gpuarray_variable(y)
        ilist_ = tensor.as_tensor_variable(ilist)
        assert x_.type.dtype == y_.type.dtype
@@ -380,7 +381,7 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
            # the caller should have made a copy of x len(ilist) times
            raise TypeError('cannot index into a broadcastable dimension')
-        return Apply(self, [x_, y_, ilist_], [x_.type()])
+        return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
    # CudaNdarray_Subscript() doesn't support Advanced slicing.
    # But we can't use the parent version that loops on each index
@@ -393,13 +394,15 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
        if not self.inplace:
            x = x.copy()
        if self.set_instead_of_inc:
-            # CudaNdarray __setitem__ doesn't do broadcast nor support
-            # list of index.
            assert y.ndim <= x.ndim   # Should be guaranteed by `make_node`
            if y.ndim == x.ndim:
                assert len(y) == len(idx)
                for (j, i) in enumerate(idx):
+                    try:
                        x[i] = y[j]
+                    except:
+                        import pdb
+                        pdb.set_trace()
            else:
                for i in idx:
                    x[i] = y
@@ -411,108 +414,17 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
            if y.ndim == x.ndim:
                assert len(y) == len(idx)
                for (j, i) in enumerate(idx):                    
-                    x[i] += y[j]
+                    #x[i] += y[j]
+                    pygpu.elemwise.ielemwise2(x[i], '+', y[j],  broadcast=False)
            else:
                for i in idx:
-                    x[i] += y
+                    #x[i] += y
-        out[0] = x
+                    nb_dims_to_add = (x[i].ndim - y.ndim)
+                    reshaped_y = y.reshape((1,)*nb_dims_to_add + y.shape)
-    def c_code_cache_version(self):
+                    pygpu.elemwise.ielemwise2(x[i], '+', reshaped_y, 
-        return (3,)
+                                              broadcast=True)
-    def c_code(self, node, name, inputs, outputs, sub):
-        if (self.set_instead_of_inc) or \
-           (node.inputs[0].ndim != node.inputs[1].ndim):
-            raise NotImplementedError("This case does not have C code yet.")
-        x = inputs[0]
-        y = inputs[1]
-        ind = inputs[2]
-        out = outputs[0]
-        fail = sub['fail']
-        inplace = int(self.inplace)
-        return """
-        PyObject *x_obj, *y_obj, *row_x, *row_y;
-        PyObject *x_rowind_obj, *y_rowind_obj;
-        dtype_%(ind)s *p_index;
-        int num_indices, j;
-        int ret;
-        num_indices = PyArray_SIZE(%(ind)s);
-        if ((num_indices - 1) > LONG_MAX) {
-            PyErr_Format(PyExc_AssertionError,
-                         "num_indices %%d exceeds LONG_MAX + 1", num_indices);
-            %(fail)s;
-        }
-        Py_XDECREF(%(out)s);
+        out[0] = x  
-        if (!%(inplace)s) {
-            %(out)s = (CudaNdarray*)CudaNdarray_Copy(%(x)s);
-        } else {
-            %(out)s = %(x)s;
-            Py_XINCREF(%(out)s);
-        }
-        x_obj = (PyObject*)CudaNdarray_View(%(out)s);
-        y_obj = (PyObject*)CudaNdarray_View(%(y)s);
-        for (j = 0;j < num_indices; j++) {
-             p_index = (dtype_%(ind)s *)PyArray_GETPTR1(%(ind)s, j);
-             x_rowind_obj = PyInt_FromLong(*p_index);
-             if (PyInt_AsLong(x_rowind_obj) != (*p_index)) {
-                 PyErr_Format(PyExc_AssertionError,
-                              "Error in converting row index to integer from long");
-                 // Dec Ref what ever we have increfed or allocated so far
-                 // We deallocate objects exactly in the reverse order they were allocated.
-                 Py_XDECREF(x_rowind_obj);
-                 Py_XDECREF(y_obj);
-                 Py_XDECREF(x_obj);
-                 %(fail)s;
-             }
-             y_rowind_obj = PyInt_FromLong(j);
-             row_x = CudaNdarray_Subscript(x_obj, x_rowind_obj);
-             row_y = CudaNdarray_Subscript(y_obj, y_rowind_obj);
-             if ((row_x == NULL) || (row_y == NULL)) {
-                  Py_XDECREF(row_y);
-                  Py_XDECREF(row_x);
-                  Py_XDECREF(y_rowind_obj);
-                  Py_XDECREF(x_rowind_obj);
-                  Py_XDECREF(y_obj);
-                  Py_XDECREF(x_obj);
-                  %(fail)s;
-             }
-             ret = CudaNdarray_inplace_elemwise(row_x, row_y, IADD);
-             if (ret != 0) {
-                 Py_XDECREF(row_y);
-                 Py_XDECREF(row_x);
-                 Py_XDECREF(y_rowind_obj);
-                 Py_XDECREF(x_rowind_obj);
-                 Py_XDECREF(y_obj);
-                 Py_XDECREF(x_obj);
-                 %(fail)s;
-             }
-             Py_XDECREF(row_y);
-             Py_XDECREF(row_x);
-             Py_XDECREF(y_rowind_obj);
-             Py_XDECREF(x_rowind_obj);
-        }
-        Py_XDECREF(y_obj);
-        Py_XDECREF(x_obj);
-        if (!%(out)s) {
-            %(fail)s
-        }
-        """ % locals()
 class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
@@ -520,21 +432,17 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
    only avail on compute capability 2.0 and more recent.
    """
+    def __init__(self, inplace=False, set_instead_of_inc=False):
+        # The python implementation in the parent class is not applicable here
+        GpuAdvancedIncSubtensor1.__init__(self, inplace, set_instead_of_inc)
    def make_node(self, x, y, ilist):
        """It defer from GpuAdvancedIncSubtensor1 in that it make sure
        the index are of type long.
        """
-        x_ = as_cuda_ndarray_variable(x)
+        x_ = as_gpuarray_variable(x)
-        y_ = as_cuda_ndarray_variable(y)
+        y_ = as_gpuarray_variable(y)
-        ilist_ = tensor.as_tensor_variable(ilist)
+        ilist_ = as_gpuarray_variable(ilist)
-        convert_map = {8: tensor.basic._convert_to_int8,
-                       16: tensor.basic._convert_to_int16,
-                       32: tensor.basic._convert_to_int32,
-                       64: tensor.basic._convert_to_int64
-        }
-        intwidth = theano.gof.compiledir.python_int_bitwidth()
-        ilist_ = convert_map[intwidth](ilist_)
        assert x_.type.dtype == y_.type.dtype
        assert x_.type.ndim >= y_.type.ndim
@@ -549,11 +457,21 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
            # the caller should have made a copy of x len(ilist) times
            raise TypeError('cannot index into a broadcastable dimension')
-        return Apply(self, [x_, y_, ilist_], [x_.type()])
+        return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
    def c_code_cache_version(self):
        return (2,)
+    def c_headers(self):
+        return ['cuda.h', '<compyte/extension.h>', '<numpy_compat.h>',
+                '<compyte/ext_cuda.h>']
+    def c_compiler(self):
+        return NVCC_compiler
+    def c_init_code(self):
+        return ['setup_ext_cuda();']
    def c_code(self, node, name, inputs, outputs, sub):
        active_device_no = theano.sandbox.cuda.active_device_number()
        compute_capability = device_properties(active_device_no)['major']
@@ -572,13 +490,13 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
        return """
        Py_XDECREF(%(out)s);
        if (!%(inplace)s) {
-            %(out)s = (CudaNdarray*)CudaNdarray_Copy(%(x)s);
+            %(out)s = (PyGpuArrayObject*)pygpu_copy(%(x)s, GA_C_ORDER);
        } else {
            %(out)s = %(x)s;
            Py_XINCREF(%(out)s);
        }
-        CudaNdarray_vector_add_fast(%(out)s, %(y)s, %(ind)s);
+        GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s);
        if (!%(out)s) {
            %(fail)s
@@ -586,26 +504,35 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
        """ % locals()
    def c_support_code_apply(self, node, nodename):
+        dtype_x = node.inputs[0].dtype
+        dtype_y = node.inputs[1].dtype
+        dtype_ind = node.inputs[2].dtype
+        dtype_out = node.outputs[0].dtype
+        itemsize_x = numpy.dtype(dtype_x).itemsize
+        itemsize_y = numpy.dtype(dtype_y).itemsize
+        itemsize_ind = numpy.dtype(dtype_ind).itemsize
+        itemsize_out = numpy.dtype(dtype_out).itemsize
        return """
        __global__ void k_vector_add_fast(int numRowsX,
                                          int numColsX,
                                          int stridesX0,
                                          int stridesX1,
-                                          float *X,
+                                          npy_%(dtype_x)s *X,
                                          int numRowsY,
                                          int numColsY,
                                          int stridesY0,
                                          int stridesY1,
-                                          float *Y ,
+                                          npy_%(dtype_y)s *Y,
-                                          long *d_indices_arr,
+                                          int numIndices,
-                                          int num)
+                                          int stridesIndices,
+                                          npy_%(dtype_ind)s *indices_arr)
        {
-             for (int i = (blockIdx.x); i < num; i += gridDim.x)
+             for (int i = (blockIdx.x); i < numIndices; i += gridDim.x)
             {
                  for(int j = (threadIdx.x); j < numColsX;j += blockDim.x)
                  {
-                      int x_row = d_indices_arr[i];
+                      int x_row = indices_arr[i * stridesIndices];
                      int y_row = i;
                      atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]);
                  }
@@ -613,49 +540,39 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
             return;
        }
-        void CudaNdarray_vector_add_fast(CudaNdarray* py_self, CudaNdarray* py_other, PyArrayObject *indices_arr)
+        void GpuArray_vector_add_fast(PyGpuArrayObject* py_self,
+                                      PyGpuArrayObject* py_other,
+                                      PyGpuArrayObject *indices_arr)
        {
-                const int *shapeX = CudaNdarray_HOST_DIMS(py_self);
+                int num_threads_per_block = std::min(PyGpuArray_DIMS(py_self)[1],
-                const int *shapeY = CudaNdarray_HOST_DIMS(py_other);
+                                                     (size_t)256);
-                const int *strX   = CudaNdarray_HOST_STRIDES(py_self);
+                int num_blocks = std::min(PyGpuArray_SIZE(indices_arr),
-                const int *strY   = CudaNdarray_HOST_STRIDES(py_other);
+                                          (size_t)4096);
-                unsigned int size = (unsigned int)PyArray_SIZE(indices_arr);
-                unsigned int numcolsX = shapeX[1];
-                unsigned int num_threads_per_block = std::min(numcolsX, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
-                unsigned int num_blocks = std::min(size ,(unsigned int)NUM_VECTOR_OP_BLOCKS);
                dim3 n_blocks(num_blocks);
                dim3 n_threads(num_threads_per_block);
-                long *d_indices_arr = NULL;
+                k_vector_add_fast<<<n_blocks, n_threads>>>(
-                PyArrayObject *cpu_indices_arr = PyArray_GETCONTIGUOUS(indices_arr);
+                        PyGpuArray_DIMS(py_self)[0],
+                        PyGpuArray_DIMS(py_self)[1],
-                d_indices_arr = (long*)device_malloc(PyArray_NBYTES(cpu_indices_arr));
+                        PyGpuArray_STRIDES(py_self)[0] / %(itemsize_x)s,
-                assert(d_indices_arr);
+                        PyGpuArray_STRIDES(py_self)[1] / %(itemsize_x)s,
+                        (npy_%(dtype_x)s*)(
-                cudaError_t err = cudaMemcpy(d_indices_arr,
+                            ((char *)cuda_get_ptr(py_self->ga.data)) +
-                                             PyArray_DATA(cpu_indices_arr),
+                            py_self->ga.offset),
-                                             PyArray_NBYTES(cpu_indices_arr),
+                        PyGpuArray_DIMS(py_other)[0],
-                                             cudaMemcpyHostToDevice);
+                        PyGpuArray_DIMS(py_other)[1],
+                        PyGpuArray_STRIDES(py_other)[0] / %(itemsize_y)s,
-                assert(err == cudaSuccess);
+                        PyGpuArray_STRIDES(py_other)[1] / %(itemsize_y)s,
+                        (npy_%(dtype_x)s*)(
-                k_vector_add_fast<<<n_blocks, n_threads>>>(shapeX[0],
+                            ((char *)cuda_get_ptr(py_other->ga.data)) +
-                                                           shapeX[1],
+                            py_other->ga.offset),
-                                                           strX[0],
+                        PyGpuArray_DIMS(indices_arr)[0],
-                                                           strX[1],
+                        PyGpuArray_STRIDES(indices_arr)[0] / %(itemsize_ind)s,
-                                                           CudaNdarray_DEV_DATA(py_self),
+                        (npy_%(dtype_ind)s*)(
-                                                           shapeY[0],
+                            ((char *)cuda_get_ptr(indices_arr->ga.data)) +
-                                                           shapeY[1],
+                            indices_arr->ga.offset)
-                                                           strY[0],
-                                                           strY[1],
-                                                           CudaNdarray_DEV_DATA(py_other),
-                                                           d_indices_arr,
-                                                           PyArray_SIZE(indices_arr)
                );
-                device_free(d_indices_arr);
-                Py_XDECREF(cpu_indices_arr);
                return;
        }

--- a/theano/sandbox/gpuarray/tests/test_subtensor.py
+++ b/theano/sandbox/gpuarray/tests/test_subtensor.py
+import numpy
+import theano
 from theano.tensor.tests.test_subtensor import T_subtensor
 from theano.sandbox.gpuarray.basic_ops import (HostFromGpu, GpuFromHost)
-from theano.sandbox.gpuarray.subtensor import GpuIncSubtensor, GpuSubtensor
+from theano.sandbox.gpuarray.subtensor import (GpuIncSubtensor, GpuSubtensor,
+                                               GpuAdvancedIncSubtensor1)
 from theano.sandbox.gpuarray.type import gpuarray_shared_constructor
@@ -21,6 +25,7 @@ class G_subtensor(T_subtensor):
                             shared=gpuarray_shared_constructor,
                             sub=GpuSubtensor,
                             inc_sub=GpuIncSubtensor,
+                             adv_incsub1 = GpuAdvancedIncSubtensor1,
                             mode=mode_with_gpu,
                             # avoid errors with limited devices
                             dtype='float32',
@@ -34,17 +39,17 @@ class G_subtensor(T_subtensor):
 def test_advinc_subtensor1():
    """ Test the second case in the opt local_gpu_advanced_incsubtensor1 """
    for shp in [(3, 3), (3, 3, 3)]:
-        shared = cuda.shared_constructor
+        shared = gpuarray_shared_constructor
        xval = numpy.arange(numpy.prod(shp), dtype='float32').reshape(shp) + 1
        yval = numpy.empty((2,) + shp[1:], dtype='float32')
        yval[:] = 10
        x = shared(xval, name='x')
-        y = T.tensor(dtype='float32',
+        y = tensor.tensor(dtype='float32',
                     broadcastable=(False,) * len(shp),
                     name='y')
-        expr = T.advanced_inc_subtensor1(x, y, [0, 2])
+        expr = tensor.advanced_inc_subtensor1(x, y, [0, 2])
        f = theano.function([y], expr, mode=mode_with_gpu)
-        assert sum([isinstance(node.op, cuda.GpuAdvancedIncSubtensor1)
+        assert sum([isinstance(node.op, GpuAdvancedIncSubtensor1)
                    for node in f.maker.fgraph.toposort()]) == 1
        rval = f(yval)
        rep = xval.copy()