Commit a1796d2d authored by Frédéric Bastien

Merge pull request #1828 from nouiz/carriepl-subtensor

gh-1816 PR with small fixes
@@ -154,6 +154,12 @@ class GpuElemwise(HideC, Elemwise):
#define ga_half uint16_t
"""
try:
# We accept only some c_support_code().
# This filtering is done in make_node().
support_code += self.scalar_op.c_support_code()
except MethodNotDefined:
pass
for npy, ga in [("npy_uint8", "ga_ubyte"),
("npy_uint16", "ga_ushort"),
("npy_uin32", "ga_uint"),
@@ -179,6 +185,9 @@ class GpuElemwise(HideC, Elemwise):
raise MethodNotDefined('cuda only')
return NVCC_compiler
def c_support_code(self):
return self.scalar_op.c_support_code()
def c_support_code_apply(self, node, nodename):
if pygpu.get_default_context().kind == 'opencl':
raise MethodNotDefined('cuda only')
......
import copy
import theano
import numpy
try:
import pygpu
except ImportError:
pass
from theano import tensor, scalar, gof
from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB,
@@ -26,7 +32,9 @@ from theano.sandbox.gpuarray.nnet import (
)
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
GpuDimShuffle, GpuCAReduceCuda)
from theano.sandbox.gpuarray.subtensor import GpuIncSubtensor, GpuSubtensor
from theano.sandbox.gpuarray.subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20)
from theano.sandbox.gpuarray.type import GpuArrayConstant
gpu_optimizer = EquilibriumDB()
@@ -271,6 +279,30 @@ def local_gpua_incsubtensor(node):
return GpuIncSubtensor(node.op.idx_list, node.op.inplace,
node.op.set_instead_of_inc,
node.op.destroyhandler_tolerate_aliased)
@register_opt()
@op_lifter([tensor.AdvancedIncSubtensor1])
def local_gpua_advanced_incsubtensor(node):
# This optimization is disabled if cuda is not active
if pygpu.get_default_context().kind != "cuda":
return None
x, y = node.inputs[0:2]
coords = node.inputs[2:]
set_instead_of_inc = node.op.set_instead_of_inc
active_device_no = theano.sandbox.cuda.active_device_number()
device_properties = theano.sandbox.cuda.device_properties
compute_capability = device_properties(active_device_no)['major']
if (compute_capability < 2 or x.ndim != 2 or y.ndim != 2):
return GpuAdvancedIncSubtensor1(
set_instead_of_inc=set_instead_of_inc)
else:
return GpuAdvancedIncSubtensor1_dev20(
set_instead_of_inc=set_instead_of_inc)
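For context, a minimal usage sketch (names and shapes are illustrative, not from this PR) of the kind of graph this lifter targets; with a GPU mode, the 2-D case on compute capability >= 2 is expected to end up on GpuAdvancedIncSubtensor1_dev20 and the other cases on GpuAdvancedIncSubtensor1:

import numpy
import theano
from theano import tensor

x = theano.shared(numpy.zeros((5, 4), dtype='float32'), name='x')
y = tensor.fmatrix('y')
# x[[0, 2]] += y, expressed as an advanced inc_subtensor on the first axis.
expr = tensor.advanced_inc_subtensor1(x, y, [0, 2])
# Compiling with a GPU mode lets local_gpua_advanced_incsubtensor lift the op.
f = theano.function([y], expr)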
@register_opt()
......
import copy
import StringIO
import numpy
import theano
from theano import tensor, gof
from theano import tensor, gof, Op
from theano.gof.python25 import all, any
from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
import theano.tensor.inplace
@@ -357,3 +356,239 @@ class GpuIncSubtensor(IncSubtensor):
if not parent_version or not elemwise_version:
return
return parent_version + elemwise_version + (0,)
class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
"""
Implement AdvancedIncSubtensor1 on the gpu.
"""
def make_node(self, x, y, ilist):
x_ = as_gpuarray_variable(x)
y_ = as_gpuarray_variable(y)
ilist_ = tensor.as_tensor_variable(ilist)
assert x_.type.dtype == y_.type.dtype
assert x_.type.ndim >= y_.type.ndim
if ilist_.type.dtype[:3] not in ('int', 'uin'):
raise TypeError('index must be integers')
if ilist_.type.broadcastable != (False,):
raise TypeError('index must be vector')
if x_.type.ndim == 0:
raise TypeError('cannot index into a scalar')
if x_.type.broadcastable[0]:
# the caller should have made a copy of x len(ilist) times
raise TypeError('cannot index into a broadcastable dimension')
return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def getInplElemwiseAdditionKernel(self, a, b):
a_arg = pygpu.tools.as_argument(a, 'a')
b_arg = pygpu.tools.as_argument(b, 'b')
args = [a_arg, b_arg]
oper = "a[i] = a[i] + %(b)s" % {'b': b_arg.expr()}
k = pygpu.elemwise.ElemwiseKernel(a.context, args, oper)
return k
# We can't use the parent version that loops on each index
# as we also need to loop when set_instead_of_inc is True and the
# parent doesn't loop in that case.
def perform(self, node, inp, out_):
# TODO opt to make this inplace
x, y, idx = inp
out, = out_
# Make sure idx is not a GpuArray otherwise we cannot use its content
# to index x and y
if isinstance(idx, gpuarray.GpuArray):
idx = numpy.asarray(idx)
if not self.inplace:
x = x.copy()
if self.set_instead_of_inc:
assert y.ndim <= x.ndim # Should be guaranteed by `make_node`
if y.ndim == x.ndim:
assert len(y) == len(idx)
for (j, i) in enumerate(idx):
x[i] = y[j]
else:
for i in idx:
x[i] = y
else:
# If `y` has as many dimensions as `x`, then we want to iterate
# jointly on `x` and `y`. Otherwise, it means `y` should be
# broadcasted to fill all relevant rows of `x`.
assert y.ndim <= x.ndim # Should be guaranteed by `make_node`
if len(idx) == 0:
pass
elif y.ndim == x.ndim:
assert len(y) == len(idx)
k = self.getInplElemwiseAdditionKernel(x[0], y[0])
for (j, i) in enumerate(idx):
k(x[i], y[j], broadcast=False)
else:
nb_dims_to_add = (x.ndim - 1) - y.ndim
reshaped_y = y.reshape((1,)*nb_dims_to_add + y.shape)
k = self.getInplElemwiseAdditionKernel(x[0],
reshaped_y)
for i in idx:
k(x[i], reshaped_y, broadcast=True)
out[0] = x
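As a rough reference for the semantics the perform method above implements, here is a minimal NumPy-only sketch (no pygpu kernels; the helper name is invented for illustration), assuming a 1-D integer index vector and y with either the same ndim as x or fewer:

import numpy

def advanced_inc_subtensor1_reference(x, y, idx, set_instead_of_inc=False):
    # NumPy sketch of the set/inc semantics; in the inc case duplicate
    # indices accumulate, mirroring the loops above.
    out = x.copy()
    if y.ndim == x.ndim:
        assert len(y) == len(idx)
        for j, i in enumerate(idx):
            out[i] = y[j] if set_instead_of_inc else out[i] + y[j]
    else:
        for i in idx:
            out[i] = y if set_instead_of_inc else out[i] + y
    return out

x = numpy.arange(9, dtype='float32').reshape(3, 3)
y = 10 * numpy.ones((2, 3), dtype='float32')
print(advanced_inc_subtensor1_reference(x, y, [0, 2]))  # rows 0 and 2 incremented by 10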
class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
"""Implement AdvancedIncSubtensor1 on the gpu, but use function
only avail on compute capability 2.0 and more recent.
"""
def __init__(self, inplace=False, set_instead_of_inc=False):
# The python implementation in the parent class is not applicable here
GpuAdvancedIncSubtensor1.__init__(self, inplace, set_instead_of_inc)
def make_node(self, x, y, ilist):
"""It defer from GpuAdvancedIncSubtensor1 in that it make sure
the index are of type long.
"""
x_ = as_gpuarray_variable(x)
y_ = as_gpuarray_variable(y)
ilist_ = as_gpuarray_variable(ilist)
assert x_.type.dtype == y_.type.dtype
assert x_.type.ndim >= y_.type.ndim
if ilist_.type.dtype[:3] not in ('int', 'uin'):
raise TypeError('index must be integers')
if ilist_.type.broadcastable != (False,):
raise TypeError('index must be vector')
if x_.type.ndim == 0:
raise TypeError('cannot index into a scalar')
if x_.type.broadcastable[0]:
# the caller should have made a copy of x len(ilist) times
raise TypeError('cannot index into a broadcastable dimension')
return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
def c_code_cache_version(self):
return (2,)
def c_headers(self):
return ['cuda.h', '<compyte/extension.h>', '<numpy_compat.h>',
'<compyte/ext_cuda.h>']
def c_compiler(self):
return NVCC_compiler
def c_init_code(self):
return ['setup_ext_cuda();']
def c_code(self, node, name, inputs, outputs, sub):
active_device_no = theano.sandbox.cuda.active_device_number()
device_properties = theano.sandbox.cuda.device_properties
compute_capability = device_properties(active_device_no)['major']
if ((self.set_instead_of_inc) or
(node.inputs[0].ndim != node.inputs[1].ndim) or
(node.inputs[0].ndim != 2) or
(compute_capability < 2)):
raise NotImplementedError("This case does not have C code yet.")
x = inputs[0]
y = inputs[1]
ind = inputs[2]
out = outputs[0]
fail = sub['fail']
inplace = int(self.inplace)
return """
Py_XDECREF(%(out)s);
if (!%(inplace)s) {
%(out)s = (PyGpuArrayObject*)pygpu_copy(%(x)s, GA_C_ORDER);
} else {
%(out)s = %(x)s;
Py_XINCREF(%(out)s);
}
GpuArray_vector_add_fast(%(out)s, %(y)s, %(ind)s);
if (!%(out)s) {
%(fail)s
}
""" % locals()
def c_support_code_apply(self, node, nodename):
dtype_x = node.inputs[0].dtype
dtype_y = node.inputs[1].dtype
dtype_ind = node.inputs[2].dtype
dtype_out = node.outputs[0].dtype
itemsize_x = numpy.dtype(dtype_x).itemsize
itemsize_y = numpy.dtype(dtype_y).itemsize
itemsize_ind = numpy.dtype(dtype_ind).itemsize
itemsize_out = numpy.dtype(dtype_out).itemsize
return """
__global__ void k_vector_add_fast(int numRowsX,
int numColsX,
int stridesX0,
int stridesX1,
npy_%(dtype_x)s *X,
int numRowsY,
int numColsY,
int stridesY0,
int stridesY1,
npy_%(dtype_y)s *Y,
int numIndices,
int stridesIndices,
npy_%(dtype_ind)s *indices_arr)
{
for (int i = (blockIdx.x); i < numIndices; i += gridDim.x)
{
for(int j = (threadIdx.x); j < numColsX;j += blockDim.x)
{
int x_row = indices_arr[i * stridesIndices];
int y_row = i;
atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]);
}
}
return;
}
void GpuArray_vector_add_fast(PyGpuArrayObject* py_self,
PyGpuArrayObject* py_other,
PyGpuArrayObject *indices_arr)
{
int num_threads_per_block = std::min(PyGpuArray_DIMS(py_self)[1],
(size_t)256);
int num_blocks = std::min(PyGpuArray_SIZE(indices_arr),
(size_t)4096);
dim3 n_blocks(num_blocks);
dim3 n_threads(num_threads_per_block);
k_vector_add_fast<<<n_blocks, n_threads>>>(
PyGpuArray_DIMS(py_self)[0],
PyGpuArray_DIMS(py_self)[1],
PyGpuArray_STRIDES(py_self)[0] / %(itemsize_x)s,
PyGpuArray_STRIDES(py_self)[1] / %(itemsize_x)s,
(npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(py_self->ga.data)) +
py_self->ga.offset),
PyGpuArray_DIMS(py_other)[0],
PyGpuArray_DIMS(py_other)[1],
PyGpuArray_STRIDES(py_other)[0] / %(itemsize_y)s,
PyGpuArray_STRIDES(py_other)[1] / %(itemsize_y)s,
(npy_%(dtype_x)s*)(
((char *)cuda_get_ptr(py_other->ga.data)) +
py_other->ga.offset),
PyGpuArray_DIMS(indices_arr)[0],
PyGpuArray_STRIDES(indices_arr)[0] / %(itemsize_ind)s,
(npy_%(dtype_ind)s*)(
((char *)cuda_get_ptr(indices_arr->ga.data)) +
indices_arr->ga.offset)
);
return;
}
""" %locals()
import numpy
import theano
from theano.tensor.tests.test_subtensor import T_subtensor
from theano.sandbox.gpuarray.basic_ops import (HostFromGpu, GpuFromHost)
from theano.sandbox.gpuarray.subtensor import GpuIncSubtensor, GpuSubtensor
from theano.sandbox.gpuarray.subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedIncSubtensor1)
from theano.sandbox.gpuarray.type import gpuarray_shared_constructor
@@ -21,6 +25,7 @@ class G_subtensor(T_subtensor):
shared=gpuarray_shared_constructor,
sub=GpuSubtensor,
inc_sub=GpuIncSubtensor,
adv_incsub1=GpuAdvancedIncSubtensor1,
mode=mode_with_gpu,
# avoid errors with limited devices
dtype='float32',
@@ -29,3 +34,24 @@ class G_subtensor(T_subtensor):
# GPU opt can't run in fast_compile only.
self.fast_compile = False
assert self.sub == GpuSubtensor
def test_advinc_subtensor1():
""" Test the second case in the opt local_gpu_advanced_incsubtensor1 """
for shp in [(3, 3), (3, 3, 3)]:
shared = gpuarray_shared_constructor
xval = numpy.arange(numpy.prod(shp), dtype='float32').reshape(shp) + 1
yval = numpy.empty((2,) + shp[1:], dtype='float32')
yval[:] = 10
x = shared(xval, name='x')
y = tensor.tensor(dtype='float32',
broadcastable=(False,) * len(shp),
name='y')
expr = tensor.advanced_inc_subtensor1(x, y, [0, 2])
f = theano.function([y], expr, mode=mode_with_gpu)
assert sum([isinstance(node.op, GpuAdvancedIncSubtensor1)
for node in f.maker.fgraph.toposort()]) == 1
rval = f(yval)
rep = xval.copy()
rep[[0, 2]] += yval
assert numpy.allclose(rval, rep)
@@ -29,6 +29,7 @@ if cuda_available:
from theano.sandbox.gpuarray.basic_ops import GpuKernelBase
from theano.sandbox.gpuarray.type import GpuArrayType
def matVecModM(A, s, m):
assert A.dtype == 'int64'
return numpy.int32(numpy.sum((A*s) % m, 1) % m)
......
@@ -431,7 +431,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
self.assertTrue(numpy.allclose(val, good), (val, good))
# Test reuse of output memory
if isinstance(self.adv_sub1, tensor.AdvancedSubtensor1):
if type(self.adv_sub1) == tensor.AdvancedSubtensor1:
op = self.adv_sub1()
# When idx is a TensorConstant.
if hasattr(idx, "data"):
......