Commit d2cd02d0 authored by Shawn Tan

Initial additions for `GpuAdvancedIncSubtensor`

- `theano/gpuarray/opt.py` added optimisation to include new op - `theano/gpuarray/subtensor.py` added op - `theano/gpuarray/tests/test_subtensor.py` added simple test case
Parent 3796417a
...@@ -68,6 +68,7 @@ from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda, ...@@ -68,6 +68,7 @@ from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda,
from .subtensor import (GpuIncSubtensor, GpuSubtensor, from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor, GpuAdvancedSubtensor,
GpuAdvancedSubtensor1, GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20) GpuAdvancedIncSubtensor1_dev20)
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
...@@ -1064,32 +1065,36 @@ def local_gpua_advanced_subtensor(op, context_name, inputs, outputs): ...@@ -1064,32 +1065,36 @@ def local_gpua_advanced_subtensor(op, context_name, inputs, outputs):
@register_opt('fast_compile')
@op_lifter([tensor.AdvancedIncSubtensor1, tensor.AdvancedIncSubtensor])
@register_opt2([tensor.AdvancedIncSubtensor1, tensor.AdvancedIncSubtensor], 'fast_compile')
def local_gpua_advanced_incsubtensor(op, context_name, inputs, outputs):
    """
    Lift AdvancedIncSubtensor1/AdvancedIncSubtensor to the GPU.

    For AdvancedIncSubtensor1, picks between GpuAdvancedIncSubtensor1 and
    the dev20 kernel based on compute capability and operand rank; only
    enabled on cuda contexts.  For the generic AdvancedIncSubtensor, the
    GpuAdvancedIncSubtensor op is used unconditionally.
    """
    if isinstance(op, tensor.AdvancedIncSubtensor1):
        context = get_context(context_name)
        # This is disabled on non-cuda contexts
        if context.kind != b'cuda':
            return None

        x, y, ilist = inputs
        set_instead_of_inc = op.set_instead_of_inc
        # NOTE(review): assumes bin_id's second-to-last byte is the major
        # compute capability digit -- confirm against gpuarray context docs.
        compute_capability = int(context.bin_id[-2])

        if compute_capability >= 2 and x.ndim == 1 and y.ndim == 0:
            # The dev20 kernel wants 2d operands: lift to 2d, apply the
            # op, then drop the extra broadcast dimension again.
            x = x.dimshuffle(0, 'x')
            y = y.dimshuffle('x', 'x')
            ret = GpuAdvancedIncSubtensor1_dev20(
                set_instead_of_inc=set_instead_of_inc)(x, y, ilist)
            ret = GpuDimShuffle(ret.type.broadcastable, [0])(ret)
            return ret
        elif compute_capability < 2 or x.ndim != 2 or y.ndim != 2:
            # Fallback op for old devices or non-2d operands.
            return GpuAdvancedIncSubtensor1(
                set_instead_of_inc=set_instead_of_inc)
        else:
            return GpuAdvancedIncSubtensor1_dev20(
                set_instead_of_inc=set_instead_of_inc)
    elif isinstance(op, tensor.AdvancedIncSubtensor):
        # Generic advanced inc_subtensor: works on any gpuarray context.
        return GpuAdvancedIncSubtensor()
@register_inplace() @register_inplace()
......
...@@ -587,6 +587,101 @@ class GpuAdvancedSubtensor(HideC, tensor.AdvancedSubtensor): ...@@ -587,6 +587,101 @@ class GpuAdvancedSubtensor(HideC, tensor.AdvancedSubtensor):
out[0] = o out[0] = o
class GpuAdvancedIncSubtensor(HideC, tensor.AdvancedIncSubtensor):
    """
    Implement AdvancedIncSubtensor on the gpu.

    Computes ``x[idx...] += y`` (or ``= y`` -- see the note on
    ``set_instead_of_inc`` below) for general NumPy-style advanced
    indexing, by looping over the indexed positions with a GPU
    elementwise kernel obtained from ``get_iadd``.
    """

    def make_node(self, x, y, *inputs):
        """
        Build the Apply node.

        Delegates validation/normalization of the indices to the CPU
        parent op, then rebuilds the node with `x` moved to the GPU and a
        GpuArrayType output of the same dtype/broadcastable pattern.
        """
        ctx_name = infer_context_name(x)
        rval = tensor.AdvancedIncSubtensor.make_node(self, x, y, *inputs)
        otype = GpuArrayType(dtype=rval.outputs[0].type.dtype,
                             broadcastable=rval.outputs[0].type.broadcastable,
                             context_name=ctx_name)
        x = as_gpuarray_variable(x, ctx_name)
        return gof.Apply(self, [x] + rval.inputs[1:], [otype()])

    # We can't use the parent version that loops on each index
    # as we also need to loop when set_instead_of_inc is True and the
    # parent doesn't loop in that case.
    def perform(self, node, inp, out_, ctx=None):
        out, = out_
        x = inp[0]
        y = inp[1]
        idx = inp[2:]
        # Copy so the caller's input buffer is never modified in place.
        x = x.copy()
        # detect and transpose array indices
        nidx = []
        nshp = list(x.shape)
        for k, i in enumerate(idx):
            if i is None:
                # np.newaxis: keep a full-slice slot and insert a
                # length-1 dimension at the same position.
                nidx.append(slice(None))
                nshp.insert(k, 1)
            else:
                nidx.append(i)
        x_ = x.reshape(nshp)
        narrays = 0
        transp = list(range(x_.ndim))
        p = 0
        # ap gives the position of the array in case there is only one.
        # if there are more than one (narray > 1) it should be ignored.
        ap = 0
        for k, i in enumerate(list(nidx)):
            if (isinstance(i, np.ndarray) and
                    i.ndim != 0):
                # Move every array-indexed dimension to the front; `p`
                # counts how many have been moved so far.
                transp.remove(k)
                transp.insert(p, k)
                ap += k
                i = nidx.pop(k)
                nidx.insert(p, i)
                p += 1
                narrays += 1
            else:
                if narrays == 0:
                    try:
                        # Scalar (integer-like) index seen before any
                        # array index: it removes a dimension.
                        i.__index__()
                        # We shift back the position of the array by the
                        # number of dimensions that are removed by
                        # indexing. If ap is bigger than 0 it means we
                        # have encountered at least one array.
                        if ap >= 0:
                            ap -= 1
                        # If this index is before the first array then
                        # we will not move the array back to its
                        # position. Mark this by faking that there
                        # are more than two arrays. This is crazy
                        # numpy behaviour so blame them.
                        narrays = 2
                    except Exception:
                        # Not integer-like (e.g. a slice): leave
                        # counters untouched.
                        pass
        x_ = x_.transpose(*transp)
        # Apply the remaining (non-array) indices; the first `p`
        # dimensions are the array-indexed ones and are kept whole.
        idx_ = ([slice(None)] * p + nidx[p:])
        x_ = x_.__getitem__(idx_)
        # flatten the array-indexed dimensions
        shape = ((np.prod(x_.shape[0: p]),) +
                 x_.shape[p:])
        x_flat = x_.reshape(shape)
        # build the strides
        strides = [1]
        for i in range(p - 1, 0, -1):
            stride = x_.shape[i] * strides[0]
            strides.insert(0, stride)
        # build the indices and use it
        # zip truncates to the `p` leading array indices of nidx.
        take_idx = sum((i * s for i, s in zip(nidx, strides)))
        k = get_iadd(node.inputs[0], node.inputs[1])
        y = pygpu.asarray(y, context=x_flat.context)
        # NOTE(review): assumes y's leading dim matches (or broadcasts
        # against) the number of indexed positions -- confirm.  Also
        # appears to always increment; set_instead_of_inc does not seem
        # to be honored here -- verify against the CPU op.
        for j, i in enumerate(take_idx):
            k(x_flat[i], y[j], broadcast=True)
        out[0] = x
class GpuAdvancedIncSubtensor1(Op): class GpuAdvancedIncSubtensor1(Op):
""" """
Implement AdvancedIncSubtensor1 on the gpu. Implement AdvancedIncSubtensor1 on the gpu.
......
...@@ -13,6 +13,7 @@ from ..elemwise import GpuDimShuffle ...@@ -13,6 +13,7 @@ from ..elemwise import GpuDimShuffle
from ..subtensor import (GpuIncSubtensor, GpuSubtensor, from ..subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor1, GpuAdvancedSubtensor1,
GpuAdvancedSubtensor, GpuAdvancedSubtensor,
GpuAdvancedIncSubtensor,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20, GpuAdvancedIncSubtensor1_dev20,
GpuExtractDiag, GpuExtractDiag,
...@@ -76,6 +77,27 @@ class G_subtensorF16(test_subtensor.T_subtensor): ...@@ -76,6 +77,27 @@ class G_subtensorF16(test_subtensor.T_subtensor):
assert self.sub == GpuSubtensor assert self.sub == GpuSubtensor
def test_advinc_subtensor():
    # Increment a shared (3, 3, 3) tensor at array-indexed positions and
    # verify both that the optimizer introduced exactly one
    # GpuAdvancedIncSubtensor node and that the numeric result matches
    # the equivalent NumPy in-place update.
    shape = (3, 3, 3)
    x_data = np.arange(np.prod(shape), dtype='float32').reshape(shape) + 1
    y_data = np.arange(np.prod(shape[1:]), dtype='float32').reshape(shape[1:])
    indices = ([0, 1, 2], [0, 1, 2])
    x_shared = gpuarray_shared_constructor(x_data, name='x')
    y_var = tensor.tensor(dtype='float32',
                          broadcastable=(False, False),
                          name='y')
    graph = tensor.advanced_inc_subtensor(x_shared, y_var, *indices)
    fn = theano.function([y_var], graph, mode=mode_with_gpu)
    n_gpu_ops = sum(isinstance(node.op, GpuAdvancedIncSubtensor)
                    for node in fn.maker.fgraph.toposort())
    assert n_gpu_ops == 1
    result = fn(y_data)
    expected = x_data.copy()
    expected[indices] += y_data
    assert np.allclose(result, expected)
def test_advinc_subtensor1(): def test_advinc_subtensor1():
# Test the second case in the opt local_gpu_advanced_incsubtensor1 # Test the second case in the opt local_gpu_advanced_incsubtensor1
for shp in [(3, 3), (3, 3, 3)]: for shp in [(3, 3), (3, 3, 3)]:
...@@ -199,6 +221,7 @@ class G_advancedsubtensor(test_subtensor.TestAdvancedSubtensor): ...@@ -199,6 +221,7 @@ class G_advancedsubtensor(test_subtensor.TestAdvancedSubtensor):
self, name, self, name,
shared=gpuarray_shared_constructor, shared=gpuarray_shared_constructor,
sub=GpuAdvancedSubtensor, sub=GpuAdvancedSubtensor,
inc_sub=GpuAdvancedIncSubtensor,
mode=mode_with_gpu, mode=mode_with_gpu,
# avoid errors with limited devices # avoid errors with limited devices
dtype='float32', # floatX? dtype='float32', # floatX?
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论