Commit 4f3a52a5 authored by abergeron, committed by GitHub

Merge pull request #5881 from shawntan/issue-930

Implementing `GpuAdvancedIncSubtensor`
...@@ -951,6 +951,7 @@ class GpuAlloc(HideC, Alloc): ...@@ -951,6 +951,7 @@ class GpuAlloc(HideC, Alloc):
(subtensor.GpuIncSubtensor, (subtensor.GpuIncSubtensor,
subtensor.GpuAdvancedIncSubtensor1, subtensor.GpuAdvancedIncSubtensor1,
subtensor.GpuAdvancedIncSubtensor1_dev20, subtensor.GpuAdvancedIncSubtensor1_dev20,
subtensor.GpuAdvancedIncSubtensor,
blas.GpuGemm, blas.GpuGemv, blas.GpuGemm, blas.GpuGemv,
blas.GpuGer) blas.GpuGer)
)): )):
......
...@@ -68,6 +68,7 @@ from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda, ...@@ -68,6 +68,7 @@ from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda,
from .subtensor import (GpuIncSubtensor, GpuSubtensor, from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor, GpuAdvancedSubtensor,
GpuAdvancedSubtensor1, GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20) GpuAdvancedIncSubtensor1_dev20)
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
...@@ -1066,7 +1067,7 @@ def local_gpua_advanced_subtensor(op, context_name, inputs, outputs): ...@@ -1066,7 +1067,7 @@ def local_gpua_advanced_subtensor(op, context_name, inputs, outputs):
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.AdvancedIncSubtensor1]) @op_lifter([tensor.AdvancedIncSubtensor1])
@register_opt2([tensor.AdvancedIncSubtensor1], 'fast_compile') @register_opt2([tensor.AdvancedIncSubtensor1], 'fast_compile')
def local_gpua_advanced_incsubtensor(op, context_name, inputs, outputs): def local_gpua_advanced_incsubtensor1(op, context_name, inputs, outputs):
context = get_context(context_name) context = get_context(context_name)
# This is disabled on non-cuda contexts # This is disabled on non-cuda contexts
if context.kind != b'cuda': if context.kind != b'cuda':
...@@ -1094,6 +1095,16 @@ def local_gpua_advanced_incsubtensor(op, context_name, inputs, outputs): ...@@ -1094,6 +1095,16 @@ def local_gpua_advanced_incsubtensor(op, context_name, inputs, outputs):
set_instead_of_inc=set_instead_of_inc) set_instead_of_inc=set_instead_of_inc)
@register_opt('fast_compile')
@op_lifter([tensor.AdvancedIncSubtensor])
@register_opt2([tensor.AdvancedIncSubtensor], 'fast_compile')
def local_gpua_advanced_incsubtensor(op, context_name, inputs, outputs):
    """Lift ``AdvancedIncSubtensor`` to its GPU counterpart.

    Only the increment variant is handled; when the op sets instead of
    incrementing, refuse the transfer so the CPU op is kept.
    """
    if op.set_instead_of_inc:
        # "set" semantics are not handled by GpuAdvancedIncSubtensor.
        return False
    return GpuAdvancedIncSubtensor()
@register_inplace() @register_inplace()
@local_optimizer([GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20]) @local_optimizer([GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20])
def local_advincsub1_gpua_inplace(node): def local_advincsub1_gpua_inplace(node):
......
...@@ -589,6 +589,140 @@ class GpuAdvancedSubtensor(HideC, tensor.AdvancedSubtensor): ...@@ -589,6 +589,140 @@ class GpuAdvancedSubtensor(HideC, tensor.AdvancedSubtensor):
out[0] = o out[0] = o
class GpuAdvancedIncSubtensor(HideC, tensor.AdvancedIncSubtensor):
    """
    Implement AdvancedIncSubtensor on the gpu.

    The fancy-index bookkeeping (axis shuffling, stride computation) is
    done on the host with NumPy; the actual additions run on the GPU
    through ``pygpu`` elemwise kernels.
    """

    def make_node(self, x, y, *inputs):
        # Delegate to the CPU op's make_node for type/broadcast
        # inference, then rebuild the Apply with GPU-typed x, y and
        # output.  The index inputs (rval.inputs[2:]) are left as-is.
        ctx_name = infer_context_name(x, y)
        rval = tensor.AdvancedIncSubtensor.make_node(self, x, y, *inputs)
        otype = GpuArrayType(dtype=rval.outputs[0].type.dtype,
                             broadcastable=rval.outputs[0].type.broadcastable,
                             context_name=ctx_name)
        x = as_gpuarray_variable(x, ctx_name)
        y = as_gpuarray_variable(y, ctx_name)
        return gof.Apply(self, [x, y] + rval.inputs[2:], [otype()])

    def perform(self, node, inp, out_):
        # Computes x[idx] += y out-of-place: x is copied first, the
        # copy is incremented with '+' elemwise kernels, and the result
        # is stored in out[0].
        out, = out_
        x = inp[0]
        y = inp[1]
        idx = inp[2:]
        # Work on a copy so the input storage is never modified.
        x = x.copy()
        # convert all indices to np.array
        for i in range(len(idx)):
            if isinstance(idx[i], gpuarray.GpuArray):
                idx[i] = np.asarray(idx[i])
        # Insert axes for None indexing
        nidx = []
        nshp = list(x.shape)
        for k, i in enumerate(idx):
            if i is None:
                # None (newaxis) becomes a full slice over a length-1
                # axis inserted into the shape.
                nidx.append(slice(None))
                nshp.insert(k, 1)
            else:
                nidx.append(i)
        x_ = x.reshape(nshp)
        # Bring array indices to front
        transp = []
        nidx_ = []
        p = 0
        # First pass collects the array-indexed axes ...
        for k, i in enumerate(list(nidx)):
            if isinstance(i, np.ndarray) and i.ndim != 0:
                transp.append(k)
                nidx_.append(i)
                p += 1
        # ... second pass appends everything else (slices, scalars).
        for k, i in enumerate(list(nidx)):
            if not (isinstance(i, np.ndarray) and i.ndim != 0):
                transp.append(k)
                nidx_.append(i)
        transp = transp + list(range(len(transp), x_.ndim))
        # rtransp is the inverse permutation, used to undo the
        # transposition at the end.
        rtransp = [i for i, _ in sorted(enumerate(transp), key=lambda x:x[1])]
        nidx = nidx_
        # transp: order to shuffle axes of x so that single dimension
        # subarrays are extracted first
        # p: number of axes with array indexing
        x_ = x_.transpose(*transp)
        idx_ = ([slice(None)] * p + nidx[p:])
        # flatten the array-indexed dimensions
        x_flat = x_.reshape((np.prod(x_.shape[0: p]),) + x_.shape[p:])
        # process y so that last axes are the same
        if y.shape != (1,):
            # Keep the longest suffix of y's shape that matches or
            # broadcasts against x_flat's trailing axes; everything in
            # front of it is collapsed into one leading axis.
            y_shape_reverse = []
            for x_s, y_s in zip(x_flat.shape[::-1], y.shape[::-1]):
                if x_s == y_s or y_s == 1:
                    y_shape_reverse.append(y_s)
                else:
                    break
            if np.prod(y_shape_reverse) < np.prod(y.shape):
                if len(y_shape_reverse) > 0:
                    y_shape_reverse.append(
                        int(np.prod(y.shape[0:-len(y_shape_reverse)])))
                else:
                    y_shape_reverse.append(int(np.prod(y.shape)))
            y_shape = y_shape_reverse[::-1]
            y_flat = y.reshape(y_shape)
        else:
            y_flat = y[0]
        # build the strides
        # C-order strides over the p fronted (array-indexed) axes:
        # strides[-1] == 1, strides[k] == prod(x_.shape[k+1:p]).
        strides = [1]
        for i in range(p - 1, 0, -1):
            stride = x_.shape[i] * strides[0]
            strides.insert(0, stride)
        # build the indices and use it
        index = idx_[p:] + [slice(None)] * (len(x_flat.shape) - len(idx_[p:]) - 1)
        # take_idx: the flat positions along x_flat's first axis that
        # the array indices select (zip truncates strides to p terms).
        take_idx = sum(i * s for i, s in zip(nidx, strides))
        if index == []:
            # No trailing slice/scalar indices: add y element-wise into
            # each selected subarray.
            for j, i in enumerate(take_idx.flatten()):
                if y_flat.shape == ():
                    val = y_flat
                else:
                    val = y_flat[j]
                tmp = pygpu.elemwise.elemwise2(
                    x_flat[i], '+', val, x_flat[i],
                    broadcast=True,
                    convert_f16=True
                )
                x_flat.__setitem__(i, tmp)
        else:
            # Cached in-place-add kernel built from the input dtypes.
            k = get_iadd(node.inputs[0], node.inputs[1])
            if x_flat.shape[-len(y_flat.shape):] == y_flat.shape or y_flat.shape == ():
                # y_flat has to be broadcast over axes of x_flat[i]
                for i in take_idx.flatten():
                    if len(idx_[p:]) > 0:
                        x_flat_sub = x_flat[i].__getitem__(index)
                    else:
                        x_flat_sub = x_flat[i]
                    tmp = pygpu.elemwise.elemwise2(
                        x_flat_sub, '+', y_flat, x_flat_sub,
                        broadcast=True,
                        convert_f16=True
                    )
                    x_flat[i].__setitem__(index, tmp)
            else:
                # y_flat's first axis corresponds to the flattened
                # array-index positions of x_flat (wrapped with % when
                # shorter).
                for j, i in enumerate(take_idx.flatten()):
                    if len(idx_[p:]) > 0:
                        x_flat_sub = x_flat[i].__getitem__(index)
                    else:
                        x_flat_sub = x_flat[i]
                    k(x_flat_sub, y_flat[j % y_flat.shape[0]], broadcast=True)
        # Undo the flattening and the axis shuffle before returning.
        x_ = x_flat.reshape(x_.shape).transpose(*rtransp)
        out[0] = x_
class GpuAdvancedIncSubtensor1(Op): class GpuAdvancedIncSubtensor1(Op):
""" """
Implement AdvancedIncSubtensor1 on the gpu. Implement AdvancedIncSubtensor1 on the gpu.
......
...@@ -13,6 +13,7 @@ from ..elemwise import GpuDimShuffle ...@@ -13,6 +13,7 @@ from ..elemwise import GpuDimShuffle
from ..subtensor import (GpuIncSubtensor, GpuSubtensor, from ..subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor1, GpuAdvancedSubtensor1,
GpuAdvancedSubtensor, GpuAdvancedSubtensor,
GpuAdvancedIncSubtensor,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20, GpuAdvancedIncSubtensor1_dev20,
GpuExtractDiag, GpuExtractDiag,
...@@ -159,6 +160,7 @@ def test_advinc_subtensor1_vector_scalar(): ...@@ -159,6 +160,7 @@ def test_advinc_subtensor1_vector_scalar():
name='y') name='y')
expr = tensor.advanced_inc_subtensor1(x, y, [0, 2]) expr = tensor.advanced_inc_subtensor1(x, y, [0, 2])
f = theano.function([y], expr, mode=mode_with_gpu) f = theano.function([y], expr, mode=mode_with_gpu)
assert sum([isinstance(node.op, (GpuAdvancedIncSubtensor1_dev20, assert sum([isinstance(node.op, (GpuAdvancedIncSubtensor1_dev20,
GpuAdvancedIncSubtensor1)) GpuAdvancedIncSubtensor1))
for node in f.maker.fgraph.toposort()]) == 1 for node in f.maker.fgraph.toposort()]) == 1
...@@ -222,6 +224,7 @@ class G_advancedsubtensor(test_subtensor.TestAdvancedSubtensor): ...@@ -222,6 +224,7 @@ class G_advancedsubtensor(test_subtensor.TestAdvancedSubtensor):
self, name, self, name,
shared=gpuarray_shared_constructor, shared=gpuarray_shared_constructor,
sub=GpuAdvancedSubtensor, sub=GpuAdvancedSubtensor,
inc_sub=GpuAdvancedIncSubtensor,
mode=mode_with_gpu, mode=mode_with_gpu,
# avoid errors with limited devices # avoid errors with limited devices
dtype='float32', # floatX? dtype='float32', # floatX?
......
...@@ -2482,16 +2482,12 @@ class TestAlloc(unittest.TestCase): ...@@ -2482,16 +2482,12 @@ class TestAlloc(unittest.TestCase):
grad_derp = theano.grad(derp, some_vector) grad_derp = theano.grad(derp, some_vector)
fgrad = theano.function([some_vector], grad_derp, fgrad = theano.function([some_vector], grad_derp,
mode=self.mode) mode=self.mode)
topo_obj = fobj.maker.fgraph.toposort() topo_obj = fobj.maker.fgraph.toposort()
# <= is needed as the GPU currently don't implement
# AdvancedIncSubtensor. When this is the case it can be
# replaced with ==.
assert np.sum([isinstance(node.op, type(alloc_)) assert np.sum([isinstance(node.op, type(alloc_))
for node in topo_obj]) <= 1 for node in topo_obj]) == 0
topo_grad = fgrad.maker.fgraph.toposort()
# print subtensor topo_grad = fgrad.maker.fgraph.toposort()
# theano.printing.debugprint(fgrad)
assert np.sum([isinstance(node.op, type(alloc_)) assert np.sum([isinstance(node.op, type(alloc_))
for node in topo_grad]) == n_alloc, ( for node in topo_grad]) == n_alloc, (
alloc_, subtensor, n_alloc, topo_grad) alloc_, subtensor, n_alloc, topo_grad)
......
...@@ -1386,6 +1386,44 @@ class TestAdvancedSubtensor(unittest.TestCase): ...@@ -1386,6 +1386,44 @@ class TestAdvancedSubtensor(unittest.TestCase):
self.ix2 = lmatrix() self.ix2 = lmatrix()
self.ixr = lrow() self.ixr = lrow()
def test_advinc_subtensor(self):
x_shp = (20, 15, 10, 5)
def check(idx, y_val, x_val, true):
x = self.shared(x_val, name='x')
y = tensor.tensor(dtype='float32',
broadcastable=(False,) * len(y_val.shape),
name='y')
sym_idx = [tensor.as_tensor_variable(ix) for ix in idx]
expr = tensor.advanced_inc_subtensor(x, y, *sym_idx)
f = theano.function([y], expr, mode=self.mode)
rval = f(y_val)
assert np.allclose(rval, true)
idxs_y_shp_pairs = [
((0, [1, 3, 5], 1), (3, 5)),
(([1, 2, 4, 8],), (4, 15, 10, 5)),
(([0, 1, 2], 0, [0, 1, 2]), (3, 3, 5)),
(([[0, 1], [2, 3]], [[0, 1], [2, 3]]), (2, 2, 10, 5)),
]
for idx, y_shps in idxs_y_shp_pairs:
for i in range(len(y_shps) - 1):
y_shp = y_shps[i:]
x_val = np.arange(np.prod(x_shp), dtype='float32').reshape(x_shp) + 1
y_val = np.arange(np.prod(y_shp), dtype='float32').reshape(y_shp) + 1
rep = x_val.copy()
try:
rep[idx] += y_val
except ValueError:
continue
check(idx, y_val, x_val, rep)
x_val = np.arange(np.prod(x_shp), dtype='float32').reshape(x_shp) + 1
y_val = np.array(1).astype(np.float32)
rep = x_val.copy()
rep[idx] += y_val
check(idx, y_val, x_val, rep)
def eval_output_and_check(self, t): def eval_output_and_check(self, t):
f = inplace_func([], t, mode=self.mode) f = inplace_func([], t, mode=self.mode)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论