Commit b7b5dd39, authored by Frédéric Bastien, committed via GitHub

Merge pull request #6220 from lamblin/fix_gpuadvidx

Updates in GPU indexing
@@ -1106,9 +1106,11 @@ def local_gpua_advanced_incsubtensor1(op, context_name, inputs, outputs):
                                          set_instead_of_inc=set_instead_of_inc)
 
 
-@register_opt('fast_compile')
-@op_lifter([tensor.AdvancedIncSubtensor])
-@register_opt2([tensor.AdvancedIncSubtensor], 'fast_compile')
+# Do not register this optimization for now, as it slows down the
+# execution by a lot in important cases.
+# @register_opt('fast_compile')
+# @op_lifter([tensor.AdvancedIncSubtensor])
+# @register_opt2([tensor.AdvancedIncSubtensor], 'fast_compile')
 def local_gpua_advanced_incsubtensor(op, context_name, inputs, outputs):
     if not op.set_instead_of_inc:
         return GpuAdvancedIncSubtensor()
...
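This first hunk only comments out the registration, so the graph optimizer no longer lifts AdvancedIncSubtensor to the GPU automatically under fast_compile. To check which op a compiled graph actually ends up with, the usual Theano idiom is to toposort the function's graph; a minimal sketch (the variable names are illustrative):

import theano
import theano.tensor as tensor

x = tensor.matrix('x')
y = tensor.vector('y')
# Repeated indices require the general AdvancedIncSubtensor op,
# the one whose GPU lifting this commit disables.
z = tensor.inc_subtensor(x[[0, 0], [1, 1]], y)

f = theano.function([x, y], z)
# List the ops in the optimized graph: with the lifter unregistered,
# AdvancedIncSubtensor should appear instead of GpuAdvancedIncSubtensor.
print([type(node.op).__name__ for node in f.maker.fgraph.toposort()])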
@@ -621,6 +621,10 @@ class GpuAdvancedIncSubtensor(HideC, tensor.AdvancedIncSubtensor):
         y = inp[1]
         idx = inp[2:]
         x = x.copy()
+        # Get a handle to the GpuElemwise object that will be called.
+        # It is not necessary to have the right number of dimensions,
+        # so we just pass symbolic x and y.
+        iadd = get_iadd(node.inputs[0], node.inputs[1])
 
         # convert all indices to np.array
         for i in range(len(idx)):
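For context, get_iadd builds a pygpu GpuElemwise kernel computing a = a + b, and it is now fetched once before the update loops instead of per element. A rough sketch of such a helper is below; the pygpu argument-construction details (arg, convert_f16) are written from memory and should be treated as assumptions, not as the exact Theano source:

from pygpu.elemwise import GpuElemwise, arg


def get_iadd(a, b):
    # Compile an in-place "a += b" kernel once: `a` is read and written,
    # `b` is only read. convert_f16 allows float16 data to be accumulated
    # in float32. (Sketch; exact signatures assumed.)
    return GpuElemwise(a.type.context, "a = a + b",
                       [arg('a', a.type.dtype, read=True, write=True),
                        arg('b', b.type.dtype, read=True)],
                       convert_f16=True)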
@@ -699,15 +703,10 @@ class GpuAdvancedIncSubtensor(HideC, tensor.AdvancedIncSubtensor):
                 else:
                     val = y_flat[j]
-                tmp = pygpu.elemwise.elemwise2(
-                    x_flat[i], '+', val, x_flat[i],
-                    broadcast=True,
-                    convert_f16=True
-                )
-                x_flat.__setitem__(i, tmp)
+                iadd(x_flat[i], val, broadcast=True)
         else:
-            k = get_iadd(node.inputs[0], node.inputs[1])
-            if x_flat.shape[-len(y_flat.shape):] == y_flat.shape or y_flat.shape == ():
+            if (x_flat.shape[-len(y_flat.shape):] == y_flat.shape or
+                    y_flat.shape == ()):
                 # y_flat has to be broadcast over axes of x_flat[i]
                 for i in take_idx.flatten():
@@ -715,13 +714,7 @@ class GpuAdvancedIncSubtensor(HideC, tensor.AdvancedIncSubtensor):
                         x_flat_sub = x_flat[i].__getitem__(index)
                     else:
                         x_flat_sub = x_flat[i]
-                    tmp = pygpu.elemwise.elemwise2(
-                        x_flat_sub, '+', y_flat, x_flat_sub,
-                        broadcast=True,
-                        convert_f16=True
-                    )
-                    x_flat[i].__setitem__(index, tmp)
+                    iadd(x_flat_sub, y_flat, broadcast=True)
             else:
                 # y_flat's first axis corresponds to the first axis of x_flat
                 for j, i in enumerate(take_idx.flatten()):
@@ -729,7 +722,7 @@ class GpuAdvancedIncSubtensor(HideC, tensor.AdvancedIncSubtensor):
                         x_flat_sub = x_flat[i].__getitem__(index)
                     else:
                         x_flat_sub = x_flat[i]
-                    k(x_flat_sub, y_flat[j % y_flat.shape[0]], broadcast=True)
+                    iadd(x_flat_sub, y_flat[j % y_flat.shape[0]], broadcast=True)
         x_ = x_flat.reshape(x_.shape).transpose(*rtransp)
         out[0] = x_
...
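The loops above implement the unbuffered, accumulating semantics of inc_subtensor under advanced indexing, which is exactly where repeated indices matter. A CPU reference in plain NumPy shows the behaviour the GPU op has to match:

import numpy as np

idx = np.array([0, 1, 1, 3])
y = np.array([1., 2., 3., 4.])

# Buffered update: the write to the repeated index 1 happens once,
# so only the last value survives.
x1 = np.zeros(5)
x1[idx] += y           # x1 == [1., 3., 0., 4., 0.]

# Unbuffered update, the semantics of (Gpu)AdvancedIncSubtensor:
# both contributions to index 1 accumulate.
x2 = np.zeros(5)
np.add.at(x2, idx, y)  # x2 == [1., 5., 0., 4., 0.]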
@@ -340,8 +340,8 @@ class test_gpuallocdiag(unittest.TestCase):
         grad_x = tensor.grad(sum_mtx_x, x)
         grad_mtx_x = tensor.grad(sum_mtx_x, mtx_x)
 
-        fn_grad_x = theano.function([x], grad_x)
-        fn_grad_mtx_x = theano.function([x], grad_mtx_x)
+        fn_grad_x = theano.function([x], grad_x, mode=mode_with_gpu)
+        fn_grad_mtx_x = theano.function([x], grad_mtx_x, mode=mode_with_gpu)
 
         computed_grad_x = fn_grad_x(np_x)
         computed_grad_mtx_x = fn_grad_mtx_x(np_x)
@@ -354,8 +354,8 @@ class test_gpuallocdiag(unittest.TestCase):
         grad_x = tensor.grad(sum_mtx_x, x)
         grad_mtx_x = tensor.grad(sum_mtx_x, mtx_x)
 
-        fn_grad_x = theano.function([x], grad_x)
-        fn_grad_mtx_x = theano.function([x], grad_mtx_x)
+        fn_grad_x = theano.function([x], grad_x, mode=mode_with_gpu)
+        fn_grad_mtx_x = theano.function([x], grad_mtx_x, mode=mode_with_gpu)
 
         computed_grad_x = fn_grad_x(np_x)
         computed_grad_mtx_x = fn_grad_mtx_x(np_x)
@@ -368,8 +368,8 @@ class test_gpuallocdiag(unittest.TestCase):
         grad_x = tensor.grad(sum_mtx_x, x)
         grad_mtx_x = tensor.grad(sum_mtx_x, mtx_x)
 
-        fn_grad_x = theano.function([x], grad_x)
-        fn_grad_mtx_x = theano.function([x], grad_mtx_x)
+        fn_grad_x = theano.function([x], grad_x, mode=mode_with_gpu)
+        fn_grad_mtx_x = theano.function([x], grad_mtx_x, mode=mode_with_gpu)
 
         computed_grad_x = fn_grad_x(np_x)
         computed_grad_mtx_x = fn_grad_mtx_x(np_x)
...
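These tests previously compiled with the default mode, so on a CPU-default configuration they never exercised the GPU path they were meant to cover; passing mode=mode_with_gpu makes the intent explicit. In Theano's gpuarray test suite that mode is constructed roughly as follows (the exact optimizer tags are an assumption, not a quote of the test config):

import theano

# Sketch: enable the gpuarray optimizations on top of the default mode.
# The precise including/excluding tags used by the gpuarray test config
# are assumed here.
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray')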
@@ -396,7 +396,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
         s1 = s[newaxis]
         assert s1.broadcastable == (True,), s1
 
-        vs1, vn3, vn4 = theano.function([s], [s1, n3, n4])(-2.0)
+        vs1, vn3, vn4 = theano.function([s], [s1, n3, n4], mode=self.mode)(-2.0)
         assert np.all(vs1 == [-2.0])
         assert np.all(vn3 ==
@@ -962,12 +962,14 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
         utt.verify_grad(
             inc_slice(slice(1, 2, None), slice(None, None, None)),
             (np.asarray([[0, 1], [2, 3], [4, 5.]]),
-             np.asarray([[9, 9.]]),))
+             np.asarray([[9, 9.]]),),
+            mode=self.mode)
 
         # single element
         utt.verify_grad(
             inc_slice(2, 1),
-            (np.asarray([[0, 1], [2, 3], [4, 5.]]), np.asarray(9.),))
+            (np.asarray([[0, 1], [2, 3], [4, 5.]]), np.asarray(9.),),
+            mode=self.mode)
 
     def test_inc_and_set_subtensor(self):
         """
@@ -1142,7 +1144,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
         m1 = set_subtensor(m[:, i], 0)
         m2 = inc_subtensor(m[:, i], 1)
 
-        f = theano.function([m, i], [m1, m2])
+        f = theano.function([m, i], [m1, m2], mode=self.mode)
         m_val = rand(3, 5)
         i_val = randint_ranged(min=0, max=4, shape=(4,))
@@ -1167,7 +1169,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
         m1 = set_subtensor(m[:, i], 0)
         m2 = inc_subtensor(m[:, i], 1)
 
-        f = theano.function([m, i], [m1, m2])
+        f = theano.function([m, i], [m1, m2], mode=self.mode)
         m_val = rand(5, 7)
         i_val = randint_ranged(min=0, max=6, shape=(4, 2))
@@ -1202,7 +1204,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
         sub_m = m[:, i]
         m1 = set_subtensor(sub_m, np.zeros(shp_v))
         m2 = inc_subtensor(sub_m, np.ones(shp_v))
-        f = theano.function([m, i], [m1, m2])
+        f = theano.function([m, i], [m1, m2], mode=self.mode)
         m_val = rand(3, 5)
         i_val = randint_ranged(min=0, max=4, shape=shp_i)
@@ -1239,7 +1241,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
         sub_m = m[:, i]
         m1 = set_subtensor(sub_m, np.zeros(shp_v))
         m2 = inc_subtensor(sub_m, np.ones(shp_v))
-        f = theano.function([m, i], [m1, m2])
+        f = theano.function([m, i], [m1, m2], mode=self.mode)
         m_val = rand(3, 5)
         i_val = randint_ranged(min=0, max=4, shape=shp_i)
@@ -1261,7 +1263,9 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
     def test_take(self):
         a = tensor.matrix()
-        f = theano.function([a], a.take(0, axis=-1), allow_input_downcast=True)
+        f = theano.function(
+            [a], a.take(0, axis=-1),
+            allow_input_downcast=True, mode=self.mode)
         f(np.random.normal(0, 1, (30, 4)))
...
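Threading mode=self.mode through every theano.function call serves the same purpose at the class level: the CPU tests define a default mode, and a GPU suite can reuse the entire test class by overriding just that attribute. A hypothetical sketch of the pattern (the subclass name and attribute layout are illustrative, not the actual Theano test code):

# Reuse every test in T_subtensor, but compile on the GPU: each
# theano.function(..., mode=self.mode) call now picks up mode_with_gpu.
class T_subtensor_gpu(T_subtensor):
    mode = mode_with_gpu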