Commit 280a5f4f authored by Pascal Lamblin, committed by GitHub

Merge pull request #5358 from nouiz/adv_incsub1

Use GpuAdvancedIncSubtensor1_dev20 more frequently in the new back-end
...@@ -1036,9 +1036,13 @@ def local_gpua_advanced_incsubtensor(op, context_name, inputs, outputs): ...@@ -1036,9 +1036,13 @@ def local_gpua_advanced_incsubtensor(op, context_name, inputs, outputs):
set_instead_of_inc = op.set_instead_of_inc set_instead_of_inc = op.set_instead_of_inc
compute_capability = int(context.bin_id[-2]) compute_capability = int(context.bin_id[-2])
if compute_capability >= 2 and x.ndim == 1 and y.ndim == 0:
if (compute_capability < 2 or x.ndim != 2 or y.ndim != 2 or x = x.dimshuffle(0, 'x')
x.type.dtype != y.type.dtype): y = y.dimshuffle('x', 'x')
ret = GpuAdvancedIncSubtensor1_dev20(
set_instead_of_inc=set_instead_of_inc)(x, y, ilist).dimshuffle(0)
return ret
elif compute_capability < 2 or x.ndim != 2 or y.ndim != 2:
return GpuAdvancedIncSubtensor1( return GpuAdvancedIncSubtensor1(
set_instead_of_inc=set_instead_of_inc) set_instead_of_inc=set_instead_of_inc)
else: else:
......
...@@ -803,7 +803,6 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC, ...@@ -803,7 +803,6 @@ class GpuAdvancedIncSubtensor1_dev20(GpuKernelBase, HideC,
y_ = as_gpuarray_variable(y, ctx_name) y_ = as_gpuarray_variable(y, ctx_name)
ilist_ = as_gpuarray_variable(ilist, ctx_name) ilist_ = as_gpuarray_variable(ilist, ctx_name)
assert x_.type.dtype == y_.type.dtype
assert x_.type.ndim >= y_.type.ndim assert x_.type.ndim >= y_.type.ndim
if ilist_.type.dtype not in tensor.integer_dtypes: if ilist_.type.dtype not in tensor.integer_dtypes:
......
...@@ -13,6 +13,7 @@ from ..subtensor import (GpuIncSubtensor, GpuSubtensor, ...@@ -13,6 +13,7 @@ from ..subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor1, GpuAdvancedSubtensor1,
GpuAdvancedSubtensor, GpuAdvancedSubtensor,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1,
GpuAdvancedIncSubtensor1_dev20,
GpuDiagonal) GpuDiagonal)
from ..type import gpuarray_shared_constructor from ..type import gpuarray_shared_constructor
...@@ -63,6 +64,49 @@ def test_advinc_subtensor1(): ...@@ -63,6 +64,49 @@ def test_advinc_subtensor1():
assert numpy.allclose(rval, rep) assert numpy.allclose(rval, rep)
def test_advinc_subtensor1_dtype():
    """Mixed-dtype advanced_inc_subtensor1 on a matrix.

    For each (x, y) dtype pair, checks that the compiled graph contains
    exactly one GpuAdvancedIncSubtensor1_dev20 node and that its output
    matches a plain NumPy fancy-indexed increment.
    """
    shape = (3, 4)
    for x_dtype, y_dtype in [('float32', 'int8'), ('float32', 'float64')]:
        x_data = numpy.arange(numpy.prod(shape), dtype=x_dtype).reshape(shape) + 1
        # Two rows of y, all elements equal to 10, in the second dtype.
        y_data = numpy.full((2,) + shape[1:], 10, dtype=y_dtype)
        x = gpuarray_shared_constructor(x_data, name='x')
        y = tensor.tensor(dtype=y_data.dtype,
                          broadcastable=(False,) * y_data.ndim,
                          name='y')
        expr = tensor.advanced_inc_subtensor1(x, y, [0, 2])
        f = theano.function([y], expr, mode=mode_with_gpu)
        # The dev20 GPU op must be selected exactly once by the optimizer.
        dev20_nodes = [node for node in f.maker.fgraph.toposort()
                       if isinstance(node.op, GpuAdvancedIncSubtensor1_dev20)]
        assert len(dev20_nodes) == 1
        result = f(y_data)
        expected = x_data.copy()
        expected[[0, 2]] += y_data
        assert numpy.allclose(result, expected)
def test_advinc_subtensor1_vector_scalar():
    """Advanced_inc_subtensor1 with a vector x and a scalar y.

    For each (x, y) dtype pair, checks that the compiled graph contains
    exactly one GpuAdvancedIncSubtensor1_dev20 node and that the result
    matches a plain NumPy fancy-indexed increment.
    """
    shape = (3,)
    for x_dtype, y_dtype in [('float32', 'int8'), ('float32', 'float64')]:
        x_data = numpy.arange(numpy.prod(shape), dtype=x_dtype).reshape(shape) + 1
        # 0-d increment value in the second dtype.
        y_data = numpy.asarray(10, dtype=y_dtype)
        x = gpuarray_shared_constructor(x_data, name='x')
        y = tensor.tensor(dtype=y_data.dtype,
                          broadcastable=(False,) * y_data.ndim,
                          name='y')
        expr = tensor.advanced_inc_subtensor1(x, y, [0, 2])
        f = theano.function([y], expr, mode=mode_with_gpu)
        # The dev20 GPU op must be selected exactly once by the optimizer.
        dev20_nodes = [node for node in f.maker.fgraph.toposort()
                       if isinstance(node.op, GpuAdvancedIncSubtensor1_dev20)]
        assert len(dev20_nodes) == 1
        result = f(y_data)
        expected = x_data.copy()
        expected[[0, 2]] += y_data
        assert numpy.allclose(result, expected)
def test_incsub_f16(): def test_incsub_f16():
shp = (3, 3) shp = (3, 3)
shared = gpuarray_shared_constructor shared = gpuarray_shared_constructor
......
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment