提交 21c11dc1 authored 作者: Pierre Luc Carrier's avatar Pierre Luc Carrier

Corrected check for whether cuda is active or not and compile only one addition…

Corrected check for whether cuda is active or not and compile only one addition kernel instead of many in the perform() method.
上级 83eeda3a
import copy
import theano
import numpy
try:
import pygpu
except ImportError:
pass
from theano import tensor, scalar, gof
from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB,
......@@ -279,6 +285,11 @@ def local_gpua_incsubtensor(node):
@register_opt()
@op_lifter([tensor.AdvancedIncSubtensor1])
def local_gpua_advanced_incsubtensor(node):
# This optimization is disabled if cuda is not active
if pygpu.get_default_context().kind != "cuda":
return None
x, y = node.inputs[0:2]
coords = node.inputs[2:]
set_instead_of_inc = node.op.set_instead_of_inc
......
......@@ -359,7 +359,7 @@ class GpuIncSubtensor(IncSubtensor):
return parent_version + elemwise_version + (0,)
class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, Op):
class GpuAdvancedIncSubtensor1(HideC, tensor.AdvancedIncSubtensor1):
"""
Implement AdvancedIncSubtensor1 on the gpu.
"""
......@@ -383,8 +383,15 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, Op):
return gof.Apply(self, [x_, y_, ilist_], [x_.type()])
# CudaNdarray_Subscript() doesn't support Advanced slicing.
# But we can't use the parent version that loops on each index
def getInplElemwiseAdditionKernel(self, a, b):
    """Compile a pygpu elementwise kernel performing in-place addition.

    Builds an ``ElemwiseKernel`` on ``a``'s context whose operation is
    ``a[i] = a[i] + <expr(b)>``, i.e. it accumulates ``b`` into ``a``
    in place when called.

    :param a: GPU array updated in place (also provides the context).
    :param b: GPU array whose values are added to ``a``.
        NOTE(review): presumably broadcast behavior is decided at call
        time via the kernel's ``broadcast`` argument — confirm against
        pygpu's ElemwiseKernel docs.
    :return: a compiled ``pygpu.elemwise.ElemwiseKernel`` instance.
    """
    # Wrap both operands as kernel arguments; the string names 'a'/'b'
    # become the identifiers visible inside the kernel source.
    lhs = pygpu.tools.as_argument(a, 'a')
    rhs = pygpu.tools.as_argument(b, 'b')
    # rhs.expr() yields the per-element access expression for 'b'.
    operation = "a[i] = a[i] + " + rhs.expr()
    return pygpu.elemwise.ElemwiseKernel(a.context, [lhs, rhs], operation)
# We can't use the parent version that loops on each index
# as we also need to loop when set_instead_of_inc is True and the
# parent doesn't loop in that case.
def perform(self, node, inp, out_):
......@@ -413,18 +420,26 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, Op):
# jointly on `x` and `y`. Otherwise, it means `y` should be
# broadcasted to fill all relevant rows of `x`.
assert y.ndim <= x.ndim # Should be guaranteed by `make_node`
if y.ndim == x.ndim:
if len(idx) == 0:
pass
elif y.ndim == x.ndim:
assert len(y) == len(idx)
firstIdxY, firstIdxX = enumerate(idx).next()
k = self.getInplElemwiseAdditionKernel(x[firstIdxX],
y[firstIdxY])
for (j, i) in enumerate(idx):
#x[i] += y[j]
pygpu.elemwise.ielemwise2(x[i], '+', y[j], broadcast=False)
k(x[i], y[j], broadcast=False)
else:
nb_dims_to_add = (x[idx[0]].ndim - y.ndim)
reshaped_y = y.reshape((1,)*nb_dims_to_add + y.shape)
k = self.getInplElemwiseAdditionKernel(x[0],
reshaped_y)
for i in idx:
#x[i] += y
nb_dims_to_add = (x[i].ndim - y.ndim)
reshaped_y = y.reshape((1,)*nb_dims_to_add + y.shape)
pygpu.elemwise.ielemwise2(x[i], '+', reshaped_y,
broadcast=True)
k(x[i], reshaped_y, broadcast=True)
out[0] = x
......
......@@ -431,7 +431,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
self.assertTrue(numpy.allclose(val, good), (val, good))
# Test reuse of output memory
if isinstance(self.adv_sub1, tensor.AdvancedSubtensor1):
if type(self.adv_sub1) == tensor.AdvancedSubtensor1:
op = self.adv_sub1()
# When idx is a TensorConstant.
if hasattr(idx, "data"):
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论