[BUG] fix 2 bugs in the new faster GpuAdvancedIncSubtensor1

It was always running inplace, and it was always treating the input as 2d.

[BUG] fix 2 bugs in the new faster GpuAdvancedIncSubtensor1
a240803c · Frederic · 32e5fa85 · a240803c · a240803c
--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -2451,13 +2451,14 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
        out[0] = x

    def c_code_cache_version(self):
-        return (1,)
+        return (2,)

    def c_code(self, node, name, inputs, outputs, sub):
        active_device_no = theano.sandbox.cuda.active_device_number()
-        compute_capability =  theano.sandbox.cuda.device_properties(active_device_no)['major']
+        compute_capability =  device_properties(active_device_no)['major']
        if (self.set_instead_of_inc) or \
           (node.inputs[0].ndim != node.inputs[1].ndim) or \
+           (node.inputs[0].ndim != 2) or \
           (compute_capability < 2):
             raise NotImplementedError("This case does not have C code yet.")

@@ -2477,12 +2478,12 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
            Py_XINCREF(%(out)s);
        }

-        CudaNdarray_vector_add_fast(%(x)s, %(y)s, %(ind)s);
+        CudaNdarray_vector_add_fast(%(out)s, %(y)s, %(ind)s);

        if (!%(out)s) {
            %(fail)s
        }
-        """ %locals()
+        """ % locals()

    def c_support_code_apply(self, node, nodename):
        return """

--- a/theano/sandbox/cuda/tests/test_basic_ops.py
+++ b/theano/sandbox/cuda/tests/test_basic_ops.py
@@ -999,20 +999,23 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):

 def test_advinc_subtensor1():
    """ Test the second case in the opt local_gpu_advanced_incsubtensor1 """
-    shared = cuda.shared_constructor
-    #shared = tensor.shared
-    xval = numpy.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
-                      dtype='float32')
-    yval = numpy.asarray([[10, 10, 10], [10, 10, 10]],
-                      dtype='float32')
-    x = shared(xval, name='x')
-    y = T.fmatrices('y')
-    expr = T.advanced_inc_subtensor1(x, y, [0, 2])
-    f = theano.function([y], expr, mode=mode_with_gpu)
-    assert sum([isinstance(node.op, cuda.GpuAdvancedIncSubtensor1)
-                for node in f.maker.fgraph.toposort()]) == 1
-    assert numpy.allclose(f(yval), [[11., 12., 13.], [4., 5., 6.],
-                                    [17., 18., 19.]])
+    for shp in [(3, 3), (3, 3, 3)]:
+        shared = cuda.shared_constructor
+        xval = numpy.arange(numpy.prod(shp), dtype='float32').reshape(shp) + 1
+        yval = numpy.empty((2,) + shp[1:], dtype='float32')
+        yval[:] = 10
+        x = shared(xval, name='x')
+        y = T.tensor(dtype='float32',
+                     broadcastable=(False,) * len(shp),
+                     name='y')
+        expr = T.advanced_inc_subtensor1(x, y, [0, 2])
+        f = theano.function([y], expr, mode=mode_with_gpu)
+        assert sum([isinstance(node.op, cuda.GpuAdvancedIncSubtensor1)
+                    for node in f.maker.fgraph.toposort()]) == 1
+        rval = f(yval)
+        rep = xval.copy()
+        rep[[0, 2]] += yval
+        assert numpy.allclose(rval, rep)


 def test_inc_subtensor():