提交 305b11ae authored 作者: Vivek Kulkarni's avatar Vivek Kulkarni

Merge pull request #2 from nouiz/viveksck-try_nouiz

Bug fix; reuse the old code path when the new one cannot be used
...@@ -2308,6 +2308,7 @@ class GpuSubtensor(GpuOp, tensor.Subtensor): ...@@ -2308,6 +2308,7 @@ class GpuSubtensor(GpuOp, tensor.Subtensor):
return () return ()
return (3, hv) return (3, hv)
class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp): class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
""" """
Implement AdvancedSubtensor1 on the gpu. Implement AdvancedSubtensor1 on the gpu.
...@@ -2391,14 +2392,6 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp): ...@@ -2391,14 +2392,6 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
x_ = as_cuda_ndarray_variable(x) x_ = as_cuda_ndarray_variable(x)
y_ = as_cuda_ndarray_variable(y) y_ = as_cuda_ndarray_variable(y)
ilist_ = tensor.as_tensor_variable(ilist) ilist_ = tensor.as_tensor_variable(ilist)
convert_map = { 8:tensor.basic._convert_to_int8,
16:tensor.basic._convert_to_int16,
32:tensor.basic._convert_to_int32,
64:tensor.basic._convert_to_int64
}
intwidth = theano.gof.compiledir.python_int_bitwidth()
ilist_ = convert_map[intwidth](ilist_)
assert x_.type.dtype == y_.type.dtype assert x_.type.dtype == y_.type.dtype
assert x_.type.ndim >= y_.type.ndim assert x_.type.ndim >= y_.type.ndim
...@@ -2451,15 +2444,12 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp): ...@@ -2451,15 +2444,12 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
out[0] = x out[0] = x
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) return (3,)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
active_device_no = theano.sandbox.cuda.active_device_number()
compute_capability = theano.sandbox.cuda.device_properties(active_device_no)['major']
if (self.set_instead_of_inc) or \ if (self.set_instead_of_inc) or \
(node.inputs[0].ndim != node.inputs[1].ndim) or \ (node.inputs[0].ndim != node.inputs[1].ndim):
(compute_capability < 2): raise NotImplementedError("This case does not have C code yet.")
raise NotImplementedError("This case does not have C code yet.")
x = inputs[0] x = inputs[0]
y = inputs[1] y = inputs[1]
...@@ -2469,6 +2459,19 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp): ...@@ -2469,6 +2459,19 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
inplace = int(self.inplace) inplace = int(self.inplace)
return """ return """
PyObject *x_obj, *y_obj, *row_x, *row_y;
PyObject *x_rowind_obj, *y_rowind_obj;
dtype_%(ind)s *p_index;
int num_indices, j;
int ret;
num_indices = PyArray_SIZE(%(ind)s);
if ((num_indices - 1) > LONG_MAX) {
PyErr_Format(PyExc_AssertionError,
"num_indices %%d exceeds LONG_MAX + 1", num_indices);
%(fail)s;
}
Py_XDECREF(%(out)s); Py_XDECREF(%(out)s);
if (!%(inplace)s) { if (!%(inplace)s) {
%(out)s = (CudaNdarray*)CudaNdarray_Copy(%(x)s); %(out)s = (CudaNdarray*)CudaNdarray_Copy(%(x)s);
...@@ -2477,12 +2480,136 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp): ...@@ -2477,12 +2480,136 @@ class GpuAdvancedIncSubtensor1(tensor.AdvancedIncSubtensor1, GpuOp):
Py_XINCREF(%(out)s); Py_XINCREF(%(out)s);
} }
CudaNdarray_vector_add_fast(%(x)s, %(y)s, %(ind)s); x_obj = (PyObject*)CudaNdarray_View(%(out)s);
y_obj = (PyObject*)CudaNdarray_View(%(y)s);
for (j = 0;j < num_indices; j++) {
p_index = (dtype_%(ind)s *)PyArray_GETPTR1(%(ind)s, j);
x_rowind_obj = PyInt_FromLong(*p_index);
if (PyInt_AsLong(x_rowind_obj) != (*p_index)) {
PyErr_Format(PyExc_AssertionError,
"Error in converting row index to integer from long");
// Dec Ref what ever we have increfed or allocated so far
// We deallocate objects exactly in the reverse order they were allocated.
Py_XDECREF(x_rowind_obj);
Py_XDECREF(y_obj);
Py_XDECREF(x_obj);
%(fail)s;
}
y_rowind_obj = PyInt_FromLong(j);
row_x = CudaNdarray_Subscript(x_obj, x_rowind_obj);
row_y = CudaNdarray_Subscript(y_obj, y_rowind_obj);
if ((row_x == NULL) || (row_y == NULL)) {
Py_XDECREF(row_y);
Py_XDECREF(row_x);
Py_XDECREF(y_rowind_obj);
Py_XDECREF(x_rowind_obj);
Py_XDECREF(y_obj);
Py_XDECREF(x_obj);
%(fail)s;
}
ret = CudaNdarray_inplace_elemwise(row_x, row_y, IADD);
if (ret != 0) {
Py_XDECREF(row_y);
Py_XDECREF(row_x);
Py_XDECREF(y_rowind_obj);
Py_XDECREF(x_rowind_obj);
Py_XDECREF(y_obj);
Py_XDECREF(x_obj);
%(fail)s;
}
Py_XDECREF(row_y);
Py_XDECREF(row_x);
Py_XDECREF(y_rowind_obj);
Py_XDECREF(x_rowind_obj);
}
Py_XDECREF(y_obj);
Py_XDECREF(x_obj);
if (!%(out)s) { if (!%(out)s) {
%(fail)s %(fail)s
} }
""" %locals() """ % locals()
class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
    """Implement AdvancedIncSubtensor1 on the GPU using a fast kernel
    (``CudaNdarray_vector_add_fast``) that is only available on devices
    of compute capability 2.0 and more recent.
    """
    def make_node(self, x, y, ilist):
        """Differs from ``GpuAdvancedIncSubtensor1.make_node`` in that it
        makes sure the indices are cast to the native Python int width
        (C ``long``), as the fast C kernel reads them at that width.

        ``x`` and ``y`` are moved to the GPU; ``ilist`` is a vector of
        integer row indices into ``x``.
        """
        x_ = as_cuda_ndarray_variable(x)
        y_ = as_cuda_ndarray_variable(y)
        ilist_ = tensor.as_tensor_variable(ilist)
        # Map the Python int bit width to the matching integer cast so
        # the indices have exactly the width the C code expects.
        convert_map = {8: tensor.basic._convert_to_int8,
                       16: tensor.basic._convert_to_int16,
                       32: tensor.basic._convert_to_int32,
                       64: tensor.basic._convert_to_int64
                       }
        intwidth = theano.gof.compiledir.python_int_bitwidth()
        ilist_ = convert_map[intwidth](ilist_)
        assert x_.type.dtype == y_.type.dtype
        assert x_.type.ndim >= y_.type.ndim
        if ilist_.type.dtype[:3] not in ('int', 'uin'):
            raise TypeError('index must be integers')
        if ilist_.type.broadcastable != (False,):
            raise TypeError('index must be vector')
        if x_.type.ndim == 0:
            raise TypeError('cannot index into a scalar')
        if x_.type.broadcastable[0]:
            # the caller should have made a copy of x len(ilist) times
            raise TypeError('cannot index into a broadcastable dimension')
        return Apply(self, [x_, y_, ilist_], [x_.type()])

    def c_code_cache_version(self):
        # Bump this tuple whenever the generated C code below changes.
        return (2,)

    def c_code(self, node, name, inputs, outputs, sub):
        # The fast kernel only handles the increment (not set) case on
        # same-rank 2-d inputs, and only on devices of compute
        # capability >= 2.0; everything else falls back to the Python
        # implementation inherited from GpuAdvancedIncSubtensor1.
        active_device_no = theano.sandbox.cuda.active_device_number()
        compute_capability = device_properties(active_device_no)['major']
        if ((self.set_instead_of_inc) or
            (node.inputs[0].ndim != node.inputs[1].ndim) or
            (node.inputs[0].ndim != 2) or
            (compute_capability < 2)):
            raise NotImplementedError("This case does not have C code yet.")

        x = inputs[0]
        y = inputs[1]
        ind = inputs[2]
        out = outputs[0]
        fail = sub['fail']
        inplace = int(self.inplace)
        # NOTE(review): the generated C either aliases x (inplace) or
        # copies it, then delegates the indexed row additions to
        # CudaNdarray_vector_add_fast.
        return """
        Py_XDECREF(%(out)s);
        if (!%(inplace)s) {
            %(out)s = (CudaNdarray*)CudaNdarray_Copy(%(x)s);
        } else {
            %(out)s = %(x)s;
            Py_XINCREF(%(out)s);
        }
        CudaNdarray_vector_add_fast(%(out)s, %(y)s, %(ind)s);
        if (!%(out)s) {
            %(fail)s
        }
        """ % locals()
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
return """ return """
......
...@@ -776,9 +776,16 @@ def local_gpu_advanced_incsubtensor1(node): ...@@ -776,9 +776,16 @@ def local_gpu_advanced_incsubtensor1(node):
'either set the `warn.gpu_set_subtensor1` config ' 'either set the `warn.gpu_set_subtensor1` config '
'option to False, or `warn.ignore_bug_before` to at ' 'option to False, or `warn.ignore_bug_before` to at '
'least \'0.6\'.', stacklevel=1) 'least \'0.6\'.', stacklevel=1)
active_device_no = theano.sandbox.cuda.active_device_number()
gpu_op = GpuAdvancedIncSubtensor1( compute_capability = device_properties(active_device_no)['major']
set_instead_of_inc=set_instead_of_inc) if (compute_capability < 2 or
x.ndim != 2 or
y.ndim != 2):
gpu_op = GpuAdvancedIncSubtensor1(
set_instead_of_inc=set_instead_of_inc)
else:
gpu_op = GpuAdvancedIncSubtensor1_dev20(
set_instead_of_inc=set_instead_of_inc)
return [gpu_op(gpu_from_host(x), gpu_from_host(y), *coords)] return [gpu_op(gpu_from_host(x), gpu_from_host(y), *coords)]
# Should not execute for GpuAdvancedIncSubtensor1 # Should not execute for GpuAdvancedIncSubtensor1
...@@ -809,8 +816,16 @@ def local_gpu_advanced_incsubtensor1(node): ...@@ -809,8 +816,16 @@ def local_gpu_advanced_incsubtensor1(node):
'option to False, or `warn.ignore_bug_before` to at ' 'option to False, or `warn.ignore_bug_before` to at '
'least \'0.6\'.', stacklevel=1) 'least \'0.6\'.', stacklevel=1)
gpu_op = GpuAdvancedIncSubtensor1( active_device_no = theano.sandbox.cuda.active_device_number()
set_instead_of_inc=set_instead_of_inc) compute_capability = device_properties(active_device_no)['major']
if (compute_capability < 2 or
x.ndim != 2 or
y.ndim != 2):
gpu_op = GpuAdvancedIncSubtensor1(
set_instead_of_inc=set_instead_of_inc)
else:
gpu_op = GpuAdvancedIncSubtensor1_dev20(
set_instead_of_inc=set_instead_of_inc)
return [host_from_gpu(gpu_op(gpu_x, gpu_y, *coords))] return [host_from_gpu(gpu_op(gpu_x, gpu_y, *coords))]
return False return False
......
...@@ -999,20 +999,23 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor): ...@@ -999,20 +999,23 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
def test_advinc_subtensor1(): def test_advinc_subtensor1():
""" Test the second case in the opt local_gpu_advanced_incsubtensor1 """ """ Test the second case in the opt local_gpu_advanced_incsubtensor1 """
shared = cuda.shared_constructor for shp in [(3, 3), (3, 3, 3)]:
#shared = tensor.shared shared = cuda.shared_constructor
xval = numpy.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]], xval = numpy.arange(numpy.prod(shp), dtype='float32').reshape(shp) + 1
dtype='float32') yval = numpy.empty((2,) + shp[1:], dtype='float32')
yval = numpy.asarray([[10, 10, 10], [10, 10, 10]], yval[:] = 10
dtype='float32') x = shared(xval, name='x')
x = shared(xval, name='x') y = T.tensor(dtype='float32',
y = T.fmatrices('y') broadcastable=(False,) * len(shp),
expr = T.advanced_inc_subtensor1(x, y, [0, 2]) name='y')
f = theano.function([y], expr, mode=mode_with_gpu) expr = T.advanced_inc_subtensor1(x, y, [0, 2])
assert sum([isinstance(node.op, cuda.GpuAdvancedIncSubtensor1) f = theano.function([y], expr, mode=mode_with_gpu)
for node in f.maker.fgraph.toposort()]) == 1 assert sum([isinstance(node.op, cuda.GpuAdvancedIncSubtensor1)
assert numpy.allclose(f(yval), [[11., 12., 13.], [4., 5., 6.], for node in f.maker.fgraph.toposort()]) == 1
[17., 18., 19.]]) rval = f(yval)
rep = xval.copy()
rep[[0, 2]] += yval
assert numpy.allclose(rval, rep)
def test_inc_subtensor(): def test_inc_subtensor():
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论