提交 bd00e506 authored 作者: Frederic's avatar Frederic

Many fixes following the code review.

- add/update comments - rename assert_fast - fix crash - remove useless code. This does not change anything related to the int cast to float32.
上级 6c955ecf
......@@ -1891,7 +1891,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
"""
Implement AdvancedSubtensor1 on the gpu.
"""
assert_fast = None
perform_using_take = None
def make_node(self, x, ilist):
x_ = as_cuda_ndarray_variable(x)
......@@ -1910,30 +1910,29 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
#super(GpuAdvancedSubtensor1, self).perform(node, inp, out_)
x, idx = inp
out, = out_
new_method = True
#TODO: if more then 3 dims, reshape the inputs if it is contiguous.
x_orig = x
if x.ndim > 3 and x.is_c_contiguous():
x = x.reshape((x.shape[0], numpy.prod(x.shape[1:])))
out_shape = (len(idx),) + x_orig.shape[1:]
if x.ndim <= 3:
if self.assert_fast is not None:
assert self.assert_fast == True, (
# CudaNdarray.take only supports ndim <= 3
if self.perform_using_take is not None:
assert self.perform_using_take == True, (
"GpuAdvancedSubtensor1 used the fast version")
# Support x with dimensions 1,2,3 only.
o = x.take(cuda_ndarray.cuda_ndarray.CudaNdarray(idx.astype("float32")),
0, out_[0][0]) # idx, axis, return[, clipmode]
o = x.take(cuda_ndarray.cuda_ndarray.CudaNdarray(idx.astype("float32")), # idx
0, # axis
out_[0][0]) # return
if x is not x_orig:
o = o.reshape((len(idx),) + x_orig.shape[1:])
o = o.reshape(out_shape)
out[0] = o
else:
if self.assert_fast is not None:
assert self.assert_fast == False, (
"GpuAdvancedSubtensor1 didn't used the fast version")
if (out_[0][0] is None or out_[0][0].shape != (len(idx),) +
x.shape[1:]):
o = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros((len(idx),) +
x.shape[1:])
if self.perform_using_take is not None:
assert self.perform_using_take == False, (
"GpuAdvancedSubtensor1 didn't use the fast version")
if out_[0][0] is None or out_[0][0].shape != out_shape:
o = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(out_shape)
else:
o = out_[0][0]
for (j, i) in enumerate(idx):
......
......@@ -712,6 +712,10 @@ __global__ void k_take_3(const int d0, const int d1, const int d2,
if (idx<0)
idx += dB0; // To allow negative indexing.
if ((idx < 0) || (idx >= dB0))
// Any value other the 0 probably work. But to be more safe, I want
// to change all bits to prevent problem with concurrent write that
// could cross cache line. But this should not happen with the
// current code and driver.
*err = 0xFFFF;
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x){
for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y){
......@@ -725,12 +729,13 @@ __global__ void k_take_3(const int d0, const int d1, const int d2,
// Pointor to 1 int on the device
// Used in CudaNdarray_TakeFrom to tell that there is an out of bound error
// When it exist, it should always be 0
// When it is allocated, it should always be 0
// So if there is an error, we must reset it to 0 BEFORE we raise the error
// This prevent us from setting it to 0 before each use
static int* err_var = NULL;
//PyObject* PyArray_TakeFrom(PyArrayObject* self, PyObject* indices, int axis, PyArrayObject* ret, NPY_CLIPMODE clipmode)
// We try to be similat to the PyArray_TakeFrom function
//http://docs.scipy.org/doc/numpy/reference/c-api.array.html
//TODO: support other clip mode then raise(clip, wrap)
//TODO: what if the indices take more then 32 bits?
//self is the input that we copy data from.
......@@ -820,7 +825,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
//TODO: implement the default and other axis
PyObject * axis_iobj = PyNumber_Long(axis_obj);
if (!axis_iobj) {
PyErr_SetString(PyExc_NotImplementedError,"CudaNdarray_TakeFrom: axis must be convertisable to a long");
PyErr_SetString(PyExc_NotImplementedError,"CudaNdarray_TakeFrom: axis must be convertable to a long");
Py_DECREF(indices_obj);
return NULL;
}
......@@ -849,12 +854,6 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
}
}
if (!out) {
int total_elements = nb_indices;
for(int i=1;i<self->nd;i++)
total_elements*=CudaNdarray_HOST_DIMS(self)[i];
// total_elements now contains the size of the array, in reals
int total_size = total_elements * sizeof(real);
out = (CudaNdarray*)CudaNdarray_New();
if (!out){
Py_DECREF(indices_obj);
......@@ -939,7 +938,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
break;
case 2:
{
dim3 n_threads(CudaNdarray_HOST_DIMS(out)[1], 1, 1);
dim3 n_threads(std::min(CudaNdarray_HOST_DIMS(out)[1], 512), 1, 1);
if (verbose)
printf("kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
" n_threads.x=%i, n_threads.y=%i)\n",
......@@ -963,8 +962,9 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
break;
case 3:
{
dim3 n_threads(CudaNdarray_HOST_DIMS(out)[1],
CudaNdarray_HOST_DIMS(out)[2], 1);
int ty = std::min(CudaNdarray_HOST_DIMS(out)[2], 512);
int tx = std::min(CudaNdarray_HOST_DIMS(out)[1], 512 / ty);
dim3 n_threads(tx, ty, 1);
if (verbose)
printf("kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
" n_threads.x=%i, n_threads.y=%i)\n",
......@@ -1003,6 +1003,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
Py_DECREF(out);
return NULL;
}
//-10 could be any value different then 0.
int cpu_err_var=-10;
err = cudaMemcpy(&cpu_err_var, err_var, sizeof(int),
......
......@@ -850,11 +850,19 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
self).__init__(name)
def test_adv_sub1_fast(self):
""" We check that we correctly used the fast version"""
"""We check that the special cases of advanced indexing that
use CudaNdarrayTakeFrom are handled correctly
"""
rand = numpy.random.rand
# The variable fast is used to set the member perform_using_take of
# the Op. It is only useful for testing that we use the fast
# version when we should. Users should not use it.
for data, idx, fast in [(rand(70000), range(70000), True),
(rand(70000, 5), range(70000), True),
(rand(70000, 2, 3), range(70000), True),
(rand(1025, 1025), [5, 10], True),
(rand(3, 1025, 1026), [1, 2], True),
(rand(4, 5), [2, 3], True),
(rand(4, 2, 3), [0, 3], True),
(rand(4, 2, 3), [3, 3, 1, 1, 2,
......@@ -872,7 +880,7 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
# Test with c_contiguous input
t = self.adv_sub1()(n, idx)
t.owner.op.assert_fast = True # input c_contiguous, so we reshape
t.owner.op.perform_using_take = True # input c_contiguous, so we reshape
val = self.eval_output_and_check(t, list=True)
val = numpy.asarray(val)
......@@ -882,7 +890,7 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
# Test with input strided
t = self.adv_sub1()(n[::-1], idx)
t.owner.op.assert_fast = fast
t.owner.op.perform_using_take = fast
val = theano.function([], t, mode=self.mode)()
val = numpy.asarray(val)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论