Make CudaNdarray_TakeFrom not crash on older GPUs.

fb973a61 · Frederic · e6a2645c · fb973a61 · fb973a61
--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -11,7 +11,7 @@ from theano import tensor, scalar, config
 from theano.gof.python25 import all, any
-from theano.sandbox.cuda import GpuOp
+from theano.sandbox.cuda import GpuOp, device_properties
 from theano.sandbox.cuda.type import CudaNdarrayType
 from theano.sandbox.cuda import filter as type_support_filter
@@ -1938,6 +1938,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
    #If True or False, we assert that we use the take version or not
    #If None, we choose the best one applicable
    perform_using_take = None
+    max_threads = 0
    def make_node(self, x, ilist):
        x_ = as_cuda_ndarray_variable(x)
@@ -1977,9 +1978,18 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
            idx = idx.view("float32")
            idx = cuda_ndarray.cuda_ndarray.CudaNdarray(idx)
+            if self.max_threads == 0:
+                num = theano.sandbox.cuda.use.device_number
+                if device_properties(num)['regsPerBlock'] < (8192 * 2):
+                    self.max_threads = 256
+                else:
+                    self.max_threads = 512
            o = x.take(idx,
                       0,  # axis
-                       out_[0][0])  # return
+                       out_[0][0],  # return
+                       "raise",
+                       self.max_threads)
            if x is not x_orig:
                o = o.reshape(out_shape)
            out[0] = o

--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
@@ -758,8 +758,10 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
    PyObject * axis_obj = Py_None;
    PyObject * out_obj = Py_None;
    PyObject * clipmode_obj = NULL;
-    if (! PyArg_ParseTuple(args, "O|OOO", &indices_obj, &axis_obj,
+    int max_threads = 1; // max threads per blocks
-                           &out_obj, &clipmode_obj))
+    if (! PyArg_ParseTuple(args, "O|OOOi", &indices_obj, &axis_obj,
+                           &out_obj, &clipmode_obj, &max_threads))
        return NULL;
    //Check argument indices
@@ -937,13 +939,16 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
    }
    dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(out)[0],65535),1,1);
    switch (self->nd) {
        case 1:
            {
                dim3 n_threads(1, 1, 1);
                if (verbose)
-                    printf("kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
+                    printf("cudaGetLastError=%d, nd=%d"
+                           " kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
                           " n_threads.x=%i, n_threads.y=%i)\n",
+                           cudaGetLastError(), self->nd,
                           n_blocks.x, n_blocks.y, n_threads.x, n_threads.y);
                k3<<<n_blocks, n_threads>>>(
                        dims[0],
@@ -964,11 +969,15 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
            break;
        case 2:
            {
-                dim3 n_threads(std::min(CudaNdarray_HOST_DIMS(out)[1], 512), 1, 1);
+                dim3 n_threads(std::min(CudaNdarray_HOST_DIMS(out)[1], max_threads), 1, 1);
                if (verbose)
-                    printf("kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
+                    printf("cudaGetLastError=%d, nd=%d"
+                           " kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
                           " n_threads.x=%i, n_threads.y=%i)\n",
+                           cudaGetLastError(), self->nd,
                           n_blocks.x, n_blocks.y, n_threads.x, n_threads.y);
                k3<<<n_blocks, n_threads>>>(
                        dims[0], //dimensions
                        dims[1],
@@ -988,12 +997,14 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
            break;
        case 3:
            {
-                int ty = std::min(CudaNdarray_HOST_DIMS(out)[2], 512);
+                int ty = std::min(CudaNdarray_HOST_DIMS(out)[2], max_threads);
-                int tx = std::min(CudaNdarray_HOST_DIMS(out)[1], 512 / ty);
+                int tx = std::min(CudaNdarray_HOST_DIMS(out)[1], max_threads / ty);
                dim3 n_threads(tx, ty, 1);
                if (verbose)
-                    printf("kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
+                    printf("cudaGetLastError=%d, nd=%d"
+                           " kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
                           " n_threads.x=%i, n_threads.y=%i)\n",
+                           cudaGetLastError(), self->nd,
                           n_blocks.x, n_blocks.y, n_threads.x, n_threads.y);
                k3<<<n_blocks, n_threads>>>(
                        dims[0], //dimensions