提交 d08a40fa authored 作者: Frederic's avatar Frederic

refactore the new code

上级 574e7965
...@@ -2974,7 +2974,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -2974,7 +2974,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
return Apply(self, [x_, y_, ilist_], [x_.type()]) return Apply(self, [x_, y_, ilist_], [x_.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (5,) return (6,)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
active_device_no = theano.sandbox.cuda.active_device_number() active_device_no = theano.sandbox.cuda.active_device_number()
...@@ -3048,23 +3048,8 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -3048,23 +3048,8 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
int CudaNdarray_vector_add_fast(CudaNdarray* py_self, int CudaNdarray_vector_add_fast(CudaNdarray* py_self,
CudaNdarray* py_other, PyArrayObject *indices_arr) CudaNdarray* py_other, PyArrayObject *indices_arr)
{ {
if (err_var == NULL) { if(init_err_var()!= 0) return -1;
err_var = (int*)device_malloc(sizeof(int));
if (!err_var) { // PyErr set by device_malloc
return -1;
}
cudaError_t err = cudaMemset((void*)err_var, 0,
sizeof(int));
if (cudaSuccess != err) {
// Clear the error flag, cudaMemset doesn't do it.
cudaGetLastError();
PyErr_Format(
PyExc_RuntimeError,
"Error setting device error code to 0. %%s",
cudaGetErrorString(err));
return -1;
}
}
const int *shapeX = CudaNdarray_HOST_DIMS(py_self); const int *shapeX = CudaNdarray_HOST_DIMS(py_self);
const int *shapeY = CudaNdarray_HOST_DIMS(py_other); const int *shapeY = CudaNdarray_HOST_DIMS(py_other);
const int *strX = CudaNdarray_HOST_STRIDES(py_self); const int *strX = CudaNdarray_HOST_STRIDES(py_self);
...@@ -3112,46 +3097,13 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -3112,46 +3097,13 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
PyArray_SIZE(indices_arr), PyArray_SIZE(indices_arr),
err_var err_var
); );
int index_err = check_err_var();
//-10 could be any value different then 0.
int cpu_err_var=-10;
CNDA_BEGIN_ALLOW_THREADS
// As we execute cudaMemcpy on the default stream, it waits
// for all kernels (on all streams) to be finished before
// starting to copy
err = cudaMemcpy(&cpu_err_var, err_var, sizeof(int),
cudaMemcpyDeviceToHost);
CNDA_END_ALLOW_THREADS
if (cudaSuccess != err) {
PyErr_Format(
PyExc_RuntimeError,
"Cuda error: %%s: %%s when trying to get the error"
" value.\\n",
"GpuAdvancedIncSubtensor1_dev20",
cudaGetErrorString(err));
return NULL;
}
if (cpu_err_var != 0) {
PyErr_Format(
PyExc_IndexError,
"GpuAdvancedIncSubtensor1_dev20: One of the index"
" value is out of bound. Error code: %%i.\\n",
cpu_err_var);
// Must reset it to 0 to don't reset it before each use.
err = cudaMemset((void*)err_var, 0, sizeof(int));
if (cudaSuccess != err) {
PyErr_Format(PyExc_MemoryError,
"Error setting device error code to 0 after having"
" an index error. %%s", cudaGetErrorString(err));
return -1;
}
return -1;
}
device_free(d_indices_arr); device_free(d_indices_arr);
Py_XDECREF(cpu_indices_arr); Py_XDECREF(cpu_indices_arr);
if(index_err != 0) return -1;
err = cudaGetLastError(); err = cudaGetLastError();
if(err != cudaSuccess){ if(err != cudaSuccess){
PyErr_Format( PyErr_Format(
......
...@@ -1141,30 +1141,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){ ...@@ -1141,30 +1141,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
k3 = k_take_3<CPY>; k3 = k_take_3<CPY>;
// Create the memory place that will store the error information. // Create the memory place that will store the error information.
if (err_var == NULL) { if(init_err_var() != 0) return NULL;
err_var = (int*)device_malloc(sizeof(int));
if (!err_var) { // PyErr set by device_malloc
Py_DECREF(indices);
Py_DECREF(out);
free(dims);
return NULL;
}
cudaError_t err = cudaMemset((void*)err_var, 0, sizeof(int));
if (cudaSuccess != err) {
// Clear the error flag, cudaMemset doesn't do it.
// Currently this returns the same thing as err, but if in future
// it returns something else I still don't see why we should ignore
// it. All we want to do here is reset the flag.
cudaGetLastError();
PyErr_Format(PyExc_RuntimeError,
"Error setting device error code to 0. %s",
cudaGetErrorString(err));
Py_DECREF(indices);
Py_DECREF(out);
free(dims);
return NULL;
}
}
dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(out)[0],65535),1,1); dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(out)[0],65535),1,1);
if(CudaNdarray_HOST_DIMS(out)[0] == 0){ if(CudaNdarray_HOST_DIMS(out)[0] == 0){
...@@ -1276,47 +1253,14 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){ ...@@ -1276,47 +1253,14 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
Py_DECREF(out); Py_DECREF(out);
return NULL; return NULL;
} }
//-10 could be any value different then 0.
int cpu_err_var=-10;
CNDA_BEGIN_ALLOW_THREADS
// As we execute cudaMemcpy on the default stream, it waits for all
// kernels (on all streams) to be finished before starting to copy
err = cudaMemcpy(&cpu_err_var, err_var, sizeof(int),
cudaMemcpyDeviceToHost);
CNDA_END_ALLOW_THREADS
if (cudaSuccess != err) {
PyErr_Format(
PyExc_RuntimeError,
"Cuda error: %s: %s when trying to get the error value.\n",
"CudaNdarray_TakeFrom",
cudaGetErrorString(err));
Py_DECREF(indices);
Py_DECREF(out);
return NULL;
}
if (cpu_err_var != 0) { int index_err = check_err_var();
PyErr_Format(
PyExc_IndexError,
"CudaNdarray_TakeFrom: One of the index value is out of bound. Error code: %i.\n",
cpu_err_var);
// Must reset it to 0 to don't reset it before each use.
err = cudaMemset((void*)err_var, 0, sizeof(int));
if (cudaSuccess != err) {
PyErr_Format(PyExc_MemoryError, "Error setting device error code to 0 after having an index error. %s", cudaGetErrorString(err));
Py_DECREF(indices);
Py_DECREF(out);
return NULL;
}
Py_DECREF(indices); Py_DECREF(indices);
if (index_err != 0) {
Py_DECREF(out); Py_DECREF(out);
return NULL; return NULL;
} }
Py_DECREF(indices);
if (verbose) printf("TAKE SUCCEDED\n"); if (verbose) printf("TAKE SUCCEDED\n");
return (PyObject *)out; return (PyObject *)out;
} }
......
...@@ -122,7 +122,69 @@ DllExport void *get_work_mem(size_t sz); ...@@ -122,7 +122,69 @@ DllExport void *get_work_mem(size_t sz);
// When it is allocated, it should always be 0
// So if there is an error, we must reset it to 0 BEFORE we raise the error
// This saves us from having to reset it to 0 before each use
// Device-side flag used by index kernels to report out-of-bound
// accesses.  Invariant: once allocated it always holds 0; any code
// that raises after observing a non-zero value resets it to 0 before
// raising, so callers never need to clear it before use.
// NOTE(review): the diff changed this to
//   extern DllExport int* err_var = NULL;
// but `extern` combined with an initializer is a *definition* — in a
// header included by several translation units that produces
// duplicate-symbol link errors (and compilers warn "'err_var'
// initialized and declared 'extern'").  Internal linkage is the safe
// form for a header-resident variable.
static int* err_var = NULL;
// Lazily allocate the device-side error flag and zero it.
// Idempotent: a second call with the flag already allocated is a no-op
// (the invariant guarantees it already holds 0).
// Returns 0 on success, -1 with a Python exception set on failure.
DllExport inline int init_err_var(){
    if (err_var != NULL)
        return 0;  // already allocated, guaranteed to hold 0

    err_var = (int*)device_malloc(sizeof(int));
    if (err_var == NULL)
        return -1;  // PyErr already set by device_malloc

    cudaError_t err = cudaMemset((void*)err_var, 0, sizeof(int));
    if (err != cudaSuccess) {
        // cudaMemset does not clear the sticky error flag; do it here
        // so the failure does not poison later CUDA calls.
        cudaGetLastError();
        PyErr_Format(
            PyExc_RuntimeError,
            "Error setting device error code to 0. %s",
            cudaGetErrorString(err));
        return -1;
    }
    return 0;
}
// Copy the device-side error flag back to the host and translate a
// non-zero value into a Python IndexError.  The flag is reset to 0 on
// the device before raising, preserving the "always 0 when idle"
// invariant so later calls can reuse it without clearing it first.
// Returns 0 if no error was flagged, -1 with a Python exception set
// otherwise.  Assumes init_err_var() succeeded earlier in this run.
DllExport inline int check_err_var(){
    //-10 could be any value different than 0.
    int cpu_err_var = -10;
    cudaError_t err;
    CNDA_BEGIN_ALLOW_THREADS
    // As we execute cudaMemcpy on the default stream, it waits
    // for all kernels (on all streams) to be finished before
    // starting to copy
    err = cudaMemcpy(&cpu_err_var, err_var, sizeof(int),
                     cudaMemcpyDeviceToHost);
    CNDA_END_ALLOW_THREADS
    if (cudaSuccess != err) {
        // BUG FIX: the string ended in "\\n" (a literal backslash-n),
        // an escaping leftover from the Python-embedded c_code this
        // helper was extracted from; in a plain header it must be "\n"
        // (the unescaped "%s" on the next line confirms this is not a
        // Python format string).
        PyErr_Format(
            PyExc_RuntimeError,
            "Cuda error: %s when trying to get the error"
            " value.\n",
            cudaGetErrorString(err));
        return -1;
    }
    if (cpu_err_var != 0) {
        // Same "\\n" -> "\n" fix as above.
        PyErr_Format(
            PyExc_IndexError,
            "One of the index value is out of bound. Error code: %i.\n",
            cpu_err_var);
        // Must reset it to 0 now, so callers don't have to reset it
        // before each use.
        err = cudaMemset((void*)err_var, 0, sizeof(int));
        if (cudaSuccess != err) {
            PyErr_Format(PyExc_MemoryError,
                "Error setting device error code to 0 after having"
                " an index error. %s", cudaGetErrorString(err));
            return -1;
        }
        return -1;
    }
    return 0;
}
template <typename T> template <typename T>
static T ceil_intdiv(T a, T b) static T ceil_intdiv(T a, T b)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论