Merge pull request #1809 from f0k/release-gil

Release GIL while waiting for the GPU

Merge pull request #1809 from f0k/release-gil
d4f1d4eb · abergeron · 353b0688 · 5ec9a5ce · d4f1d4eb
--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
@@ -26,6 +26,21 @@
 //if you want this to work.
 #define PRECHECK_ERROR 0
+//If true, we release the GIL around blocking GPU calls, to allow other Python
+//threads to run in the meantime. For a single-threaded program, the overhead
+//is neglectible (about 20ms for 1 million GIL release/reclaim cycles). Can
+//still be overridden on compilation with -DRELEASE_GIL=0 in nvcc.flags.
+#ifndef RELEASE_GIL
+#define RELEASE_GIL 1
+#endif
+#if RELEASE_GIL
+#define CNDA_BEGIN_ALLOW_THREADS Py_BEGIN_ALLOW_THREADS
+#define CNDA_END_ALLOW_THREADS Py_END_ALLOW_THREADS
+#else
+#define CNDA_BEGIN_ALLOW_THREADS
+#define CNDA_END_ALLOW_THREADS
+#endif
 /////////////////////////
 // Alloc and Free
 /////////////////////////
@@ -200,7 +215,9 @@ int device_free(void *ptr)
    // We need sync as the Theano's GC could remove intermediate variable that
    // are still needed as the gpu kernel are running or in the queue.
+    CNDA_BEGIN_ALLOW_THREADS
    cudaThreadSynchronize();
+    CNDA_END_ALLOW_THREADS
    cudaError_t err =  cudaFree(ptr);
    if (cudaSuccess != err)
@@ -518,10 +535,14 @@ PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self, PyObject *args)
    assert (PyArray_ITEMSIZE(rval) == sizeof(real));
-    cublasGetVector(PyArray_SIZE(rval), sizeof(real),
+    npy_intp rval_size = PyArray_SIZE(rval);
+    void *rval_data = PyArray_DATA(rval);
+    CNDA_BEGIN_ALLOW_THREADS
+    cublasGetVector(rval_size, sizeof(real),
                    contiguous_self->devdata, 1,
-                    PyArray_DATA(rval), 1);
+                    rval_data, 1);
-    CNDA_THREAD_SYNC;
+    //CNDA_THREAD_SYNC;  // unneeded because cublasGetVector is blocking anyway
+    CNDA_END_ALLOW_THREADS
    if (CUBLAS_STATUS_SUCCESS != cublasGetError())
    {
@@ -1217,14 +1238,12 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
    //-10 could be any value different then 0.
    int cpu_err_var=-10;
-    // We are not 100% sure that cudaMemcpy wait that the async gpu kernel are
+    CNDA_BEGIN_ALLOW_THREADS
-    // finished before doing the transfer. So we add this explicit sync as it
+    // As we execute cudaMemcpy on the default stream, it waits for all
-    // is pretty fast. In a python loop, I ran 1 000 000 call in 1 second.
+    // kernels (on all streams) to be finished before starting to copy
-    // It is better to be safe and not significatively slower than unsafe.
-    cudaThreadSynchronize();
    err = cudaMemcpy(&cpu_err_var, err_var, sizeof(int),
                     cudaMemcpyDeviceToHost);
+    CNDA_END_ALLOW_THREADS
    if (cudaSuccess != err) {
        PyErr_Format(
            PyExc_RuntimeError,
@@ -2838,7 +2857,9 @@ GetDeviceMemInfo(PyObject* _unused, PyObject* dummy)
 PyObject *
 CudaNdarray_synchronize(PyObject* _unused, PyObject* dummy)
 {
+    CNDA_BEGIN_ALLOW_THREADS
    cudaThreadSynchronize();
+    CNDA_END_ALLOW_THREADS
    Py_INCREF(Py_None);
    return Py_None;
 }
@@ -3554,11 +3575,15 @@ CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj)
    if (!py_src) {
        return -1;
    }
-    cublasSetVector(PyArray_SIZE(py_src),
+    npy_intp py_src_size = PyArray_SIZE(py_src);
+    void *py_src_data = PyArray_DATA(py_src);
+    CNDA_BEGIN_ALLOW_THREADS
+    cublasSetVector(py_src_size,
            sizeof(real),
-            PyArray_DATA(py_src), 1,
+            py_src_data, 1,
            self->devdata, 1);
-    CNDA_THREAD_SYNC;
+    //CNDA_THREAD_SYNC;  // unneeded because cublasSetVector is blocking anyway
+    CNDA_END_ALLOW_THREADS
    if (CUBLAS_STATUS_SUCCESS != cublasGetError())
    {
        PyErr_SetString(PyExc_RuntimeError, "error copying data to device memory");
@@ -4952,7 +4977,7 @@ cnda_copy_structure_to_device(const CudaNdarray * self)
                    1,
                    self->dev_structure,
                    1);
-    CNDA_THREAD_SYNC;
+    //CNDA_THREAD_SYNC;  // unneeded because cublasSetVector is blocking anyway
    if (CUBLAS_STATUS_SUCCESS != cublasGetError())
    {
        PyErr_SetString(PyExc_RuntimeError, "error copying structure to device memory");