提交 d4f1d4eb authored 作者: abergeron's avatar abergeron

Merge pull request #1809 from f0k/release-gil

Release GIL while waiting for the GPU
...@@ -26,6 +26,21 @@ ...@@ -26,6 +26,21 @@
//if you want this to work. //if you want this to work.
#define PRECHECK_ERROR 0 #define PRECHECK_ERROR 0
//If true, we release the GIL around blocking GPU calls, to allow other Python
//threads to run in the meantime. For a single-threaded program, the overhead
//is neglectible (about 20ms for 1 million GIL release/reclaim cycles). Can
//still be overridden on compilation with -DRELEASE_GIL=0 in nvcc.flags.
#ifndef RELEASE_GIL
#define RELEASE_GIL 1
#endif
#if RELEASE_GIL
#define CNDA_BEGIN_ALLOW_THREADS Py_BEGIN_ALLOW_THREADS
#define CNDA_END_ALLOW_THREADS Py_END_ALLOW_THREADS
#else
#define CNDA_BEGIN_ALLOW_THREADS
#define CNDA_END_ALLOW_THREADS
#endif
///////////////////////// /////////////////////////
// Alloc and Free // Alloc and Free
///////////////////////// /////////////////////////
...@@ -200,7 +215,9 @@ int device_free(void *ptr) ...@@ -200,7 +215,9 @@ int device_free(void *ptr)
// We need sync as the Theano's GC could remove intermediate variable that // We need sync as the Theano's GC could remove intermediate variable that
// are still needed as the gpu kernel are running or in the queue. // are still needed as the gpu kernel are running or in the queue.
CNDA_BEGIN_ALLOW_THREADS
cudaThreadSynchronize(); cudaThreadSynchronize();
CNDA_END_ALLOW_THREADS
cudaError_t err = cudaFree(ptr); cudaError_t err = cudaFree(ptr);
if (cudaSuccess != err) if (cudaSuccess != err)
...@@ -518,10 +535,14 @@ PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self, PyObject *args) ...@@ -518,10 +535,14 @@ PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self, PyObject *args)
assert (PyArray_ITEMSIZE(rval) == sizeof(real)); assert (PyArray_ITEMSIZE(rval) == sizeof(real));
cublasGetVector(PyArray_SIZE(rval), sizeof(real), npy_intp rval_size = PyArray_SIZE(rval);
void *rval_data = PyArray_DATA(rval);
CNDA_BEGIN_ALLOW_THREADS
cublasGetVector(rval_size, sizeof(real),
contiguous_self->devdata, 1, contiguous_self->devdata, 1,
PyArray_DATA(rval), 1); rval_data, 1);
CNDA_THREAD_SYNC; //CNDA_THREAD_SYNC; // unneeded because cublasGetVector is blocking anyway
CNDA_END_ALLOW_THREADS
if (CUBLAS_STATUS_SUCCESS != cublasGetError()) if (CUBLAS_STATUS_SUCCESS != cublasGetError())
{ {
...@@ -1217,14 +1238,12 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){ ...@@ -1217,14 +1238,12 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
//-10 could be any value different then 0. //-10 could be any value different then 0.
int cpu_err_var=-10; int cpu_err_var=-10;
// We are not 100% sure that cudaMemcpy wait that the async gpu kernel are CNDA_BEGIN_ALLOW_THREADS
// finished before doing the transfer. So we add this explicit sync as it // As we execute cudaMemcpy on the default stream, it waits for all
// is pretty fast. In a python loop, I ran 1 000 000 call in 1 second. // kernels (on all streams) to be finished before starting to copy
// It is better to be safe and not significatively slower than unsafe.
cudaThreadSynchronize();
err = cudaMemcpy(&cpu_err_var, err_var, sizeof(int), err = cudaMemcpy(&cpu_err_var, err_var, sizeof(int),
cudaMemcpyDeviceToHost); cudaMemcpyDeviceToHost);
CNDA_END_ALLOW_THREADS
if (cudaSuccess != err) { if (cudaSuccess != err) {
PyErr_Format( PyErr_Format(
PyExc_RuntimeError, PyExc_RuntimeError,
...@@ -2838,7 +2857,9 @@ GetDeviceMemInfo(PyObject* _unused, PyObject* dummy) ...@@ -2838,7 +2857,9 @@ GetDeviceMemInfo(PyObject* _unused, PyObject* dummy)
PyObject * PyObject *
CudaNdarray_synchronize(PyObject* _unused, PyObject* dummy) CudaNdarray_synchronize(PyObject* _unused, PyObject* dummy)
{ {
CNDA_BEGIN_ALLOW_THREADS
cudaThreadSynchronize(); cudaThreadSynchronize();
CNDA_END_ALLOW_THREADS
Py_INCREF(Py_None); Py_INCREF(Py_None);
return Py_None; return Py_None;
} }
...@@ -3554,11 +3575,15 @@ CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj) ...@@ -3554,11 +3575,15 @@ CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj)
if (!py_src) { if (!py_src) {
return -1; return -1;
} }
cublasSetVector(PyArray_SIZE(py_src), npy_intp py_src_size = PyArray_SIZE(py_src);
void *py_src_data = PyArray_DATA(py_src);
CNDA_BEGIN_ALLOW_THREADS
cublasSetVector(py_src_size,
sizeof(real), sizeof(real),
PyArray_DATA(py_src), 1, py_src_data, 1,
self->devdata, 1); self->devdata, 1);
CNDA_THREAD_SYNC; //CNDA_THREAD_SYNC; // unneeded because cublasSetVector is blocking anyway
CNDA_END_ALLOW_THREADS
if (CUBLAS_STATUS_SUCCESS != cublasGetError()) if (CUBLAS_STATUS_SUCCESS != cublasGetError())
{ {
PyErr_SetString(PyExc_RuntimeError, "error copying data to device memory"); PyErr_SetString(PyExc_RuntimeError, "error copying data to device memory");
...@@ -4952,7 +4977,7 @@ cnda_copy_structure_to_device(const CudaNdarray * self) ...@@ -4952,7 +4977,7 @@ cnda_copy_structure_to_device(const CudaNdarray * self)
1, 1,
self->dev_structure, self->dev_structure,
1); 1);
CNDA_THREAD_SYNC; //CNDA_THREAD_SYNC; // unneeded because cublasSetVector is blocking anyway
if (CUBLAS_STATUS_SUCCESS != cublasGetError()) if (CUBLAS_STATUS_SUCCESS != cublasGetError())
{ {
PyErr_SetString(PyExc_RuntimeError, "error copying structure to device memory"); PyErr_SetString(PyExc_RuntimeError, "error copying structure to device memory");
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论