提交 45f4b7a9 authored 作者: f0k's avatar f0k

Added gpu.release_gil config flag.

上级 ce1eeab9
...@@ -200,7 +200,9 @@ int device_free(void *ptr) ...@@ -200,7 +200,9 @@ int device_free(void *ptr)
// We need sync as the Theano's GC could remove intermediate variable that // We need sync as the Theano's GC could remove intermediate variable that
// are still needed as the gpu kernel are running or in the queue. // are still needed as the gpu kernel are running or in the queue.
CNDA_BEGIN_ALLOW_THREADS
cudaThreadSynchronize(); cudaThreadSynchronize();
CNDA_END_ALLOW_THREADS
cudaError_t err = cudaFree(ptr); cudaError_t err = cudaFree(ptr);
if (cudaSuccess != err) if (cudaSuccess != err)
...@@ -518,10 +520,14 @@ PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self, PyObject *args) ...@@ -518,10 +520,14 @@ PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self, PyObject *args)
assert (PyArray_ITEMSIZE(rval) == sizeof(real)); assert (PyArray_ITEMSIZE(rval) == sizeof(real));
cublasGetVector(PyArray_SIZE(rval), sizeof(real), npy_intp rval_size = PyArray_SIZE(rval);
void *rval_data = PyArray_DATA(rval);
CNDA_BEGIN_ALLOW_THREADS
cublasGetVector(rval_size, sizeof(real),
contiguous_self->devdata, 1, contiguous_self->devdata, 1,
PyArray_DATA(rval), 1); rval_data, 1);
CNDA_THREAD_SYNC; //CNDA_THREAD_SYNC; // unneeded because cublasGetVector is blocking anyway
CNDA_END_ALLOW_THREADS
if (CUBLAS_STATUS_SUCCESS != cublasGetError()) if (CUBLAS_STATUS_SUCCESS != cublasGetError())
{ {
...@@ -1217,14 +1223,12 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){ ...@@ -1217,14 +1223,12 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
//-10 could be any value different then 0. //-10 could be any value different then 0.
int cpu_err_var=-10; int cpu_err_var=-10;
// We are not 100% sure that cudaMemcpy wait that the async gpu kernel are CNDA_BEGIN_ALLOW_THREADS
// finished before doing the transfer. So we add this explicit sync as it // As we execute cudaMemcpy on the default stream, it waits for all
// is pretty fast. In a python loop, I ran 1 000 000 call in 1 second. // kernels (on all streams) to be finished before starting to copy
// It is better to be safe and not significatively slower than unsafe.
cudaThreadSynchronize();
err = cudaMemcpy(&cpu_err_var, err_var, sizeof(int), err = cudaMemcpy(&cpu_err_var, err_var, sizeof(int),
cudaMemcpyDeviceToHost); cudaMemcpyDeviceToHost);
CNDA_END_ALLOW_THREADS
if (cudaSuccess != err) { if (cudaSuccess != err) {
PyErr_Format( PyErr_Format(
PyExc_RuntimeError, PyExc_RuntimeError,
...@@ -2838,7 +2842,9 @@ GetDeviceMemInfo(PyObject* _unused, PyObject* dummy) ...@@ -2838,7 +2842,9 @@ GetDeviceMemInfo(PyObject* _unused, PyObject* dummy)
PyObject * PyObject *
CudaNdarray_synchronize(PyObject* _unused, PyObject* dummy) CudaNdarray_synchronize(PyObject* _unused, PyObject* dummy)
{ {
CNDA_BEGIN_ALLOW_THREADS
cudaThreadSynchronize(); cudaThreadSynchronize();
CNDA_END_ALLOW_THREADS
Py_INCREF(Py_None); Py_INCREF(Py_None);
return Py_None; return Py_None;
} }
...@@ -3554,11 +3560,15 @@ CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj) ...@@ -3554,11 +3560,15 @@ CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj)
if (!py_src) { if (!py_src) {
return -1; return -1;
} }
cublasSetVector(PyArray_SIZE(py_src), npy_intp py_src_size = PyArray_SIZE(py_src);
void *py_src_data = PyArray_DATA(py_src);
CNDA_BEGIN_ALLOW_THREADS
cublasSetVector(py_src_size,
sizeof(real), sizeof(real),
PyArray_DATA(py_src), 1, py_src_data, 1,
self->devdata, 1); self->devdata, 1);
CNDA_THREAD_SYNC; //CNDA_THREAD_SYNC; // unneeded because cublasSetVector is blocking anyway
CNDA_END_ALLOW_THREADS
if (CUBLAS_STATUS_SUCCESS != cublasGetError()) if (CUBLAS_STATUS_SUCCESS != cublasGetError())
{ {
PyErr_SetString(PyExc_RuntimeError, "error copying data to device memory"); PyErr_SetString(PyExc_RuntimeError, "error copying data to device memory");
...@@ -4952,7 +4962,7 @@ cnda_copy_structure_to_device(const CudaNdarray * self) ...@@ -4952,7 +4962,7 @@ cnda_copy_structure_to_device(const CudaNdarray * self)
1, 1,
self->dev_structure, self->dev_structure,
1); 1);
CNDA_THREAD_SYNC; //CNDA_THREAD_SYNC; // unneeded because cublasSetVector is blocking anyway
if (CUBLAS_STATUS_SUCCESS != cublasGetError()) if (CUBLAS_STATUS_SUCCESS != cublasGetError())
{ {
PyErr_SetString(PyExc_RuntimeError, "error copying structure to device memory"); PyErr_SetString(PyExc_RuntimeError, "error copying structure to device memory");
......
...@@ -68,6 +68,14 @@ typedef float real; ...@@ -68,6 +68,14 @@ typedef float real;
#define CNDA_THREAD_SYNC cudaThreadSynchronize(); #define CNDA_THREAD_SYNC cudaThreadSynchronize();
#endif #endif
// Define shortcuts to implement the config.gpu.release_gil flag
#ifdef RELEASE_GIL
#define CNDA_BEGIN_ALLOW_THREADS Py_BEGIN_ALLOW_THREADS
#define CNDA_END_ALLOW_THREADS Py_END_ALLOW_THREADS
#else
#define CNDA_BEGIN_ALLOW_THREADS
#define CNDA_END_ALLOW_THREADS
#endif
#ifndef SHARED_SIZE #ifndef SHARED_SIZE
#define SHARED_SIZE (16*1024) #define SHARED_SIZE (16*1024)
......
...@@ -30,6 +30,12 @@ AddConfigVar('nvcc.compiler_bindir', ...@@ -30,6 +30,12 @@ AddConfigVar('nvcc.compiler_bindir',
StrParam(""), StrParam(""),
in_c_key=False) in_c_key=False)
AddConfigVar('gpu.release_gil',
"If True, theano will release the GIL when waiting for "
"GPU operations, allowing other Python threads to run",
BoolParam(False),
in_c_key=True)
user_provided_cuda_root = True user_provided_cuda_root = True
...@@ -153,6 +159,8 @@ class NVCC_compiler(object): ...@@ -153,6 +159,8 @@ class NVCC_compiler(object):
flags = [flag for flag in config.nvcc.flags.split(' ') if flag] flags = [flag for flag in config.nvcc.flags.split(' ') if flag]
if config.nvcc.fastmath: if config.nvcc.fastmath:
flags.append('-use_fast_math') flags.append('-use_fast_math')
if config.gpu.release_gil:
flags.append('-DRELEASE_GIL')
cuda_ndarray_cuh_hash = hash_from_file( cuda_ndarray_cuh_hash = hash_from_file(
os.path.join(os.path.split(__file__)[0], 'cuda_ndarray.cuh')) os.path.join(os.path.split(__file__)[0], 'cuda_ndarray.cuh'))
flags.append('-DCUDA_NDARRAY_CUH=' + cuda_ndarray_cuh_hash) flags.append('-DCUDA_NDARRAY_CUH=' + cuda_ndarray_cuh_hash)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论