提交 eed9d97d authored 作者: vesis84's avatar vesis84

bugfix for automatic GPU selection when 'exclusive mode' is used

- Fixes an issue introduced with CUDA Toolkit 7.0: automatic selection of a free GPU by the OS / library no longer works. The library always selects device '0', which leads to a crash when 'exclusive mode' is used.
- This fix is inspired by Dan Povey's fix for Kaldi: https://github.com/kaldi-asr/kaldi/commit/6548565445167e00125848f91d7da5f3f949b2a2
- It loops over the GPUs until a free GPU is taken.
上级 975e0d2b
...@@ -2948,21 +2948,31 @@ PyObject * ...@@ -2948,21 +2948,31 @@ PyObject *
CudaNdarray_select_a_gpu(PyObject* _unused, PyObject* dummy) CudaNdarray_select_a_gpu(PyObject* _unused, PyObject* dummy)
{ {
void * rval = NULL; void * rval = NULL;
cudaError_t err;
int num_gpus = 0;
cudaError_t err = cudaMalloc(&rval, 4); err = cudaGetDeviceCount(&num_gpus);
if (cudaSuccess != err){ if (cudaSuccess != err){
printf("ERR!\\n"); printf("ERR!\\n");
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"Not able to do basic stuff on the GPU (alloc of 4 bytes) (%s).", "Not able to get number of GPUs (%s).",
cudaGetErrorString(err)); cudaGetErrorString(err));
return NULL; return NULL;
} }
err = cudaFree(rval);
for (int device = 0; device < num_gpus; device++) {
cudaSetDevice(device);
err = cudaDeviceSynchronize(); // << CUDA context gets created here.
cudaGetLastError(); // reset the error state
if (cudaSuccess == err)
break;
}
if (cudaSuccess != err){ if (cudaSuccess != err){
printf("ERR!\\n"); printf("ERR!\\n");
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"Not able to do basic stuff on the GPU (cudaFree failed) (%s).", "Not able to select available GPU from %d cards (%s).",
cudaGetErrorString(err)); num_gpus, cudaGetErrorString(err));
return NULL; return NULL;
} }
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论