提交 3109a772 authored 作者: Pascal Lamblin's avatar Pascal Lamblin

Add proper wait/record pairs for cudnn wrapper.

Take 2.
上级 ea96b166
...@@ -211,6 +211,10 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -211,6 +211,10 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
} }
} }
cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionForward( err = cudnnConvolutionForward(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(_handle),
alpha_p, alpha_p,
...@@ -223,6 +227,10 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -223,6 +227,10 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
if (worksize != 0) if (worksize != 0)
c->ops->buffer_release(workspace); c->ops->buffer_release(workspace);
cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
} }
cuda_exit(c->ctx); cuda_exit(c->ctx);
......
...@@ -177,6 +177,10 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -177,6 +177,10 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
} }
} }
cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionBackwardData_v3( err = cudnnConvolutionBackwardData_v3(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(_handle),
alpha_p, alpha_p,
...@@ -189,6 +193,10 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -189,6 +193,10 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
if (worksize != 0) if (worksize != 0)
c->ops->buffer_release(workspace); c->ops->buffer_release(workspace);
cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
cuda_exit(c->ctx); cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
......
...@@ -178,6 +178,10 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -178,6 +178,10 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
} }
} }
cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnConvolutionBackwardFilter_v3( err = cudnnConvolutionBackwardFilter_v3(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(_handle),
alpha_p, alpha_p,
...@@ -190,6 +194,10 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -190,6 +194,10 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
if (worksize != 0) if (worksize != 0)
c->ops->buffer_release(workspace); c->ops->buffer_release(workspace);
cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
cuda_exit(c->ctx); cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
......
...@@ -93,12 +93,20 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img, ...@@ -93,12 +93,20 @@ int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
} }
cuda_enter(c->ctx); cuda_enter(c->ctx);
cuda_wait(img->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnPoolingForward( err = cudnnPoolingForward(
APPLY_SPECIFIC(_handle), desc, APPLY_SPECIFIC(_handle), desc,
alpha, alpha,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img), APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img),
beta, beta,
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*out)); APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*out));
cuda_record(img->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
cuda_exit(c->ctx); cuda_exit(c->ctx);
} }
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
......
...@@ -111,6 +111,12 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp, ...@@ -111,6 +111,12 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
} }
cuda_enter(c->ctx); cuda_enter(c->ctx);
cuda_wait(out->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait(out_grad->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait(inp->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*inp_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnPoolingBackward( err = cudnnPoolingBackward(
APPLY_SPECIFIC(_handle), desc, APPLY_SPECIFIC(_handle), desc,
alpha, alpha,
...@@ -120,6 +126,12 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp, ...@@ -120,6 +126,12 @@ int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
beta, beta,
APPLY_SPECIFIC(input_grad), PyGpuArray_DEV_DATA(*inp_grad) APPLY_SPECIFIC(input_grad), PyGpuArray_DEV_DATA(*inp_grad)
); );
cuda_record(out->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record(out_grad->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record(inp->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record((*inp_grad)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
cuda_exit(c->ctx); cuda_exit(c->ctx);
} }
......
...@@ -72,6 +72,10 @@ int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x, ...@@ -72,6 +72,10 @@ int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x,
} }
cuda_enter(c->ctx); cuda_enter(c->ctx);
cuda_wait(x->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnSoftmaxForward( err = cudnnSoftmaxForward(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(_handle),
SOFTMAX_ALGO, SOFTMAX_ALGO,
...@@ -83,6 +87,10 @@ int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x, ...@@ -83,6 +87,10 @@ int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x,
APPLY_SPECIFIC(output), APPLY_SPECIFIC(output),
PyGpuArray_DEV_DATA(*out) PyGpuArray_DEV_DATA(*out)
); );
cuda_record(x->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record((*out)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
cuda_exit(c->ctx); cuda_exit(c->ctx);
} }
......
...@@ -85,6 +85,11 @@ int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy, ...@@ -85,6 +85,11 @@ int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy,
} }
cuda_enter(c->ctx); cuda_enter(c->ctx);
cuda_wait(sm->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait(dy->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*dx)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
err = cudnnSoftmaxBackward( err = cudnnSoftmaxBackward(
APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(_handle),
SOFTMAX_ALGO, SOFTMAX_ALGO,
...@@ -98,6 +103,11 @@ int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy, ...@@ -98,6 +103,11 @@ int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy,
APPLY_SPECIFIC(dx), APPLY_SPECIFIC(dx),
PyGpuArray_DEV_DATA(*dx) PyGpuArray_DEV_DATA(*dx)
); );
cuda_record(sm->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record(dy->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record((*dx)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
cuda_exit(c->ctx); cuda_exit(c->ctx);
} }
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论