提交 3a59bd8c authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Major adaptation to handle explicit context activation.

上级 babe6f1b
#section support_code #section support_code
static cudnnHandle_t _handle = NULL;
static int static int
c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) { c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
...@@ -99,15 +98,21 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) { ...@@ -99,15 +98,21 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
#section init_code #section init_code
{ setup_ext_cuda();
cudnnStatus_t err;
if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) { #section support_code_struct
PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
cudnnGetErrorString(err)); cudnnHandle_t _handle;
#if PY_MAJOR_VERSION >= 3
return NULL; #section init_code_struct
#else
return; cuda_enter(pygpu_default_context()->ctx);
#endif cudnnStatus_t err;
} _handle = NULL;
if ((err = cudnnCreate(&_handle)) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "could not create cuDNN handle: %s",
cudnnGetErrorString(err));
cuda_exit(pygpu_default_context()->ctx);
FAIL;
} }
cuda_exit(pygpu_default_context()->ctx);
...@@ -10,6 +10,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -10,6 +10,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
float af = alpha, bf = beta; float af = alpha, bf = beta;
void *alpha_p; void *alpha_p;
void *beta_p; void *beta_p;
PyGpuContextObject *c = pygpu_default_context();
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) { if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
...@@ -43,8 +44,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -43,8 +44,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
Py_INCREF(*output); Py_INCREF(*output);
#else #else
if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om), if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om),
om->ga.typecode, GA_C_ORDER, om->ga.typecode, GA_C_ORDER, c) != 0)
pygpu_default_context()) != 0)
return 1; return 1;
if (beta != 0.0 && pygpu_move(*output, om)) if (beta != 0.0 && pygpu_move(*output, om))
return 1; return 1;
...@@ -55,6 +55,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -55,6 +55,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cudnnConvolutionFwdAlgo_t algo = CONV_ALGO; cudnnConvolutionFwdAlgo_t algo = CONV_ALGO;
cuda_enter(c->ctx);
#ifdef CHOOSE_ALGO #ifdef CHOOSE_ALGO
/* Static variables are only initialized once so this will not /* Static variables are only initialized once so this will not
* reset the previous algo every time */ * reset the previous algo every time */
...@@ -86,6 +87,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -86,6 +87,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s", "error selecting convolution algo: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1; return 1;
} }
algo = choice.algo; algo = choice.algo;
...@@ -96,6 +98,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -96,6 +98,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the " PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU: %s\n", "memory information on the GPU: %s\n",
cudaGetErrorString(err2)); cudaGetErrorString(err2));
cuda_exit(c->ctx);
return 1; return 1;
} }
...@@ -107,6 +110,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -107,6 +110,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s", "error selecting convolution algo: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1; return 1;
} }
#endif #endif
...@@ -145,6 +149,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -145,6 +149,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error getting convolution properties: %s", "error getting convolution properties: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1; return 1;
} }
...@@ -167,6 +172,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -167,6 +172,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
"are padded such that the padded inputs are larger " "are padded such that the padded inputs are larger "
"than the kernels. Update your installation of CuDNN " "than the kernels. Update your installation of CuDNN "
"to V3 or more recent to solve the issue."); "to V3 or more recent to solve the issue.");
cuda_exit(c->ctx);
return 1; return 1;
} }
} }
...@@ -175,7 +181,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -175,7 +181,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
{ {
size_t worksize; size_t worksize;
gpudata *workspace; gpudata *workspace;
PyGpuContextObject *c;
err = cudnnGetConvolutionForwardWorkspaceSize(_handle, err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
APPLY_SPECIFIC(input), APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(kerns),
...@@ -187,6 +192,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -187,6 +192,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error getting worksize: %s", "error getting worksize: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1; return 1;
} }
...@@ -196,11 +202,11 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -196,11 +202,11 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
* to place a nice get_work_mem() function in. * to place a nice get_work_mem() function in.
*/ */
if (worksize != 0) { if (worksize != 0) {
c = pygpu_default_context();
workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL); workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) { if (workspace == NULL) {
PyErr_SetString(PyExc_RuntimeError, PyErr_SetString(PyExc_RuntimeError,
"Could not allocate working memory"); "Could not allocate working memory");
cuda_exit(c->ctx);
return 1; return 1;
} }
} }
...@@ -218,6 +224,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -218,6 +224,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
if (worksize != 0) if (worksize != 0)
c->ops->buffer_release(workspace); c->ops->buffer_release(workspace);
} }
cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error doing operation: %s", PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
......
...@@ -9,6 +9,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -9,6 +9,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
float af = alpha, bf = beta; float af = alpha, bf = beta;
void *alpha_p; void *alpha_p;
void *beta_p; void *beta_p;
PyGpuContextObject *c = pygpu_default_context();
if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) { if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError, "images and kernel must have the same " PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
...@@ -42,8 +43,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -42,8 +43,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
Py_INCREF(*input); Py_INCREF(*input);
#else #else
if (theano_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im), if (theano_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im),
im->ga.typecode, GA_C_ORDER, im->ga.typecode, GA_C_ORDER, c) != 0)
pygpu_default_context()) != 0)
return 1; return 1;
if (beta != 0.0 && pygpu_move(*input, im)) if (beta != 0.0 && pygpu_move(*input, im))
return 1; return 1;
...@@ -54,6 +54,8 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -54,6 +54,8 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cudnnConvolutionBwdDataAlgo_t algo = CONV_ALGO; cudnnConvolutionBwdDataAlgo_t algo = CONV_ALGO;
cuda_enter(c->ctx);
#ifdef CHOOSE_ALGO #ifdef CHOOSE_ALGO
static int reuse_algo = 0; static int reuse_algo = 0;
static cudnnConvolutionBwdDataAlgo_t prev_algo = CONV_ALGO; static cudnnConvolutionBwdDataAlgo_t prev_algo = CONV_ALGO;
...@@ -83,6 +85,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -83,6 +85,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1; return 1;
} }
...@@ -94,6 +97,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -94,6 +97,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cudaGetLastError(); cudaGetLastError();
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory " PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory "
"information on the GPU: %s\n", cudaGetErrorString(err2)); "information on the GPU: %s\n", cudaGetErrorString(err2));
cuda_exit(c->ctx);
return 1; return 1;
} }
...@@ -104,6 +108,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -104,6 +108,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1; return 1;
} }
#endif #endif
...@@ -136,6 +141,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -136,6 +141,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error getting convolution properties: %s", "error getting convolution properties: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1; return 1;
} }
...@@ -149,7 +155,6 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -149,7 +155,6 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
size_t worksize; size_t worksize;
gpudata *workspace; gpudata *workspace;
PyGpuContextObject *c;
err = cudnnGetConvolutionBackwardDataWorkspaceSize( err = cudnnGetConvolutionBackwardDataWorkspaceSize(
_handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc, _handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
...@@ -158,15 +163,16 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -158,15 +163,16 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s", PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1; return 1;
} }
if (worksize != 0) { if (worksize != 0) {
c = pygpu_default_context();
workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL); workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) { if (workspace == NULL) {
PyErr_SetString(PyExc_RuntimeError, PyErr_SetString(PyExc_RuntimeError,
"Could not allocate working memory"); "Could not allocate working memory");
cuda_exit(c->ctx);
return 1; return 1;
} }
} }
...@@ -183,6 +189,8 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -183,6 +189,8 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
if (worksize != 0) if (worksize != 0)
c->ops->buffer_release(workspace); c->ops->buffer_release(workspace);
cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error doing operation: %s", PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
......
...@@ -9,6 +9,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -9,6 +9,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
float af = alpha, bf = beta; float af = alpha, bf = beta;
void *alpha_p; void *alpha_p;
void *beta_p; void *beta_p;
PyGpuContextObject *c = pygpu_default_context();
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) { if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
...@@ -42,8 +43,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -42,8 +43,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
Py_INCREF(*kerns); Py_INCREF(*kerns);
#else #else
if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km), if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km),
km->ga.typecode, GA_C_ORDER, km->ga.typecode, GA_C_ORDER, c) != 0)
pygpu_default_context()) != 0)
return 1; return 1;
if (beta != 0.0 && pygpu_move(*kerns, km)) if (beta != 0.0 && pygpu_move(*kerns, km))
return 1; return 1;
...@@ -54,6 +54,8 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -54,6 +54,8 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cudnnConvolutionBwdFilterAlgo_t algo = CONV_ALGO; cudnnConvolutionBwdFilterAlgo_t algo = CONV_ALGO;
cuda_enter(c->ctx);
#ifdef CHOOSE_ALGO #ifdef CHOOSE_ALGO
static int reuse_algo = 0; static int reuse_algo = 0;
static cudnnConvolutionBwdFilterAlgo_t prev_algo = CONV_ALGO; static cudnnConvolutionBwdFilterAlgo_t prev_algo = CONV_ALGO;
...@@ -84,6 +86,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -84,6 +86,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s", "error selecting convolution algo: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1; return 1;
} }
...@@ -95,6 +98,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -95,6 +98,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cudaGetLastError(); cudaGetLastError();
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory " PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory "
"information on the GPU: %s\n", cudaGetErrorString(err2)); "information on the GPU: %s\n", cudaGetErrorString(err2));
cuda_exit(c->ctx);
return 1; return 1;
} }
...@@ -106,6 +110,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -106,6 +110,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s", "error selecting convolution algo: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1; return 1;
} }
#endif #endif
...@@ -138,6 +143,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -138,6 +143,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error getting convolution properties: %s", "error getting convolution properties: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1; return 1;
} }
...@@ -151,7 +157,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -151,7 +157,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
size_t worksize; size_t worksize;
gpudata *workspace; gpudata *workspace;
PyGpuContextObject *c;
err = cudnnGetConvolutionBackwardFilterWorkspaceSize( err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, _handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
...@@ -160,14 +165,15 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -160,14 +165,15 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s", PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1; return 1;
} }
if (worksize != 0) { if (worksize != 0) {
c = pygpu_default_context();
workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL); workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) { if (workspace == NULL) {
PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory"); PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory");
cuda_exit(c->ctx);
return 1; return 1;
} }
} }
...@@ -184,6 +190,8 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -184,6 +190,8 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
if (worksize != 0) if (worksize != 0)
c->ops->buffer_release(workspace); c->ops->buffer_release(workspace);
cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error doing operation: %s", PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
......
#section support_code_struct

/* Per-apply cuDNN tensor descriptors for the pooling input and output.
 * Created in init_code_struct, destroyed in cleanup_code_struct. */
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);

#section init_code_struct

cudnnStatus_t APPLY_SPECIFIC(err);
/* Start as NULL so cleanup_code_struct only destroys descriptors that
 * were actually created (creation below may FAIL part-way through). */
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;

if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
               "(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
  PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
               "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
  FAIL;
}

#section cleanup_code_struct

/* NULL checks guard against a partially-completed init (see above). */
if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input)); }
if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); }
#section support_code_struct

/* Run a cuDNN pooling forward pass on `img` with pooling descriptor `desc`,
 * preparing/reusing `*out` for the result.
 * Returns 0 on success, 1 on failure with a Python exception set. */
int APPLY_SPECIFIC(dnn_pool)(PyGpuArrayObject *img,
                             cudnnPoolingDescriptor_t desc,
                             PyGpuArrayObject **out) {
  cudnnStatus_t err;
  size_t dims[5];
  PyGpuContextObject *c = pygpu_default_context();

  if (!GpuArray_IS_C_CONTIGUOUS(&img->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
    return 1;
  }

  if (c_set_tensorNd(img, APPLY_SPECIFIC(input)) != 0)
    return 1;

  cudnnPoolingMode_t mode;
  int w[3];
  int p[3];
  int s[3];
  int ndims;

  /* Query window (w), padding (p) and stride (s) from the descriptor.
   * ndims is the number of pooled spatial dimensions — the dims handling
   * below supports 2 or 3 (4D or 5D arrays). */
  err = cudnnGetPoolingNdDescriptor(desc, 3, &mode, &ndims, w, p, s);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "error doing cudnnGetPoolingDescriptor operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }

  /* dims[0]/dims[1] are batch and channels, carried over unchanged;
   * spatial dims follow (in + 2*pad - window) / stride + 1. */
  dims[0] = PyGpuArray_DIM(img, 0);
  dims[1] = PyGpuArray_DIM(img, 1);
  dims[2] = (PyGpuArray_DIM(img, 2) + (p[0]*2) - w[0]) / s[0] + 1;
  dims[3] = (PyGpuArray_DIM(img, 3) + (p[1]*2) - w[1]) / s[1] + 1;
  if (ndims == 3)
    dims[4] = (PyGpuArray_DIM(img, 4) + (p[2]*2) - w[2]) / s[2] + 1;

  if (theano_prep_output(out, ndims+2, dims, img->ga.typecode,
                         GA_C_ORDER, c) != 0)
    return 1;

  if (c_set_tensorNd(*out, APPLY_SPECIFIC(output)) != 0)
    return 1;

  {
    const float alpha = 1;
    const float beta = 0;
    /* The GPU context must be active around the cuDNN call. */
    cuda_enter(c->ctx);
    err = cudnnPoolingForward(
      _handle, desc,
      &alpha,
      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(img),
      &beta,
      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*out));
    cuda_exit(c->ctx);
  }

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "GpuDnnPool: error doing cudnnPoolingForward operation: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
#section support_code_struct

/* Per-apply cuDNN tensor descriptors for pooling gradient: forward input,
 * its gradient, forward output, and the incoming output gradient. */
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(input_grad);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output_grad);

#section init_code_struct

/* Start as NULL so cleanup_code_struct only destroys descriptors that
 * were actually created (creation below may FAIL part-way through). */
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(input_grad) = NULL;
APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(output_grad) = NULL;

{
  cudnnStatus_t err;
  if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError,
                 "could not allocate tensor descriptor (input): %s",
                 cudnnGetErrorString(err));
    FAIL;
  }
  if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input_grad))) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError,
                 "could not allocate tensor descriptor (input_grad): %s",
                 cudnnGetErrorString(err));
    FAIL;
  }
  if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError,
                 "could not allocate tensor descriptor (output): %s",
                 cudnnGetErrorString(err));
    FAIL;
  }
  if ((err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output_grad))) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError,
                 "could not allocate tensor descriptor (output_grad): %s",
                 cudnnGetErrorString(err));
    FAIL;
  }
}

#section cleanup_code_struct

/* NULL checks guard against a partially-completed init (see above). */
if (APPLY_SPECIFIC(input) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input)); }
if (APPLY_SPECIFIC(input_grad) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input_grad)); }
if (APPLY_SPECIFIC(output) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); }
if (APPLY_SPECIFIC(output_grad) != NULL) { cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output_grad)); }
#section support_code_struct

/* Compute the gradient of a cuDNN pooling operation.
 *
 * inp:      forward-pass input (must be C-contiguous)
 * out:      forward-pass output (must be C-contiguous)
 * out_grad: gradient w.r.t. the forward output (must be C-contiguous)
 * desc:     pooling descriptor used for the forward pass
 * inp_grad: receives the gradient w.r.t. `inp` (prepared/reused here)
 *
 * Returns 0 on success, 1 on failure with a Python exception set. */
int APPLY_SPECIFIC(dnn_pool_grad)(PyGpuArrayObject *inp,
                                  PyGpuArrayObject *out,
                                  PyGpuArrayObject *out_grad,
                                  cudnnPoolingDescriptor_t desc,
                                  PyGpuArrayObject **inp_grad) {
  cudnnStatus_t err;
  PyGpuContextObject *c = pygpu_default_context();

  if (!GpuArray_IS_C_CONTIGUOUS(&inp->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous inputs are supported.");
    return 1;
  }

  if (!GpuArray_IS_C_CONTIGUOUS(&out_grad->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous input gradients are supported.");
    return 1;
  }

  if (!GpuArray_IS_C_CONTIGUOUS(&out->ga)) {
    PyErr_SetString(PyExc_ValueError, "Only contiguous outputs are supported.");
    return 1;
  }

  if (c_set_tensorNd(inp, APPLY_SPECIFIC(input)) != 0)
    return 1;
  if (c_set_tensorNd(out_grad, APPLY_SPECIFIC(output_grad)) != 0)
    return 1;
  if (c_set_tensorNd(out, APPLY_SPECIFIC(output)) != 0)
    return 1;

  /* Consistency fix: use the context already cached in `c` instead of
   * calling pygpu_default_context() a second time, matching the other
   * functions in this file. */
  if (theano_prep_output(inp_grad, PyGpuArray_NDIM(inp),
                         PyGpuArray_DIMS(inp), out->ga.typecode,
                         GA_C_ORDER, c) != 0) {
    return 1;
  }

  if (c_set_tensorNd(*inp_grad, APPLY_SPECIFIC(input_grad)) != 0)
    return 1;

  {
    const float alpha = 1;
    const float beta = 0;
    /* The GPU context must be active around the cuDNN call. */
    cuda_enter(c->ctx);
    err = cudnnPoolingBackward(
      _handle, desc,
      &alpha,
      APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(out),
      APPLY_SPECIFIC(output_grad), PyGpuArray_DEV_DATA(out_grad),
      APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(inp),
      &beta,
      APPLY_SPECIFIC(input_grad), PyGpuArray_DEV_DATA(*inp_grad)
    );
    cuda_exit(c->ctx);
  }

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error doing operation: %s.",
                 cudnnGetErrorString(err));
    return 1;
  }
  return 0;
}
#section support_code_struct

/* Per-apply cuDNN tensor descriptors for the softmax input and output. */
cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);

#section init_code_struct

/* Start as NULL so cleanup_code_struct only destroys descriptors that
 * were actually created (creation below may FAIL part-way through). */
APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;

{
  cudnnStatus_t err;
  err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input));
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
                 cudnnGetErrorString(err));
    FAIL;
  }
  err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output));
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
                 cudnnGetErrorString(err));
    FAIL;
  }
}

#section cleanup_code_struct

/* NULL checks guard against a partially-completed init (see above). */
if (APPLY_SPECIFIC(input) != NULL)
  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(input));
if (APPLY_SPECIFIC(output) != NULL)
  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
#section support_code_struct

/* Apply a cuDNN softmax forward pass to `x`, preparing/reusing `*out`
 * for the result. SOFTMAX_ALGO and SOFTMAX_MODE are compile-time macros
 * selecting the cuDNN algorithm and mode.
 * Returns 0 on success, 1 on failure with a Python exception set. */
int APPLY_SPECIFIC(softmax)(PyGpuArrayObject *x,
                            PyGpuArrayObject **out) {
  PyGpuContextObject *ctx = pygpu_default_context();
  cudnnStatus_t status;

  if (c_set_tensorNd(x, APPLY_SPECIFIC(input)) != 0)
    return 1;

  if (theano_prep_output(out, PyGpuArray_NDIM(x),
                         PyGpuArray_DIMS(x), x->ga.typecode,
                         GA_C_ORDER, ctx) != 0)
    return 1;

  if (c_set_tensorNd(*out, APPLY_SPECIFIC(output)) != 0)
    return 1;

  {
    /* Plain y = softmax(x): scale input by 1, do not blend with output. */
    const float one = 1.;
    const float zero = 0.;
    /* The GPU context must be active around the cuDNN call. */
    cuda_enter(ctx->ctx);
    status = cudnnSoftmaxForward(_handle,
                                 SOFTMAX_ALGO,
                                 SOFTMAX_MODE,
                                 (void *)&one,
                                 APPLY_SPECIFIC(input),
                                 PyGpuArray_DEV_DATA(x),
                                 (void *)&zero,
                                 APPLY_SPECIFIC(output),
                                 PyGpuArray_DEV_DATA(*out));
    cuda_exit(ctx->ctx);
  }

  if (status != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error during operation: %s",
                 cudnnGetErrorString(status));
    return 1;
  }
  return 0;
}
#section support_code_struct

/* Per-apply cuDNN tensor descriptors for softmax gradient:
 * dy (incoming gradient), sm (softmax output), out (resulting gradient). */
cudnnTensorDescriptor_t APPLY_SPECIFIC(dy);
cudnnTensorDescriptor_t APPLY_SPECIFIC(sm);
cudnnTensorDescriptor_t APPLY_SPECIFIC(out);

#section init_code_struct

/* Start as NULL so cleanup_code_struct only destroys descriptors that
 * were actually created (creation below may FAIL part-way through). */
APPLY_SPECIFIC(dy) = NULL;
APPLY_SPECIFIC(sm) = NULL;
APPLY_SPECIFIC(out) = NULL;

{
  cudnnStatus_t err;
  err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(dy));
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
                 cudnnGetErrorString(err));
    FAIL;
  }
  err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(sm));
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
                 cudnnGetErrorString(err));
    FAIL;
  }
  err = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(out));
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor: %s",
                 cudnnGetErrorString(err));
    FAIL;
  }
}

#section cleanup_code_struct

/* NULL checks guard against a partially-completed init (see above). */
if (APPLY_SPECIFIC(dy) != NULL)
  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(dy));
if (APPLY_SPECIFIC(sm) != NULL)
  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(sm));
if (APPLY_SPECIFIC(out) != NULL)
  cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(out));
#section support_code_struct

/* Compute the gradient of a cuDNN softmax.
 *
 * dy:  gradient w.r.t. the softmax output
 * sm:  the softmax output from the forward pass
 * out: receives the gradient w.r.t. the softmax input (prepared/reused here)
 *
 * Returns 0 on success, 1 on failure with a Python exception set. */
int APPLY_SPECIFIC(softmax_grad)(PyGpuArrayObject *dy,
                                 PyGpuArrayObject *sm,
                                 PyGpuArrayObject **out) {
  PyGpuContextObject *ctx = pygpu_default_context();
  cudnnStatus_t status;

  if (c_set_tensorNd(dy, APPLY_SPECIFIC(dy)) != 0)
    return 1;
  if (c_set_tensorNd(sm, APPLY_SPECIFIC(sm)) != 0)
    return 1;

  if (theano_prep_output(out, PyGpuArray_NDIM(dy),
                         PyGpuArray_DIMS(dy), dy->ga.typecode,
                         GA_C_ORDER, ctx) != 0)
    return 1;

  if (c_set_tensorNd(*out, APPLY_SPECIFIC(out)) != 0)
    return 1;

  {
    /* Scale incoming gradient by 1, do not blend with prior output. */
    const float one = 1.;
    const float zero = 0.;
    /* The GPU context must be active around the cuDNN call. */
    cuda_enter(ctx->ctx);
    status = cudnnSoftmaxBackward(_handle,
                                  SOFTMAX_ALGO,
                                  SOFTMAX_MODE,
                                  (void *)&one,
                                  APPLY_SPECIFIC(sm),
                                  PyGpuArray_DEV_DATA(sm),
                                  APPLY_SPECIFIC(dy),
                                  PyGpuArray_DEV_DATA(dy),
                                  (void *)&zero,
                                  APPLY_SPECIFIC(out),
                                  PyGpuArray_DEV_DATA(*out));
    cuda_exit(ctx->ctx);
  }

  if (status != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error during operation: %s",
                 cudnnGetErrorString(status));
    return 1;
  }
  return 0;
}
...@@ -207,11 +207,10 @@ def test_pooling(): ...@@ -207,11 +207,10 @@ def test_pooling():
(32, 1, 147, 197), (32, 1, 147, 197),
]: ]:
data = numpy.random.normal(0, 1, shp).astype("float32") data = numpy.random.normal(0, 1, shp).astype("float32")
a = f1(data).__array__() a = f1(data)
b = f2(data)
b = f2(data).__array__() utt.assert_allclose(a, b)
assert numpy.allclose(a, b,
atol=numpy.finfo(numpy.float32).eps)
# Test the grad # Test the grad
for shp in [(1, 1, 2, 2), for shp in [(1, 1, 2, 2),
...@@ -228,9 +227,9 @@ def test_pooling(): ...@@ -228,9 +227,9 @@ def test_pooling():
def fn(x): def fn(x):
return max_pool_2d(x, (ws, ws), ignore_border=True, return max_pool_2d(x, (ws, ws), ignore_border=True,
padding=pad, mode=mode) padding=pad, mode=mode)
theano.tests.unittest_tools.verify_grad(fn, [data], utt.verify_grad(fn, [data],
cast_to_output_type=False, cast_to_output_type=False,
mode=mode_with_gpu) mode=mode_with_gpu)
# Confirm that the opt would have inserted it. # Confirm that the opt would have inserted it.
fg = theano.function([x], theano.grad(fn(x).sum(), x), fg = theano.function([x], theano.grad(fn(x).sum(), x),
mode=mode_with_gpu) mode=mode_with_gpu)
...@@ -245,10 +244,9 @@ def test_pooling(): ...@@ -245,10 +244,9 @@ def test_pooling():
pad=pad, pad=pad,
mode=mode) mode=mode)
return dnn_op return dnn_op
theano.tests.unittest_tools.verify_grad( utt.verify_grad(fn, [data],
fn, [data], cast_to_output_type=False,
cast_to_output_type=False, mode=mode_with_gpu)
mode=mode_with_gpu)
# Confirm that we get the good op. # Confirm that we get the good op.
fg = theano.function([x], theano.grad(fn(x).sum(), x), fg = theano.function([x], theano.grad(fn(x).sum(), x),
mode=mode_with_gpu) mode=mode_with_gpu)
...@@ -256,7 +254,7 @@ def test_pooling(): ...@@ -256,7 +254,7 @@ def test_pooling():
for node in fg.maker.fgraph.toposort()]) for node in fg.maker.fgraph.toposort()])
g_out = fg(data) g_out = fg(data)
# Compare again the CPU result # Compare against the CPU result
out = max_pool_2d(x, (ws, ws), out = max_pool_2d(x, (ws, ws),
padding=pad, padding=pad,
ignore_border=True, mode=mode) ignore_border=True, mode=mode)
...@@ -269,7 +267,7 @@ def test_pooling(): ...@@ -269,7 +267,7 @@ def test_pooling():
assert any([isinstance(node.op, AveragePoolGrad) assert any([isinstance(node.op, AveragePoolGrad)
for node in fc.maker.fgraph.toposort()]) for node in fc.maker.fgraph.toposort()])
c_out = fc(data) c_out = fc(data)
assert numpy.allclose(c_out, g_out) utt.assert_allclose(c_out, g_out)
def test_pooling_opt(): def test_pooling_opt():
...@@ -703,7 +701,7 @@ class test_SoftMax(test_nnet.test_SoftMax): ...@@ -703,7 +701,7 @@ class test_SoftMax(test_nnet.test_SoftMax):
out = f(data) out = f(data)
gout = numpy.asarray(f_gpu(gdata))[:, :, 0, 0] gout = numpy.asarray(f_gpu(gdata))[:, :, 0, 0]
assert numpy.allclose(out, gout), numpy.absolute(out - gout) utt.assert_allclose(out, gout)
x = T.matrix('x', 'float32') x = T.matrix('x', 'float32')
x_gpu = T.tensor4('x_gpu', 'float32') x_gpu = T.tensor4('x_gpu', 'float32')
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论