提交 c5944c7d authored 作者: Boris Fomitchev's avatar Boris Fomitchev

Fixed overflow with workspace size, tensor_op setting for FindEx

上级 e596e80e
...@@ -26,6 +26,20 @@ static int c_check_groups_for_conv(cudnnConvolutionDescriptor_t desc, int groups ...@@ -26,6 +26,20 @@ static int c_check_groups_for_conv(cudnnConvolutionDescriptor_t desc, int groups
#endif #endif
} }
/* Apply the requested cuDNN math mode (e.g. CUDNN_TENSOR_OP_MATH) to a
 * convolution descriptor.
 *
 * On cuDNN versions before 7 this is a no-op that reports success, since
 * per-descriptor math types do not exist there.
 *
 * Returns 1 on success; returns -1 after setting a Python RuntimeError
 * when cudnnSetConvolutionMathType fails.
 */
static int c_set_math_type_for_conv(cudnnConvolutionDescriptor_t desc, cudnnMathType_t mathtype) {
#if CUDNN_MAJOR >= 7
  /* cuDNN 7 introduced selectable math types; apply the caller's choice. */
  cudnnStatus_t status = cudnnSetConvolutionMathType(desc, mathtype);
  if (status == CUDNN_STATUS_SUCCESS)
    return 1;
  PyErr_Format(PyExc_RuntimeError,
               "error setting math type for convolution : %s",
               cudnnGetErrorString(status));
  return -1;
#else
  return 1;
#endif
}
#section init_code_struct #section init_code_struct
cudnnStatus_t APPLY_SPECIFIC(err); cudnnStatus_t APPLY_SPECIFIC(err);
...@@ -83,19 +97,19 @@ static cudnnStatus_t checkCudnnStatus(cudnnStatus_t err, const char* msg) ...@@ -83,19 +97,19 @@ static cudnnStatus_t checkCudnnStatus(cudnnStatus_t err, const char* msg)
return err; return err;
} }
static int static size_t
c_get_largest_free_block_size(PyGpuContextObject *c) c_get_largest_free_block_size(PyGpuContextObject *c)
{ {
size_t free = 0; size_t maxfree = 0;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free); int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &maxfree);
if (err2 != GA_NO_ERROR) { if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the " PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU"); "memory information on the GPU");
} }
// Guess 4Mb if the info is not available // Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024; if (maxfree == 0) maxfree = 4 * 1024 * 1024;
return free; return maxfree;
} }
/** Check if convolution output tensor has expected dimensions /** Check if convolution output tensor has expected dimensions
......
...@@ -186,11 +186,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -186,11 +186,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
std::string hashkey; std::string hashkey;
size_t free = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) return 1;
cuda_enter(c->ctx); cuda_enter(c->ctx);
size_t maxfree = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) return 1;
if (params->choose_algo) { if (params->choose_algo) {
if (!reuse_algo) { if (!reuse_algo) {
...@@ -220,12 +221,14 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -220,12 +221,14 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cudnnConvolutionFwdAlgoPerf_t choice; cudnnConvolutionFwdAlgoPerf_t choice;
gpudata *tmpmem; gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL); tmpmem = gpudata_alloc(c->ctx, maxfree, NULL, 0, NULL);
if (tmpmem == NULL) { if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory"); PyErr_SetString(PyExc_MemoryError, "Could not allocate GPU memory for FindEx");
cuda_exit(c->ctx); cuda_exit(c->ctx);
return -1; return -1;
} }
// set the 'tensor math ok' flag
c_set_math_type_for_conv(desc, CUDNN_TENSOR_OP_MATH);
// We don't sync the buffer as we don't care about the values. // We don't sync the buffer as we don't care about the values.
err = cudnnFindConvolutionForwardAlgorithmEx( err = cudnnFindConvolutionForwardAlgorithmEx(
...@@ -233,7 +236,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -233,7 +236,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output), desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output),
1, &count, &choice, *(void **)tmpmem, 1, &count, &choice, *(void **)tmpmem,
free); maxfree);
gpudata_release(tmpmem); gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -269,7 +272,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -269,7 +272,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
err = cudnnGetConvolutionForwardAlgorithm( err = cudnnGetConvolutionForwardAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo); CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s", "error selecting convolution algo: %s",
...@@ -283,8 +286,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -283,8 +286,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
} }
} }
} }
if (dnn_conv_fwd_fallback(&algo, input, kerns, desc) != 0) { if (c_set_math_type_for_conv(desc, mathtype) == -1 ||
dnn_conv_fwd_fallback(&algo, input, kerns, desc) != 0) {
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
...@@ -343,12 +347,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -343,12 +347,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
fprintf(stderr, "(using %s %s%s%s%s, ws:%ld, hash:%s)\n", fprintf(stderr, "(using %s%s %s%s%s, ws:%ld, hash:%s)\n",
algorithm_name, algorithm_name,
mathtype == CUDNN_TENSOR_OP_MATH ? "[T]" : "",
params->choose_time ? "(timed)": "" , params->choose_time ? "(timed)": "" ,
reuse_algo ? "(reused)" : "", reuse_algo ? "(reused)" : "",
use_cached ? "(cache)": "", use_cached ? "(cache)": "",
mathtype == CUDNN_TENSOR_OP_MATH ? "(tensor op)" : "",
worksize, worksize,
hashkey.c_str() hashkey.c_str()
); );
...@@ -361,18 +365,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -361,18 +365,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
{ {
gpudata *workspace = 0; gpudata *workspace = 0;
#if CUDNN_MAJOR >= 7
// CUDNN7: need to set math type
err = cudnnSetConvolutionMathType(desc, mathtype);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error setting math type for convolution : %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
#endif
/* /*
* This is less than ideal since we need to free it after (which * This is less than ideal since we need to free it after (which
* introduces a synchronization point. But we don't have a module * introduces a synchronization point. But we don't have a module
......
...@@ -156,7 +156,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -156,7 +156,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
std::string hashkey; std::string hashkey;
size_t free = c_get_largest_free_block_size(c); size_t maxfree = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) return 1; if (PyErr_Occurred()) return 1;
cuda_enter(c->ctx); cuda_enter(c->ctx);
...@@ -190,7 +190,10 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -190,7 +190,10 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cudnnConvolutionBwdDataAlgoPerf_t choice; cudnnConvolutionBwdDataAlgoPerf_t choice;
gpudata *tmpmem; gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL); // set the 'tensor math ok' flag
c_set_math_type_for_conv(desc, CUDNN_TENSOR_OP_MATH);
tmpmem = gpudata_alloc(c->ctx, maxfree, NULL, 0, NULL);
if (tmpmem == NULL) { if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory"); PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
cuda_exit(c->ctx); cuda_exit(c->ctx);
...@@ -201,7 +204,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -201,7 +204,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
params->handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), params->handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input), APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input),
1, &count, &choice, *(void **)tmpmem, free); 1, &count, &choice, *(void **)tmpmem, maxfree);
gpudata_release(tmpmem); gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -235,7 +238,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -235,7 +238,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
err = cudnnGetConvolutionBackwardDataAlgorithm( err = cudnnGetConvolutionBackwardDataAlgorithm(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(input), desc, APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo); CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
...@@ -248,8 +251,9 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -248,8 +251,9 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
} }
} }
} }
if (dnn_conv_gi_fallback(&algo, *input, kerns, desc) != 0) { if (c_set_math_type_for_conv(desc, mathtype) == -1 ||
dnn_conv_gi_fallback(&algo, *input, kerns, desc) != 0) {
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
...@@ -313,23 +317,11 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -313,23 +317,11 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
} }
#endif #endif
if (params->choose_once) { if (params->choose_once) {
reuse_algo = 1; reuse_algo = 1;
} }
gpudata *workspace = 0; gpudata *workspace = 0;
#if CUDNN_MAJOR >= 7
// CUDNN7: need to set math type
err = cudnnSetConvolutionMathType(desc, mathtype);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error setting math type for convolution : %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
#endif
if (worksize != 0) { if (worksize != 0) {
workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL); workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) { if (workspace == NULL) {
......
...@@ -143,7 +143,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -143,7 +143,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
std::string hashkey ; std::string hashkey ;
size_t free = c_get_largest_free_block_size(c); size_t maxfree = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) return 1; if (PyErr_Occurred()) return 1;
cuda_enter(c->ctx); cuda_enter(c->ctx);
...@@ -176,8 +176,11 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -176,8 +176,11 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
int count; int count;
cudnnConvolutionBwdFilterAlgoPerf_t choice; cudnnConvolutionBwdFilterAlgoPerf_t choice;
gpudata *tmpmem; gpudata *tmpmem;
// set the 'tensor math ok' flag
c_set_math_type_for_conv(desc, CUDNN_TENSOR_OP_MATH);
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL); tmpmem = gpudata_alloc(c->ctx, maxfree, NULL, 0, NULL);
if (tmpmem == NULL) { if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory"); PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
cuda_exit(c->ctx); cuda_exit(c->ctx);
...@@ -188,7 +191,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -188,7 +191,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns),
1, &count, &choice, *(void **)tmpmem, free); 1, &count, &choice, *(void **)tmpmem, maxfree);
gpudata_release(tmpmem); gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -224,7 +227,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -224,7 +227,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
err = cudnnGetConvolutionBackwardFilterAlgorithm( err = cudnnGetConvolutionBackwardFilterAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo); CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s", "error selecting convolution algo: %s",
...@@ -238,8 +241,9 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -238,8 +241,9 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
} }
} }
} /* choose_algo */ } /* choose_algo */
if (dnn_conv_gw_fallback(&algo, input, *kerns, desc) != 0) { if (c_set_math_type_for_conv(desc, mathtype) == -1 ||
dnn_conv_gw_fallback(&algo, input, *kerns, desc) != 0) {
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
...@@ -303,22 +307,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -303,22 +307,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
} }
#endif #endif
if (params->choose_once) { if (params->choose_once) {
reuse_algo = 1; reuse_algo = 1;
} }
gpudata *workspace = 0; gpudata *workspace = 0;
#if CUDNN_MAJOR >= 7
// CUDNN7: need to set math type
err = cudnnSetConvolutionMathType(desc, mathtype);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error setting math type for convolution : %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
#endif
if (worksize != 0) { if (worksize != 0) {
workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL); workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) { if (workspace == NULL) {
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论