提交 c5944c7d authored 作者: Boris Fomitchev

Fixed overflow with workspace size, tensor_op setting for FindEx

上级 e596e80e
......@@ -26,6 +26,20 @@ static int c_check_groups_for_conv(cudnnConvolutionDescriptor_t desc, int groups
#endif
}
/* Set the math type (e.g. CUDNN_TENSOR_OP_MATH vs. default FMA math) on a
 * cuDNN convolution descriptor.
 *
 * On cuDNN 6 and earlier the concept does not exist, so the call is compiled
 * out and treated as a success.
 *
 * Returns 1 on success; on failure sets a Python RuntimeError and returns -1
 * (callers test the result with `== -1`).
 */
static int c_set_math_type_for_conv(cudnnConvolutionDescriptor_t desc, cudnnMathType_t mathtype) {
#if CUDNN_MAJOR >= 7
  /* cuDNN 7+ selects tensor-op eligibility per convolution descriptor. */
  cudnnStatus_t status = cudnnSetConvolutionMathType(desc, mathtype);
  if (status == CUDNN_STATUS_SUCCESS)
    return 1;
  PyErr_Format(PyExc_RuntimeError,
               "error setting math type for convolution : %s",
               cudnnGetErrorString(status));
  return -1;
#else
  return 1;
#endif
}
#section init_code_struct
cudnnStatus_t APPLY_SPECIFIC(err);
......@@ -83,19 +97,19 @@ static cudnnStatus_t checkCudnnStatus(cudnnStatus_t err, const char* msg)
return err;
}
static int
static size_t
c_get_largest_free_block_size(PyGpuContextObject *c)
{
size_t free = 0;
size_t maxfree = 0;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &maxfree);
if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU");
}
// Guess 4Mb if the info is not available
if (free == 0) free = 4 * 1024 * 1024;
return free;
if (maxfree == 0) maxfree = 4 * 1024 * 1024;
return maxfree;
}
/** Check if convolution output tensor has expected dimensions
......
......@@ -186,11 +186,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
std::string hashkey;
size_t free = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) return 1;
cuda_enter(c->ctx);
size_t maxfree = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) return 1;
if (params->choose_algo) {
if (!reuse_algo) {
......@@ -220,12 +221,14 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cudnnConvolutionFwdAlgoPerf_t choice;
gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
tmpmem = gpudata_alloc(c->ctx, maxfree, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
PyErr_SetString(PyExc_MemoryError, "Could not allocate GPU memory for FindEx");
cuda_exit(c->ctx);
return -1;
}
// set the 'tensor math ok' flag
c_set_math_type_for_conv(desc, CUDNN_TENSOR_OP_MATH);
// We don't sync the buffer as we don't care about the values.
err = cudnnFindConvolutionForwardAlgorithmEx(
......@@ -233,7 +236,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output),
1, &count, &choice, *(void **)tmpmem,
free);
maxfree);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -269,7 +272,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
err = cudnnGetConvolutionForwardAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
......@@ -283,8 +286,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
}
}
}
if (dnn_conv_fwd_fallback(&algo, input, kerns, desc) != 0) {
if (c_set_math_type_for_conv(desc, mathtype) == -1 ||
dnn_conv_fwd_fallback(&algo, input, kerns, desc) != 0) {
cuda_exit(c->ctx);
return 1;
}
......@@ -343,12 +347,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_exit(c->ctx);
return 1;
}
fprintf(stderr, "(using %s %s%s%s%s, ws:%ld, hash:%s)\n",
fprintf(stderr, "(using %s%s %s%s%s, ws:%ld, hash:%s)\n",
algorithm_name,
mathtype == CUDNN_TENSOR_OP_MATH ? "[T]" : "",
params->choose_time ? "(timed)": "" ,
reuse_algo ? "(reused)" : "",
use_cached ? "(cache)": "",
mathtype == CUDNN_TENSOR_OP_MATH ? "(tensor op)" : "",
worksize,
hashkey.c_str()
);
......@@ -361,18 +365,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
{
gpudata *workspace = 0;
#if CUDNN_MAJOR >= 7
// CUDNN7: need to set math type
err = cudnnSetConvolutionMathType(desc, mathtype);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error setting math type for convolution : %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
#endif
/*
* This is less than ideal since we need to free it after (which
* introduces a synchronization point. But we don't have a module
......
......@@ -156,7 +156,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
std::string hashkey;
size_t free = c_get_largest_free_block_size(c);
size_t maxfree = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) return 1;
cuda_enter(c->ctx);
......@@ -190,7 +190,10 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cudnnConvolutionBwdDataAlgoPerf_t choice;
gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
// set the 'tensor math ok' flag
c_set_math_type_for_conv(desc, CUDNN_TENSOR_OP_MATH);
tmpmem = gpudata_alloc(c->ctx, maxfree, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
cuda_exit(c->ctx);
......@@ -201,7 +204,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
params->handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input),
1, &count, &choice, *(void **)tmpmem, free);
1, &count, &choice, *(void **)tmpmem, maxfree);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -235,7 +238,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
err = cudnnGetConvolutionBackwardDataAlgorithm(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
......@@ -248,8 +251,9 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
}
}
}
if (dnn_conv_gi_fallback(&algo, *input, kerns, desc) != 0) {
if (c_set_math_type_for_conv(desc, mathtype) == -1 ||
dnn_conv_gi_fallback(&algo, *input, kerns, desc) != 0) {
cuda_exit(c->ctx);
return 1;
}
......@@ -313,23 +317,11 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
}
#endif
if (params->choose_once) {
reuse_algo = 1;
}
gpudata *workspace = 0;
#if CUDNN_MAJOR >= 7
// CUDNN7: need to set math type
err = cudnnSetConvolutionMathType(desc, mathtype);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error setting math type for convolution : %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
#endif
if (params->choose_once) {
reuse_algo = 1;
}
gpudata *workspace = 0;
if (worksize != 0) {
workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) {
......
......@@ -143,7 +143,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
std::string hashkey ;
size_t free = c_get_largest_free_block_size(c);
size_t maxfree = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) return 1;
cuda_enter(c->ctx);
......@@ -176,8 +176,11 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
int count;
cudnnConvolutionBwdFilterAlgoPerf_t choice;
gpudata *tmpmem;
// set the 'tensor math ok' flag
c_set_math_type_for_conv(desc, CUDNN_TENSOR_OP_MATH);
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
tmpmem = gpudata_alloc(c->ctx, maxfree, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
cuda_exit(c->ctx);
......@@ -188,7 +191,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns),
1, &count, &choice, *(void **)tmpmem, free);
1, &count, &choice, *(void **)tmpmem, maxfree);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
......@@ -224,7 +227,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
err = cudnnGetConvolutionBackwardFilterAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo);
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
......@@ -238,8 +241,9 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
}
}
} /* choose_algo */
if (dnn_conv_gw_fallback(&algo, input, *kerns, desc) != 0) {
if (c_set_math_type_for_conv(desc, mathtype) == -1 ||
dnn_conv_gw_fallback(&algo, input, *kerns, desc) != 0) {
cuda_exit(c->ctx);
return 1;
}
......@@ -303,22 +307,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
}
#endif
if (params->choose_once) {
reuse_algo = 1;
}
gpudata *workspace = 0;
#if CUDNN_MAJOR >= 7
// CUDNN7: need to set math type
err = cudnnSetConvolutionMathType(desc, mathtype);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error setting math type for convolution : %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
#endif
if (params->choose_once) {
reuse_algo = 1;
}
gpudata *workspace = 0;
if (worksize != 0) {
workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) {
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论