Commit 87641e6f authored by notoraptor

Remove `AlgoRec.dataType` field as it is used nowhere.

Add some tests, to check at least if everything runs well. Standardize all three files (dnn_fwd, dnn_gi, dnn_gw), to have the same code logic and organization across these three codes. Add fallback entries as fallback functions. Add `-t` to hash for timed algorithms.
Parent 5ce08bc8
......@@ -17,7 +17,6 @@ static inline int cudnnGetVersion() {
/* A cached algorithm choice: a common struct for all 3 CUDNN algorithm
   enums (fwd, bwd-data, bwd-filter). `algo` stores the enum value as an
   int and is cast back to the direction-specific enum at use sites. */
struct AlgoRec {
int algo;                  // algorithm enum value (cast per conv direction)
cudnnDataType_t dataType;  // NOTE(review): commit message says this field is used nowhere — confirm removal
size_t wsSize;             // workspace size in bytes required by `algo`
cudnnMathType_t mathType;  // math mode (e.g. tensor-op) selected with `algo`
};
......
#section init_code_struct
reuse_algo = 0;
use_cached = 0;
prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH;
prev_algo.dataType = CUDNN_DATA_FLOAT;
reuse_algo = 0;
hash_prefix = std::string("FWD|GPU#");
#section support_code_struct
#line 11 "dnn_fwd.c"
#line 9 "dnn_fwd.c"
int reuse_algo;
bool use_cached;
AlgoRec prev_algo;
std::string hash_prefix;
#ifdef DEBUG
char algorithm_name[128];
#endif
/** Check the given forward-convolution algorithm against the inputs and the
convolution descriptor; change the algorithm inplace to a fallback
algorithm if the checks fail.
Return 0 on success, non-0 on error (a Python exception is set on error).
NB: when DEBUG is defined, this uses the file-scope buffer `algorithm_name`. **/
int dnn_conv_fwd_fallback(cudnnConvolutionFwdAlgo_t* _algo,
const PyGpuArrayObject* input,
const PyGpuArrayObject* kerns,
cudnnConvolutionDescriptor_t desc) {
cudnnConvolutionFwdAlgo_t algo = *_algo;
/* Only these algos are supported for 3d conv with cuDNN >= V5.1. */
// NDIM == 5 means a 3D convolution (batch, channels, 3 spatial dims).
if (PyGpuArray_NDIM(input) == 5 &&
!(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM ||
algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING))
{
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
return 1;
fprintf(stderr, "(%s unsupported for 3D: fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n", algorithm_name);
#endif
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
// Algo `small` does not work for a batch size > 2^16, with cuDNN >= V5.1.
// Issue should be resolved for cuDNN > V6.0.
// NB: In cuDNN V7, issue is resolved for 2D convolutions only.
// NOTE(review): assumes cudnnGetVersion() reports >= 6100 for the fixed
// release — TODO confirm the exact version number.
if ((cudnnGetVersion() < 6100 || PyGpuArray_NDIM(input) == 5) &&
algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM &&
PyGpuArray_DIM(input, 0) > 65536)
{
#ifdef DEBUG
fprintf(stderr, "(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM "
"will fail with batch size > 2^16, fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n");
#endif
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation
// does not support strides.
// If the chosen implementation is FFT or tiled-FFT, validate that it can
// be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
/* NB:
TODO: These checkings seems outdated for FFT algorithms with cuDNN >= 5.1.
New conditions apply and may depend on number of dimensions (2D or 3D)
e.g. for FFT_TILING.
TODO: More globally, how to handle CUDNN_STATUS_NOT_SUPPORTED with unsupported algorithms?
*/
if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && PyGpuArray_NDIM(input) == 4) {
// Extract the properties of the convolution descriptor
int nd;
int pad[2];
int stride[2];
int dilation[2];
cudnnConvolutionMode_t mode;
cudnnDataType_t data_type;
cudnnStatus_t err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, dilation, &mode, &data_type);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s",
cudnnGetErrorString(err));
return 1;
}
if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
// FFT: reject strides != 1, spatial dims > 1024, and 1x1 kernels.
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
(PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
{
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
#ifdef DEBUG
fprintf(stderr, "(replacing fwd algo fft with none)\n");
#endif
}
} else {
// algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
// tiled-FFT: only strides are rejected.
if (stride[0] != 1 || stride[1] != 1) {
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
#ifdef DEBUG
fprintf(stderr, "(replacing fwd algo fft_tiling with none)\n");
#endif
}
}
}
*_algo = algo;
return 0;
}
int
APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
......@@ -25,6 +117,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
void *beta_p;
float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
bool use_cached = 0;
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
PyErr_SetString(PyExc_ValueError,
......@@ -93,10 +186,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
std::string hashkey;
#ifdef DEBUG
char algorithm_name[128];
#endif
size_t free = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) return 1;
......@@ -107,11 +196,13 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
if (!reuse_algo) {
char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
// check out cache
hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), input, APPLY_SPECIFIC(kerns), kerns, desc, *output, groups);
if (hashkey.empty())
if (hashkey.empty()) {
cuda_exit(c->ctx);
return 1;
hashkey = hash_prefix + pci_id + " " + hashkey;
// check out cache
}
hashkey = hash_prefix + pci_id + (params->choose_time ? " -t " : " ") + hashkey;
const AlgoRec* cached = dnn_conv_check_cache(hashkey);
if (cached) {
prev_algo = *cached;
......@@ -132,6 +223,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
cuda_exit(c->ctx);
return -1;
}
......@@ -155,11 +247,13 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
#ifdef DEBUG
if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv fwd algorithm found");
cuda_exit(c->ctx);
return 1;
} else if (choice.status != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting best-timed FWD algo: %s",
cudnnGetErrorString(choice.status));
cuda_exit(c->ctx);
return 1;
} // Else, count is necessarly 1 for current implementation.
#endif
......@@ -170,10 +264,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
#if CUDNN_MAJOR >= 7
prev_algo.mathType = mathtype = choice.mathType;
#endif
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
// NB: It is added again later to cache,
// so maybe this line could be removed.
} else {
err = cudnnGetConvolutionForwardAlgorithm(
......@@ -194,6 +284,11 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
}
}
if (dnn_conv_fwd_fallback(&algo, input, kerns, desc) != 0) {
cuda_exit(c->ctx);
return 1;
}
// if FindEx was used (choose_time), workspace size is set.
if (!(reuse_algo || use_cached || params->choose_time))
{
......@@ -205,45 +300,49 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
algo,
&worksize);
if (err == CUDNN_STATUS_NOT_SUPPORTED) {
// Fallback to none algo if not supported
// Fallback to none algo if not supported
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) {
cuda_exit(c->ctx);
return 1;
fprintf(stderr, "(%s error getting worksize: "
"fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n", algorithm_name);
}
fprintf(stderr, "(error getting worksize for %s: failing back to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n",
algorithm_name);
#endif
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
APPLY_SPECIFIC(output),
algo,
&worksize);
}
err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
APPLY_SPECIFIC(output),
algo,
&worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting worksize: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
// save for next time/cache
prev_algo.wsSize = worksize;
prev_algo.algo = algo;
if (params->choose_algo && (!params->choose_once || !reuse_algo)) {
// algo may have changed due to fallback, we must update it.
prev_algo.algo = algo;
// save worksize for next time/cache
prev_algo.wsSize = worksize;
// Add to the cache, even if this node use *_once algo
// (in case the user specify the algo per layer and not globally).
if (params->choose_algo)
// Add to the cache if we choose on shape change, or first time if we choose once.
dnn_conv_update_cache(hashkey, prev_algo);
}
#ifdef DEBUG
if (params->choose_algo) {
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) {
cuda_exit(c->ctx);
return 1;
}
fprintf(stderr, "(using %s %s%s%s%s, ws:%ld, hash:%s)\n",
algorithm_name,
params->choose_time ? "(timed)": "" ,
......@@ -264,7 +363,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
gpudata *workspace = 0;
#if CUDNN_MAJOR >= 7
// CUDNN7: need to set math type
err = cudnnSetConvolutionMathType(desc, prev_algo.mathType);
err = cudnnSetConvolutionMathType(desc, mathtype);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error setting math type for convolution : %s",
......@@ -282,8 +381,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
if (worksize != 0) {
workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) {
PyErr_SetString(PyExc_RuntimeError,
"Could not allocate working memory");
PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory");
cuda_exit(c->ctx);
return 1;
}
......@@ -294,15 +392,15 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
for ( int g = 0; g < groups; g++) {
err = cudnnConvolutionForward(
params->handle,
alpha_p,
APPLY_SPECIFIC(input), ((char *)PyGpuArray_DEV_DATA(input)) + input_offset * g,
APPLY_SPECIFIC(kerns), ((char *)PyGpuArray_DEV_DATA(kerns)) + kern_offset * g,
desc, algo,
worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p,
APPLY_SPECIFIC(output), ((char *)PyGpuArray_DEV_DATA(*output)) + output_offset * g);
err = cudnnConvolutionForward(
params->handle,
alpha_p,
APPLY_SPECIFIC(input), ((char *)PyGpuArray_DEV_DATA(input)) + input_offset * g,
APPLY_SPECIFIC(kerns), ((char *)PyGpuArray_DEV_DATA(kerns)) + kern_offset * g,
desc, algo,
worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p,
APPLY_SPECIFIC(output), ((char *)PyGpuArray_DEV_DATA(*output)) + output_offset * g);
}
if (worksize != 0)
......@@ -316,7 +414,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
PyErr_Format(PyExc_RuntimeError, "error doing cuDNN conv FWD operation: %s",
cudnnGetErrorString(err));
return 1;
}
......
#section init_code_struct
prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH;
prev_algo.dataType = CUDNN_DATA_FLOAT;
reuse_algo = 0;
use_cached = 0;
hash_prefix = std::string("GI|GPU#");
#section support_code_struct
#line 11 "dnn_gi.c"
#line 9 "dnn_gi.c"
int reuse_algo;
bool use_cached;
AlgoRec prev_algo;
std::string hash_prefix;
#ifdef DEBUG
char algorithm_name[128];
#endif
/** Validate the given gradinput (backward-data) algorithm against the
inputs and the convolution descriptor; if the checks fail, replace it
in place with a safe fallback algorithm.
Return 0 on success, non-0 on error (a Python exception is set on error). **/
int dnn_conv_gi_fallback(cudnnConvolutionBwdDataAlgo_t* _algo,
                         const PyGpuArrayObject* input,
                         const PyGpuArrayObject* kerns,
                         cudnnConvolutionDescriptor_t desc) {
  cudnnConvolutionBwdDataAlgo_t candidate = *_algo;
  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024; the tiled-FFT implementation
  // does not support strides. Both are defined only for 2d filters, so the
  // checks below are intentionally 2d-specific (NDIM(kerns) == 4).
  const bool is_fft = (candidate == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT);
  const bool is_fft_tiling =
      (candidate == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING);
  if ((is_fft || is_fft_tiling) && PyGpuArray_NDIM(kerns) == 4) {
    // Query padding/stride/upscale settings from the convolution descriptor.
    int nd;
    int pad[2];
    int stride[2];
    int upscale[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    cudnnStatus_t err = cudnnGetConvolutionNdDescriptor(
        desc, 2, &nd, pad, stride, upscale, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      return 1;
    }
    const bool strided = (stride[0] != 1 || stride[1] != 1);
    if (is_fft) {
      // FFT also rejects large spatial dims and 1x1 kernels.
      const bool spatial_too_large = (PyGpuArray_DIM(input, 2) > 1024 ||
                                      PyGpuArray_DIM(input, 3) > 1024);
      const bool kernel_1x1 = (PyGpuArray_DIM(kerns, 2) == 1 &&
                               PyGpuArray_DIM(kerns, 3) == 1);
      if (strided || spatial_too_large || kernel_1x1) {
        candidate = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
#ifdef DEBUG
        fprintf(stderr, "(replacing gradinput algo fft with none)\n");
#endif
      }
    } else if (strided) {
      // candidate == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
      candidate = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
#ifdef DEBUG
      fprintf(stderr, "(replacing gradinput algo fft_tiling with none)\n");
#endif
    }
  }
  *_algo = candidate;
  return 0;
}
int
APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
......@@ -24,6 +82,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
void *beta_p;
float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
bool use_cached = 0;
if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
......@@ -73,7 +132,6 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 0;
}
int groups = c_check_groups_for_conv(desc, params->num_groups);
if (groups == -1)
return 1;
......@@ -88,178 +146,155 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
size_t output_offset = PyGpuArray_STRIDE(output, 0) / groups;
cudnnConvolutionBwdDataAlgo_t algo = params->conv_algo;
#ifdef DEBUG
char algorithm_name[128];
#endif
size_t worksize = 0;
cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH;
std::string hashkey;
if (params->choose_algo && !reuse_algo) {
char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
// check out cache
hashkey=dnn_conv_shape(APPLY_SPECIFIC(input), *input, APPLY_SPECIFIC(kerns), kerns, desc, output, groups);
if (hashkey.empty())
return 1;
hashkey = hash_prefix + pci_id + " " + hashkey;
const AlgoRec* cached = dnn_conv_check_cache(hashkey);
if (cached) {
prev_algo = *cached;
use_cached = 1;
}
}
size_t free = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) return 1;
cuda_enter(c->ctx);
if (params->choose_algo && !(reuse_algo || use_cached)) {
if (params->choose_time) {
int count;
cudnnConvolutionBwdDataAlgoPerf_t choice;
gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
err = cudnnFindConvolutionBackwardDataAlgorithmEx(
params->handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input),
1, &count, &choice, *(void **)tmpmem, free);
gpudata_release(tmpmem);
if (params->choose_algo) {
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
if (!reuse_algo) {
char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
// check out cache
hashkey=dnn_conv_shape(APPLY_SPECIFIC(input), *input, APPLY_SPECIFIC(kerns), kerns, desc, output, groups);
if (hashkey.empty()) {
cuda_exit(c->ctx);
return 1;
}
hashkey = hash_prefix + pci_id + (params->choose_time ? " -t " : " ") + hashkey;
const AlgoRec* cached = dnn_conv_check_cache(hashkey);
if (cached) {
prev_algo = *cached;
use_cached = 1;
}
}
#ifdef DEBUG
if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradinput algorithm found");
return 1;
} else if (choice.status != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting best-timed gradinput algo: %s",
cudnnGetErrorString(choice.status));
if (reuse_algo || use_cached) {
algo = (cudnnConvolutionBwdDataAlgo_t)prev_algo.algo;
worksize = prev_algo.wsSize;
mathtype = prev_algo.mathType;
} else {
if (params->choose_time) {
int count;
cudnnConvolutionBwdDataAlgoPerf_t choice;
gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
cuda_exit(c->ctx);
return -1;
}
err = cudnnFindConvolutionBackwardDataAlgorithmEx(
params->handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input),
1, &count, &choice, *(void **)tmpmem, free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
} // Else, count is necessarly 1 for current implementation.
#endif
}
algo = choice.algo;
prev_algo.algo = (int)algo;
prev_algo.wsSize = worksize = choice.memory;
#ifdef DEBUG
if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradinput algorithm found");
cuda_exit(c->ctx);
return 1;
} else if (choice.status != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting best-timed gradinput algo: %s",
cudnnGetErrorString(choice.status));
cuda_exit(c->ctx);
return 1;
} // Else, count is necessarly 1 for current implementation.
#endif
algo = choice.algo;
prev_algo.algo = (int)algo;
prev_algo.wsSize = worksize = choice.memory;
#if CUDNN_MAJOR >= 7
prev_algo.mathType = mathtype = choice.mathType;
prev_algo.mathType = mathtype = choice.mathType;
#endif
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
} else {
err = cudnnGetConvolutionBackwardDataAlgorithm(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
} else {
err = cudnnGetConvolutionBackwardDataAlgorithm(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
prev_algo.algo = algo;
// no tensor_op returned from Get()
prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH;
}
prev_algo.algo = algo;
// no tensor_op returned from Get()
prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH;
}
}
if (dnn_conv_gi_fallback(&algo, *input, kerns, desc) != 0) {
cuda_exit(c->ctx);
return 1;
}
// if FindEx was used (choose_time), workspace size is set.
if (!(reuse_algo || use_cached || params->choose_time))
{
err = cudnnGetConvolutionBackwardDataWorkspaceSize(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
cudnnGetErrorString(err));
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation
// does not support strides.
// If the chosen implementation is FFT or tiled-FFT, validate that it can
// be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
if ((algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) && PyGpuArray_NDIM(kerns) == 4) {
// Extract the properties of the convolution descriptor
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
cudnnDataType_t data_type;
err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
upscale, &mode, &data_type);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting convolution properties: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)
{
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(*input, 2) > 1024 || PyGpuArray_DIM(*input, 3) > 1024 ||
(PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
{
algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
}
}
else
{
// algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
if (stride[0] != 1 || stride[1] != 1)
{
algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
}
}
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), algo, &worksize);
if (err == CUDNN_STATUS_NOT_SUPPORTED) {
// Fallback to none algo if not supported
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name)) {
cuda_exit(c->ctx);
return 1;
}
fprintf(stderr, "(error getting worksize for %s: failing back to CUDNN_CONVOLUTION_BWD_DATA_ALGO_0)\n",
algorithm_name);
#endif
algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
err = cudnnGetConvolutionBackwardDataWorkspaceSize(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), algo, &worksize);
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
// save for next time/cache
prev_algo.wsSize = worksize;
} // !(reuse_algo || use_cached || params->choose_time)
if (params->choose_algo && (!params->choose_once || !reuse_algo)) {
// algo may have changed due to fallback, we must update it.
prev_algo.algo = algo;
// save worksize for next time/cache
prev_algo.wsSize = worksize;
// Add to the cache
if (params->choose_algo)
dnn_conv_update_cache(hashkey, prev_algo);
} // !(reuse_algo || use_cached || params->choose_time)
dnn_conv_update_cache(hashkey, prev_algo);
}
#ifdef DEBUG
if (params->choose_algo) {
if (0 != theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name))
if (0 != theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name)) {
cuda_exit(c->ctx);
return 1;
}
// NB: This is printed only when algorithm is chosen at runtime.
fprintf(stderr, "(using %s %s%s%s%s, ws:%ld, hash:%s)\n",
algorithm_name,
......@@ -280,7 +315,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
gpudata *workspace = 0;
#if CUDNN_MAJOR >= 7
// CUDNN7: need to set math type
err = cudnnSetConvolutionMathType(desc, prev_algo.mathType);
err = cudnnSetConvolutionMathType(desc, mathtype);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error setting math type for convolution : %s",
......@@ -293,8 +328,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
if (worksize != 0) {
workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) {
PyErr_SetString(PyExc_RuntimeError,
"Could not allocate working memory");
PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory");
cuda_exit(c->ctx);
return 1;
}
......@@ -304,8 +338,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
for ( int g = 0; g < groups; g++)
{
for ( int g = 0; g < groups; g++) {
err = cudnnConvolutionBackwardData(
params->handle,
alpha_p,
......@@ -326,7 +359,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
PyErr_Format(PyExc_RuntimeError, "error doing cuDNN conv gradinput operation: %s",
cudnnGetErrorString(err));
return 1;
}
......
#section init_code_struct
prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH;
prev_algo.dataType = CUDNN_DATA_FLOAT;
reuse_algo = 0;
use_cached = 0;
hash_prefix = std::string("GW|GPU#");
#section support_code_struct
#line 11 "dnn_gw.c"
#line 9 "dnn_gw.c"
int reuse_algo;
bool use_cached;
AlgoRec prev_algo;
std::string hash_prefix;
#ifdef DEBUG
char algorithm_name[128];
#endif
/** Validate the given gradweight (backward-filter) algorithm against the
inputs and the convolution descriptor; if the checks fail, replace it
in place with a safe fallback algorithm.
Return 0 on success, non-0 on error (a Python exception is set on error). **/
int dnn_conv_gw_fallback(cudnnConvolutionBwdFilterAlgo_t* _algo,
                         const PyGpuArrayObject* input,
                         const PyGpuArrayObject* kerns,
                         cudnnConvolutionDescriptor_t desc) {
  cudnnConvolutionBwdFilterAlgo_t candidate = *_algo;
  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024. It is defined only for 2d
  // filters, so the check below is intentionally 2d-specific (NDIM == 4).
  if (candidate == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT &&
      PyGpuArray_NDIM(input) == 4) {
    // Query padding/stride/upscale settings from the convolution descriptor.
    int nd;
    int pad[2];
    int stride[2];
    int upscale[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    cudnnStatus_t err = cudnnGetConvolutionNdDescriptor(
        desc, 2, &nd, pad, stride, upscale, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      return 1;
    }
    const bool unsupported =
        stride[0] != 1 || stride[1] != 1 ||
        PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
        (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1);
    if (unsupported) {
      candidate = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
#ifdef DEBUG
      fprintf(stderr, "(replacing gradweight algo fft with none)\n");
#endif
    }
  }
  *_algo = candidate;
  return 0;
}
int
APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
......@@ -24,6 +69,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
void *beta_p;
float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
bool use_cached = 0;
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1] * params->num_groups) {
PyErr_SetString(PyExc_ValueError,
......@@ -82,17 +128,14 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return 1;
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns), groups) == -1)
return 1;
size_t input_offset = PyGpuArray_STRIDE(input, 0) / groups;
size_t kern_offset = PyGpuArray_STRIDE(*kerns, 0) * PyGpuArray_DIM(*kerns, 0) / groups;
size_t output_offset = PyGpuArray_STRIDE(output, 0) / groups;
cudnnConvolutionBwdFilterAlgo_t algo = params->conv_algo;
#ifdef DEBUG
char algorithm_name[128];
#endif
size_t worksize = 0;
cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH;
std::string hashkey ;
size_t free = c_get_largest_free_block_size(c);
......@@ -105,11 +148,13 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
if (!reuse_algo) {
char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
// check out cache
hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), input, APPLY_SPECIFIC(kerns), *kerns, desc, output, groups);
if (hashkey.empty())
if (hashkey.empty()) {
cuda_exit(c->ctx);
return 1;
hashkey = hash_prefix + pci_id + hashkey;
// check out cache
}
hashkey = hash_prefix + pci_id + (params->choose_time ? " -t " : " ") + hashkey;
const AlgoRec* cached = dnn_conv_check_cache(hashkey);
if (cached) {
prev_algo = *cached;
......@@ -122,7 +167,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
worksize = prev_algo.wsSize;
mathtype = prev_algo.mathType;
} else {
if (params->choose_time) {
int count;
cudnnConvolutionBwdFilterAlgoPerf_t choice;
......@@ -131,6 +175,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
cuda_exit(c->ctx);
return -1;
}
......@@ -152,11 +197,13 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
#ifdef DEBUG
if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradweight algorithm found");
cuda_exit(c->ctx);
return 1;
} else if (choice.status != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting best-timed gradweight algo: %s",
cudnnGetErrorString(choice.status));
cuda_exit(c->ctx);
return 1;
} // Else, count is necessarly 1 for current implementation.
#endif
......@@ -167,8 +214,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
#if CUDNN_MAJOR >= 7
prev_algo.mathType = mathtype = choice.mathType;
#endif
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
} else {
err = cudnnGetConvolutionBackwardFilterAlgorithm(
......@@ -189,49 +234,57 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
}
} /* choose_algo */
if (dnn_conv_gw_fallback(&algo, input, *kerns, desc) != 0) {
cuda_exit(c->ctx);
return 1;
}
// if FindEx was used (choose_time), workspace size is set.
if (!(reuse_algo || use_cached || params->choose_time))
{
{
err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), algo, &worksize);
if (err == CUDNN_STATUS_NOT_SUPPORTED) {
// Fallback to none algo if not supported
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name)) {
cuda_exit(c->ctx);
return 1;
}
fprintf(stderr, "(error getting worksize for %s: falling back to CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0)\n",
algorithm_name);
#endif
algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), algo, &worksize);
}
if (err != CUDNN_STATUS_SUCCESS) {
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name))
return 1;
fprintf(stderr, "(%s error getting worksize:%s, falling back to CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0",
algorithm_name, cudnnGetErrorString(err));
#endif
algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
}
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
}
// save for next time/cache
prev_algo.wsSize = worksize;
prev_algo.algo = algo;
if (params->choose_algo && (!params->choose_once || !reuse_algo)) {
// algo may have changed due to fallback, we must update it.
prev_algo.algo = algo;
// save worksize for next time/cache
prev_algo.wsSize = worksize;
// Add to the cache
if (params->choose_algo)
dnn_conv_update_cache(hashkey, prev_algo);
}
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
}
#ifdef DEBUG
if (params->choose_algo) {
if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name))
if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name)) {
cuda_exit(c->ctx);
return 1;
}
// NB: This is printed only when algorithm is chosen at runtime.
fprintf(stderr, "(using %s %s%s%s%s, ws:%ld, hash:%s)\n",
algorithm_name,
......@@ -274,9 +327,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
for ( int g = 0; g < groups; g++)
{
for ( int g = 0; g < groups; g++) {
err = cudnnConvolutionBackwardFilter(
params->handle,
alpha_p,
......@@ -297,7 +348,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
PyErr_Format(PyExc_RuntimeError, "error doing cuDNN conv gradweight operation: %s",
cudnnGetErrorString(err));
return 1;
}
......
......@@ -399,7 +399,7 @@ class DnnBase(COp):
return []
def c_code_cache_version(self):
    # Cache key for the generated C code: parent COp's version, the linked
    # cuDNN version, and a local counter.  The counter is bumped (1 -> 4 in
    # this change) whenever the C support code is modified, so previously
    # compiled modules are invalidated and rebuilt.
    return (super(DnnBase, self).c_code_cache_version(), version(), 4)
class GpuDnnConvDesc(COp):
......
......@@ -2287,6 +2287,152 @@ def dconvgi(border_mode, subsample, filter_dilation, num_groups):
return dconvi
class TestDnnConv2DRuntimeAlgorithms(object):
    """Exercise cuDNN convolutions whose algorithm is chosen at runtime
    (``time_*`` / ``guess_*``) and compare every output against the CPU
    reference implementation (CorrMM).

    Each test method is a nose generator: it yields one runner per runtime
    algorithm.  Repeated entries in ``runtime_shapes`` re-use shapes already
    seen, so the algorithm cache is expected to be hit instead of re-running
    the selection.
    """
    ndim = 2
    cpu_conv_class = theano.tensor.nnet.corr.CorrMM
    # Each entry is (ntimes, [inputs_shape, filters_shape]).
    runtime_shapes = [
        (3, [(2, 3, 10, 9), (5, 3, 7, 7)]),
        (1, [(1, 1, 100, 200), (1, 1, 50, 200)]),
        (1, [(4, 2, 20, 20), (2, 2, 20, 19)]),
        (3, [(2, 3, 10, 9), (5, 3, 7, 7)]),  # cache should be used
        (1, [(2, 2, 50, 50), (5, 2, 25, 31)]),
        (1, [(1, 1, 100, 200), (1, 1, 50, 200)]),  # cache should be used
        (1, [(4, 2, 20, 20), (2, 2, 20, 19)]),  # cache should be used
        (1, [(1, 2, 3, 4), (6, 2, 2, 1)])
    ]

    def __init__(self):
        self.runtime_algorithms = ('time_once', 'guess_once',
                                   'time_on_shape_change', 'guess_on_shape_change')

    # -- shared helpers ----------------------------------------------------

    def _symbolic_variables(self, dtype):
        # Build fresh root symbolic input/filter variables of rank 2 + ndim.
        # NOTE(review): the previous version rebound the variables with
        # ``inputs /= 10`` before passing them to
        # ``theano.function([inputs, filters], ...)``; a variable with an
        # owner cannot be a function input, so the scaling is now applied to
        # derived expressions (see _scaled) instead.
        broadcastable = [False] * (2 + self.ndim)
        inputs = theano.tensor.TensorType(dtype, broadcastable)()
        filters = theano.tensor.TensorType(dtype, broadcastable)()
        return inputs, filters

    def _scaled(self, inputs, filters):
        # Scale values down to limit float32 rounding error in comparisons,
        # without touching the root variables used as function inputs.
        return inputs / 10, filters / 10

    def _cpu_reference(self, inputs, filters, unit_shape):
        # Symbolic CPU reference convolution.  CorrMM computes a correlation,
        # so the spatial filter axes are flipped to match true convolution.
        if self.ndim == 3:
            flipped_filters = filters[:, :, ::-1, ::-1, ::-1]
        else:
            flipped_filters = filters[:, :, ::-1, ::-1]
        return self.cpu_conv_class(subsample=unit_shape)(ref_cast(inputs),
                                                         flipped_filters)

    def _assert_node_types(self, f, expected_class, forbidden_classes):
        # The compiled graph must contain exactly one node of
        # `expected_class` and no node of any of `forbidden_classes`.
        nodes = f.maker.fgraph.apply_nodes
        assert 1 == len([node for node in nodes
                         if isinstance(node.op, expected_class)])
        for forbidden_class in forbidden_classes:
            assert not any(isinstance(node.op, forbidden_class)
                           for node in nodes)

    def _compare_against_reference(self, algo, f, f_ref, dtype):
        # Run GPU and CPU functions over the runtime shape list and check the
        # outputs match.  For the `*_once` algorithms only the first shape is
        # used (5 times), since the algorithm choice is made exactly once.
        runtime_shapes = self.runtime_shapes
        if algo in ('time_once', 'guess_once'):
            runtime_shapes = [list(runtime_shapes[0])]
            runtime_shapes[0][0] = 5
        for ntimes, (inputs_shape, filters_shape) in runtime_shapes:
            for i in range(ntimes):
                inputs_val = np.random.random(inputs_shape).astype(dtype)
                filters_val = np.random.random(filters_shape).astype(dtype)
                gpu_res = f(inputs_val, filters_val)
                cpu_res = f_ref(inputs_val, filters_val)
                utt.assert_allclose(cpu_res, gpu_res)

    # -- test generators ---------------------------------------------------

    def test_fwd_runtime_algorithms(self):
        dtype = 'float32'
        unit_shape = (1,) * self.ndim

        def run_fwd_runtime_algorithm(algo):
            inputs, filters = self._symbolic_variables(dtype)
            scaled_inputs, scaled_filters = self._scaled(inputs, filters)
            conv = dnn.dnn_conv(img=scaled_inputs, kerns=scaled_filters,
                                algo=algo, precision=dtype,
                                subsample=unit_shape, dilation=unit_shape)
            f = theano.function([inputs, filters], conv, mode=mode_with_gpu)
            conv_ref = self._cpu_reference(scaled_inputs, scaled_filters,
                                           unit_shape)
            f_ref = theano.function([inputs, filters], conv_ref,
                                    mode='FAST_RUN')
            self._compare_against_reference(algo, f, f_ref, dtype)

        for algo in self.runtime_algorithms:
            yield (run_fwd_runtime_algorithm, algo)

    def test_gradinput_runtime_algorithms(self):
        dtype = 'float32'
        unit_shape = (1,) * self.ndim

        def run_gradinput_runtime_algorithm(algo):
            # The runtime algorithm for gradinput is taken from the config
            # flag, not from the `algo` argument of dnn_conv.
            theano.config.dnn.conv.algo_bwd_data = algo
            inputs, filters = self._symbolic_variables(dtype)
            scaled_inputs, scaled_filters = self._scaled(inputs, filters)
            conv = dnn.dnn_conv(img=scaled_inputs, kerns=scaled_filters,
                                algo=algo, precision=dtype,
                                subsample=unit_shape, dilation=unit_shape)
            grad_i = theano.tensor.grad(conv.sum(), [inputs])
            f = theano.function([inputs, filters], grad_i, mode=mode_with_gpu)
            self._assert_node_types(f, dnn.GpuDnnConvGradI,
                                    (dnn.GpuDnnConv, dnn.GpuDnnConvGradW))
            conv_ref = self._cpu_reference(scaled_inputs, scaled_filters,
                                           unit_shape)
            grad_i_ref = theano.tensor.grad(conv_ref.sum(), [inputs])
            f_ref = theano.function([inputs, filters], grad_i_ref,
                                    mode='FAST_RUN')
            self._compare_against_reference(algo, f, f_ref, dtype)

        for algo in self.runtime_algorithms:
            yield (run_gradinput_runtime_algorithm, algo)

    def test_gradweight_runtime_algorithms(self):
        dtype = 'float32'
        unit_shape = (1,) * self.ndim

        def run_gradweight_runtime_algorithm(algo):
            # The runtime algorithm for gradweight is taken from the config
            # flag, not from the `algo` argument of dnn_conv.
            theano.config.dnn.conv.algo_bwd_filter = algo
            inputs, filters = self._symbolic_variables(dtype)
            scaled_inputs, scaled_filters = self._scaled(inputs, filters)
            conv = dnn.dnn_conv(img=scaled_inputs, kerns=scaled_filters,
                                algo=algo, precision=dtype,
                                subsample=unit_shape, dilation=unit_shape)
            grad_w = theano.tensor.grad(conv.sum(), [filters])
            f = theano.function([inputs, filters], grad_w, mode=mode_with_gpu)
            self._assert_node_types(f, dnn.GpuDnnConvGradW,
                                    (dnn.GpuDnnConv, dnn.GpuDnnConvGradI))
            conv_ref = self._cpu_reference(scaled_inputs, scaled_filters,
                                           unit_shape)
            grad_w_ref = theano.tensor.grad(conv_ref.sum(), [filters])
            f_ref = theano.function([inputs, filters], grad_w_ref,
                                    mode='FAST_RUN')
            self._compare_against_reference(algo, f, f_ref, dtype)

        for algo in self.runtime_algorithms:
            yield (run_gradweight_runtime_algorithm, algo)
class TestDnnConv3DRuntimeAlgorithms(TestDnnConv2DRuntimeAlgorithms):
    # Same runtime-algorithm checks as the 2D case, run on 5D tensors.
    ndim = 3
    # Each entry is (ntimes, [inputs_shape, filters_shape]).  Entries that
    # repeat an earlier shape are expected to hit the algorithm cache
    # instead of re-running algorithm selection.
    runtime_shapes = [
        (3, [(2, 3, 5, 10, 9), (5, 3, 4, 7, 7)]),
        (1, [(1, 1, 5, 100, 200), (1, 1, 4, 50, 200)]),
        (1, [(4, 2, 20, 20, 20), (2, 2, 20, 19, 18)]),
        (3, [(2, 3, 5, 10, 9), (5, 3, 4, 7, 7)]),  # cache should be used
        (1, [(2, 2, 50, 50, 5), (5, 2, 25, 31, 4)]),
        (1, [(1, 1, 5, 100, 200), (1, 1, 4, 50, 200)]),  # cache should be used
        (1, [(4, 2, 20, 20, 20), (2, 2, 20, 19, 18)]),  # cache should be used
        (1, [(1, 2, 3, 4, 5), (6, 2, 3, 2, 1)])
    ]
class Cudnn_grouped_conv(Grouped_conv_noOptim):
    # Re-run the grouped-convolution test suite on GPU with the cuDNN
    # forward convolution op substituted in.
    mode = mode_with_gpu
    conv = staticmethod(dconvfwd)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论