提交 a9a105f6 authored 作者: Pascal Lamblin's avatar Pascal Lamblin 提交者: GitHub

Merge pull request #6166 from notoraptor/fixes-and-debug-messages

Add DEBUG messages into cuDNN conv codes.
...@@ -442,7 +442,7 @@ class CLinkerOp(CLinkerObject): ...@@ -442,7 +442,7 @@ class CLinkerOp(CLinkerObject):
The subclass does not override this method. The subclass does not override this method.
""" """
raise utils.MethodNotDefined("c_init_code_apply", type(self), raise utils.MethodNotDefined("c_init_code_struct", type(self),
self.__class__.__name__) self.__class__.__name__)
def c_support_code_struct(self, node, name): def c_support_code_struct(self, node, name):
......
#section init_code_struct #section init_code_struct
if (PARAMS->choose_algo) { reuse_algo = 0;
reuse_algo = 0; prev_algo = PARAMS->conv_algo;
prev_algo = PARAMS->conv_algo; memset(prev_img_dims, 0, sizeof(prev_img_dims));
if (!PARAMS->choose_once) { memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
}
}
#section support_code_struct #section support_code_struct
...@@ -83,6 +79,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -83,6 +79,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
size_t output_offset = PyGpuArray_STRIDE(*output, 0) / params->num_groups; size_t output_offset = PyGpuArray_STRIDE(*output, 0) / params->num_groups;
cudnnConvolutionFwdAlgo_t algo = params->conv_algo; cudnnConvolutionFwdAlgo_t algo = params->conv_algo;
#ifdef DEBUG
char algorithm_name[128];
#endif
cuda_enter(c->ctx); cuda_enter(c->ctx);
...@@ -138,6 +137,19 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -138,6 +137,19 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1; return 1;
} }
algo = choice.algo; algo = choice.algo;
#ifdef DEBUG
if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv fwd algorithm found");
return 1;
} else if (choice.status != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting best-timed FWD algo: %s",
cudnnGetErrorString(choice.status));
return 1;
} // Else, count is necessarly 1 for current implementation.
#endif
} else { } else {
err = cudnnGetConvolutionForwardAlgorithm( err = cudnnGetConvolutionForwardAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
...@@ -156,6 +168,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -156,6 +168,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
algo = prev_algo; algo = prev_algo;
} }
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
return 1;
// NB: This is printed only when algorithm is chosen at runtime.
if (reuse_algo)
fprintf(stderr, "(reused %s)\n", algorithm_name);
else
fprintf(stderr, "(using %s)\n", algorithm_name);
#endif
if (params->choose_once) { if (params->choose_once) {
reuse_algo = 1; reuse_algo = 1;
} else { } else {
...@@ -164,15 +186,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -164,15 +186,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i); prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
} }
} }
#ifdef DEBUG
char algorithm_name[128];
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) {
return 1;
};
// NB: This is printed only when algorithm is chosen at runtime.
fprintf(stderr, "(using %s) ", algorithm_name);
#endif
} }
/* Only these algos are supported for 3d conv with cuDNN >= V5.1. */ /* Only these algos are supported for 3d conv with cuDNN >= V5.1. */
...@@ -180,14 +193,27 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -180,14 +193,27 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
!(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM || !(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM ||
algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM || algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING)) algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING))
{
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
return 1;
fprintf(stderr, "(%s unsupported for 3D: fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n", algorithm_name);
#endif
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
// Algo `small` does not work for a batch size > 2^16, with cuDNN >= V5.1. // Algo `small` does not work for a batch size > 2^16, with cuDNN >= V5.1.
// Issue should be resolved for cuDNN > V6.0. // Issue should be resolved for cuDNN > V6.0.
if (cudnnGetVersion() < 6100 && if (cudnnGetVersion() < 6100 &&
algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM && algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM &&
PyGpuArray_DIM(input, 0) > 65536) PyGpuArray_DIM(input, 0) > 65536)
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; {
#ifdef DEBUG
fprintf(stderr, "(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM "
"will fail with batch size > 2^16, fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n");
#endif
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
// The FFT implementation does not support strides, 1x1 filters or inputs // The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation // with a spatial dimension larger than 1024. The tiled-FFT implementation
...@@ -197,6 +223,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -197,6 +223,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
// can't. // can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters // defined only for 2d filters
/* NB:
TODO: These checkings seems outdated for FFT algorithms with cuDNN >= 5.1.
New conditions apply and may depend on number of dimensions (2D or 3D)
e.g. for FFT_TILING.
TODO: More globally, how to handle CUDNN_STATUS_NOT_SUPPORTED with unsupported algorithms?
*/
if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT || if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && PyGpuArray_NDIM(input) == 4) { algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && PyGpuArray_NDIM(input) == 4) {
...@@ -245,7 +277,14 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -245,7 +277,14 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
if (err == CUDNN_STATUS_NOT_SUPPORTED) { if (err == CUDNN_STATUS_NOT_SUPPORTED) {
// Fallback to none algo if not supported // Fallback to none algo if not supported
// TODO: Print a warning
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
return 1;
fprintf(stderr, "(%s error getting worksize: "
"fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n", algorithm_name);
#endif
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
err = cudnnGetConvolutionForwardWorkspaceSize(params->handle, err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
......
#section init_code_struct #section init_code_struct
if (PARAMS->choose_algo) { reuse_algo = 0;
reuse_algo = 0; prev_algo = PARAMS->conv_algo;
prev_algo = PARAMS->conv_algo; memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
if (!PARAMS->choose_once) { memset(prev_top_dims, 0, sizeof(prev_top_dims));
memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims));
}
}
#section support_code_struct #section support_code_struct
int reuse_algo; int reuse_algo;
cudnnConvolutionBwdDataAlgo_t prev_algo; cudnnConvolutionBwdDataAlgo_t prev_algo;
size_t prev_kern_dims[5] = {0}; size_t prev_kern_dims[5];
size_t prev_top_dims[5] = {0}; size_t prev_top_dims[5];
int int
APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
...@@ -82,6 +78,9 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -82,6 +78,9 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
size_t output_offset = PyGpuArray_STRIDE(output, 0) / params->num_groups; size_t output_offset = PyGpuArray_STRIDE(output, 0) / params->num_groups;
cudnnConvolutionBwdDataAlgo_t algo = params->conv_algo; cudnnConvolutionBwdDataAlgo_t algo = params->conv_algo;
#ifdef DEBUG
char algorithm_name[128];
#endif
cuda_enter(c->ctx); cuda_enter(c->ctx);
...@@ -178,6 +177,19 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -178,6 +177,19 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
} }
algo = choice.algo; algo = choice.algo;
#ifdef DEBUG
if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradinput algorithm found");
return 1;
} else if (choice.status != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting best-timed gradinput algo: %s",
cudnnGetErrorString(choice.status));
return 1;
} // Else, count is necessarly 1 for current implementation.
#endif
} else { } else {
err = cudnnGetConvolutionBackwardDataAlgorithm( err = cudnnGetConvolutionBackwardDataAlgorithm(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
...@@ -195,6 +207,17 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -195,6 +207,17 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
algo = prev_algo; algo = prev_algo;
} }
#ifdef DEBUG
char algorithm_name[128];
if (0 != theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name))
return 1;
// NB: This is printed only when algorithm is chosen at runtime.
if (reuse_algo)
fprintf(stderr, "(reused %s)\n", algorithm_name);
else
fprintf(stderr, "(using %s)\n", algorithm_name);
#endif
if (params->choose_once) { if (params->choose_once) {
reuse_algo = 1; reuse_algo = 1;
} else { } else {
...@@ -203,15 +226,6 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -203,15 +226,6 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
prev_top_dims[i] = PyGpuArray_DIM(output, i); prev_top_dims[i] = PyGpuArray_DIM(output, i);
} }
} }
#ifdef DEBUG
char algorithm_name[128];
if (0 != theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name)) {
return 1;
};
// NB: This is printed only when algorithm is chosen at runtime.
fprintf(stderr, "(using %s) ", algorithm_name);
#endif
} }
// The FFT implementation does not support strides, 1x1 filters or inputs // The FFT implementation does not support strides, 1x1 filters or inputs
......
#section init_code_struct #section init_code_struct
if (PARAMS->choose_algo) { reuse_algo = 0;
reuse_algo = 0; prev_algo = PARAMS->conv_algo;
prev_algo = PARAMS->conv_algo; memset(prev_img_dims, 0, sizeof(prev_img_dims));
if (!PARAMS->choose_once) { memset(prev_top_dims, 0, sizeof(prev_top_dims));
memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims));
}
}
#section support_code_struct #section support_code_struct
...@@ -83,6 +79,9 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -83,6 +79,9 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
size_t output_offset = PyGpuArray_STRIDE(output, 0) / params->num_groups; size_t output_offset = PyGpuArray_STRIDE(output, 0) / params->num_groups;
cudnnConvolutionBwdFilterAlgo_t algo = params->conv_algo; cudnnConvolutionBwdFilterAlgo_t algo = params->conv_algo;
#ifdef DEBUG
char algorithm_name[128];
#endif
cuda_enter(c->ctx); cuda_enter(c->ctx);
...@@ -180,6 +179,19 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -180,6 +179,19 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
} }
algo = choice.algo; algo = choice.algo;
#ifdef DEBUG
if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradweight algorithm found");
return 1;
} else if (choice.status != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting best-timed gradweight algo: %s",
cudnnGetErrorString(choice.status));
return 1;
} // Else, count is necessarly 1 for current implementation.
#endif
} else { } else {
err = cudnnGetConvolutionBackwardFilterAlgorithm( err = cudnnGetConvolutionBackwardFilterAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
...@@ -198,6 +210,16 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -198,6 +210,16 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
algo = prev_algo; algo = prev_algo;
} }
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name))
return 1;
// NB: This is printed only when algorithm is chosen at runtime.
if (reuse_algo)
fprintf(stderr, "(reused %s)\n", algorithm_name);
else
fprintf(stderr, "(using %s)\n", algorithm_name);
#endif
if (params->choose_once) { if (params->choose_once) {
reuse_algo = 1; reuse_algo = 1;
} else { } else {
...@@ -206,15 +228,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -206,15 +228,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
prev_top_dims[i] = PyGpuArray_DIM(output, i); prev_top_dims[i] = PyGpuArray_DIM(output, i);
} }
} }
#ifdef DEBUG
char algorithm_name[128];
if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name)) {
return 1;
};
// NB: This is printed only when algorithm is chosen at runtime.
fprintf(stderr, "(using %s) ", algorithm_name);
#endif
} }
// The FFT implementation does not support strides, 1x1 filters or inputs // The FFT implementation does not support strides, 1x1 filters or inputs
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论