提交 e6b96472 · 作者:notoraptor

Add DEBUG messages into cuDNN conv codes.

Fix docstring for c_init_code_struct. Update support_code_struct for dnn_gi.
上级 743f7aa9
......@@ -442,7 +442,7 @@ class CLinkerOp(CLinkerObject):
The subclass does not override this method.
"""
raise utils.MethodNotDefined("c_init_code_apply", type(self),
raise utils.MethodNotDefined("c_init_code_struct", type(self),
self.__class__.__name__)
def c_support_code_struct(self, node, name):
......
......@@ -83,6 +83,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
size_t output_offset = PyGpuArray_STRIDE(*output, 0) / params->num_groups;
cudnnConvolutionFwdAlgo_t algo = params->conv_algo;
#ifdef DEBUG
char algorithm_name[128];
#endif
cuda_enter(c->ctx);
......@@ -138,6 +141,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1;
}
algo = choice.algo;
#ifdef DEBUG
if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv fwd algorithm found");
return 1;
} else {
fprintf(stderr, " (%d best-timed conv fwd algorithms) ", count);
}
#endif
} else {
err = cudnnGetConvolutionForwardAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
......@@ -156,6 +169,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
algo = prev_algo;
}
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) {
return 1;
};
// NB: This is printed only when algorithm is chosen at runtime.
if (reuse_algo)
fprintf(stderr, "(reused %s) ", algorithm_name);
else
fprintf(stderr, "(using %s) ", algorithm_name);
#endif
if (params->choose_once) {
reuse_algo = 1;
} else {
......@@ -164,15 +188,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
}
}
#ifdef DEBUG
char algorithm_name[128];
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) {
return 1;
};
// NB: This is printed only when algorithm is chosen at runtime.
fprintf(stderr, "(using %s) ", algorithm_name);
#endif
}
/* Only these algos are supported for 3d conv with cuDNN >= V5.1. */
......@@ -180,14 +195,28 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
!(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM ||
algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING))
{
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) {
return 1;
};
fprintf(stderr, "(%s unsupported for 3D: fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM) ", algorithm_name);
#endif
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
// Algo `small` does not work for a batch size > 2^16, with cuDNN >= V5.1.
// Issue should be resolved for cuDNN > V6.0.
if (cudnnGetVersion() < 6100 &&
algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM &&
PyGpuArray_DIM(input, 0) > 65536)
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
{
#ifdef DEBUG
fprintf(stderr, "(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM "
"will fail with batch size > 2^16, fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM) ");
#endif
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation
......@@ -197,6 +226,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
/* NB:
TODO: These checkings seems outdated for FFT algorithms with cuDNN >= 5.1.
New conditions apply and may depend on number of dimensions (2D or 3D)
e.g. for FFT_TILING.
TODO: More globally, how to handle CUDNN_STATUS_NOT_SUPPORTED with unsupported algorithms?
*/
if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && PyGpuArray_NDIM(input) == 4) {
......@@ -245,7 +280,15 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
if (err == CUDNN_STATUS_NOT_SUPPORTED) {
// Fallback to none algo if not supported
// TODO: Print a warning
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) {
return 1;
};
fprintf(stderr, "(%s error getting worksize: "
"fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM) ", algorithm_name);
#endif
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
......
......@@ -13,8 +13,8 @@ if (PARAMS->choose_algo) {
int reuse_algo;
cudnnConvolutionBwdDataAlgo_t prev_algo;
size_t prev_kern_dims[5] = {0};
size_t prev_top_dims[5] = {0};
size_t prev_kern_dims[5];
size_t prev_top_dims[5];
int
APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
......@@ -82,6 +82,9 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
size_t output_offset = PyGpuArray_STRIDE(output, 0) / params->num_groups;
cudnnConvolutionBwdDataAlgo_t algo = params->conv_algo;
#ifdef DEBUG
char algorithm_name[128];
#endif
cuda_enter(c->ctx);
......@@ -178,6 +181,16 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
}
algo = choice.algo;
#ifdef DEBUG
if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradinput algorithm found");
return 1;
} else {
fprintf(stderr, " (%d best-timed conv gradinput algorithms) ", count);
}
#endif
} else {
err = cudnnGetConvolutionBackwardDataAlgorithm(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
......@@ -195,6 +208,18 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
algo = prev_algo;
}
#ifdef DEBUG
char algorithm_name[128];
if (0 != theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name)) {
return 1;
};
// NB: This is printed only when algorithm is chosen at runtime.
if (reuse_algo)
fprintf(stderr, "(reused %s) ", algorithm_name);
else
fprintf(stderr, "(using %s) ", algorithm_name);
#endif
if (params->choose_once) {
reuse_algo = 1;
} else {
......@@ -203,15 +228,6 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
}
#ifdef DEBUG
char algorithm_name[128];
if (0 != theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name)) {
return 1;
};
// NB: This is printed only when algorithm is chosen at runtime.
fprintf(stderr, "(using %s) ", algorithm_name);
#endif
}
// The FFT implementation does not support strides, 1x1 filters or inputs
......
......@@ -83,6 +83,9 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
size_t output_offset = PyGpuArray_STRIDE(output, 0) / params->num_groups;
cudnnConvolutionBwdFilterAlgo_t algo = params->conv_algo;
#ifdef DEBUG
char algorithm_name[128];
#endif
cuda_enter(c->ctx);
......@@ -180,6 +183,16 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
}
algo = choice.algo;
#ifdef DEBUG
if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradweight algorithm found");
return 1;
} else {
fprintf(stderr, " (%d best-timed conv gradweight algorithms) ", count);
}
#endif
} else {
err = cudnnGetConvolutionBackwardFilterAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
......@@ -198,6 +211,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
algo = prev_algo;
}
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name)) {
return 1;
};
// NB: This is printed only when algorithm is chosen at runtime.
if (reuse_algo)
fprintf(stderr, "(reused %s) ", algorithm_name);
else
fprintf(stderr, "(using %s) ", algorithm_name);
#endif
if (params->choose_once) {
reuse_algo = 1;
} else {
......@@ -206,15 +230,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
}
#ifdef DEBUG
char algorithm_name[128];
if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name)) {
return 1;
};
// NB: This is printed only when algorithm is chosen at runtime.
fprintf(stderr, "(using %s) ", algorithm_name);
#endif
}
// The FFT implementation does not support strides, 1x1 filters or inputs
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论