提交 e6b96472 authored 作者: notoraptor's avatar notoraptor

Add DEBUG messages into cuDNN conv codes.

Fix docstring for c_init_code_struct. Update support_code_struct for dnn_gi.
上级 743f7aa9
...@@ -442,7 +442,7 @@ class CLinkerOp(CLinkerObject): ...@@ -442,7 +442,7 @@ class CLinkerOp(CLinkerObject):
The subclass does not override this method. The subclass does not override this method.
""" """
raise utils.MethodNotDefined("c_init_code_apply", type(self), raise utils.MethodNotDefined("c_init_code_struct", type(self),
self.__class__.__name__) self.__class__.__name__)
def c_support_code_struct(self, node, name): def c_support_code_struct(self, node, name):
......
...@@ -83,6 +83,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -83,6 +83,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
size_t output_offset = PyGpuArray_STRIDE(*output, 0) / params->num_groups; size_t output_offset = PyGpuArray_STRIDE(*output, 0) / params->num_groups;
cudnnConvolutionFwdAlgo_t algo = params->conv_algo; cudnnConvolutionFwdAlgo_t algo = params->conv_algo;
#ifdef DEBUG
char algorithm_name[128];
#endif
cuda_enter(c->ctx); cuda_enter(c->ctx);
...@@ -138,6 +141,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -138,6 +141,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1; return 1;
} }
algo = choice.algo; algo = choice.algo;
#ifdef DEBUG
if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv fwd algorithm found");
return 1;
} else {
fprintf(stderr, " (%d best-timed conv fwd algorithms) ", count);
}
#endif
} else { } else {
err = cudnnGetConvolutionForwardAlgorithm( err = cudnnGetConvolutionForwardAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
...@@ -156,6 +169,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -156,6 +169,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
algo = prev_algo; algo = prev_algo;
} }
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) {
return 1;
};
// NB: This is printed only when algorithm is chosen at runtime.
if (reuse_algo)
fprintf(stderr, "(reused %s) ", algorithm_name);
else
fprintf(stderr, "(using %s) ", algorithm_name);
#endif
if (params->choose_once) { if (params->choose_once) {
reuse_algo = 1; reuse_algo = 1;
} else { } else {
...@@ -164,15 +188,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -164,15 +188,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i); prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
} }
} }
#ifdef DEBUG
char algorithm_name[128];
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) {
return 1;
};
// NB: This is printed only when algorithm is chosen at runtime.
fprintf(stderr, "(using %s) ", algorithm_name);
#endif
} }
/* Only these algos are supported for 3d conv with cuDNN >= V5.1. */ /* Only these algos are supported for 3d conv with cuDNN >= V5.1. */
...@@ -180,14 +195,28 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -180,14 +195,28 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
!(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM || !(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM ||
algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM || algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING)) algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING))
{
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) {
return 1;
};
fprintf(stderr, "(%s unsupported for 3D: fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM) ", algorithm_name);
#endif
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
// Algo `small` does not work for a batch size > 2^16, with cuDNN >= V5.1. // Algo `small` does not work for a batch size > 2^16, with cuDNN >= V5.1.
// Issue should be resolved for cuDNN > V6.0. // Issue should be resolved for cuDNN > V6.0.
if (cudnnGetVersion() < 6100 && if (cudnnGetVersion() < 6100 &&
algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM && algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM &&
PyGpuArray_DIM(input, 0) > 65536) PyGpuArray_DIM(input, 0) > 65536)
{
#ifdef DEBUG
fprintf(stderr, "(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM "
"will fail with batch size > 2^16, fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM) ");
#endif
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
// The FFT implementation does not support strides, 1x1 filters or inputs // The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation // with a spatial dimension larger than 1024. The tiled-FFT implementation
...@@ -197,6 +226,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -197,6 +226,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
// can't. // can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters // defined only for 2d filters
/* NB:
TODO: These checkings seems outdated for FFT algorithms with cuDNN >= 5.1.
New conditions apply and may depend on number of dimensions (2D or 3D)
e.g. for FFT_TILING.
TODO: More globally, how to handle CUDNN_STATUS_NOT_SUPPORTED with unsupported algorithms?
*/
if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT || if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && PyGpuArray_NDIM(input) == 4) { algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && PyGpuArray_NDIM(input) == 4) {
...@@ -245,7 +280,15 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -245,7 +280,15 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
if (err == CUDNN_STATUS_NOT_SUPPORTED) { if (err == CUDNN_STATUS_NOT_SUPPORTED) {
// Fallback to none algo if not supported // Fallback to none algo if not supported
// TODO: Print a warning
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) {
return 1;
};
fprintf(stderr, "(%s error getting worksize: "
"fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM) ", algorithm_name);
#endif
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
err = cudnnGetConvolutionForwardWorkspaceSize(params->handle, err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
......
...@@ -13,8 +13,8 @@ if (PARAMS->choose_algo) { ...@@ -13,8 +13,8 @@ if (PARAMS->choose_algo) {
int reuse_algo; int reuse_algo;
cudnnConvolutionBwdDataAlgo_t prev_algo; cudnnConvolutionBwdDataAlgo_t prev_algo;
size_t prev_kern_dims[5] = {0}; size_t prev_kern_dims[5];
size_t prev_top_dims[5] = {0}; size_t prev_top_dims[5];
int int
APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
...@@ -82,6 +82,9 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -82,6 +82,9 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
size_t output_offset = PyGpuArray_STRIDE(output, 0) / params->num_groups; size_t output_offset = PyGpuArray_STRIDE(output, 0) / params->num_groups;
cudnnConvolutionBwdDataAlgo_t algo = params->conv_algo; cudnnConvolutionBwdDataAlgo_t algo = params->conv_algo;
#ifdef DEBUG
char algorithm_name[128];
#endif
cuda_enter(c->ctx); cuda_enter(c->ctx);
...@@ -178,6 +181,16 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -178,6 +181,16 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
} }
algo = choice.algo; algo = choice.algo;
#ifdef DEBUG
if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradinput algorithm found");
return 1;
} else {
fprintf(stderr, " (%d best-timed conv gradinput algorithms) ", count);
}
#endif
} else { } else {
err = cudnnGetConvolutionBackwardDataAlgorithm( err = cudnnGetConvolutionBackwardDataAlgorithm(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
...@@ -195,23 +208,26 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -195,23 +208,26 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
algo = prev_algo; algo = prev_algo;
} }
if (params->choose_once) {
reuse_algo = 1;
} else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
}
#ifdef DEBUG #ifdef DEBUG
char algorithm_name[128]; char algorithm_name[128];
if (0 != theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name)) { if (0 != theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name)) {
return 1; return 1;
}; };
// NB: This is printed only when algorithm is chosen at runtime. // NB: This is printed only when algorithm is chosen at runtime.
if (reuse_algo)
fprintf(stderr, "(reused %s) ", algorithm_name);
else
fprintf(stderr, "(using %s) ", algorithm_name); fprintf(stderr, "(using %s) ", algorithm_name);
#endif #endif
if (params->choose_once) {
reuse_algo = 1;
} else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
}
} }
// The FFT implementation does not support strides, 1x1 filters or inputs // The FFT implementation does not support strides, 1x1 filters or inputs
......
...@@ -83,6 +83,9 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -83,6 +83,9 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
size_t output_offset = PyGpuArray_STRIDE(output, 0) / params->num_groups; size_t output_offset = PyGpuArray_STRIDE(output, 0) / params->num_groups;
cudnnConvolutionBwdFilterAlgo_t algo = params->conv_algo; cudnnConvolutionBwdFilterAlgo_t algo = params->conv_algo;
#ifdef DEBUG
char algorithm_name[128];
#endif
cuda_enter(c->ctx); cuda_enter(c->ctx);
...@@ -180,6 +183,16 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -180,6 +183,16 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
} }
algo = choice.algo; algo = choice.algo;
#ifdef DEBUG
if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradweight algorithm found");
return 1;
} else {
fprintf(stderr, " (%d best-timed conv gradweight algorithms) ", count);
}
#endif
} else { } else {
err = cudnnGetConvolutionBackwardFilterAlgorithm( err = cudnnGetConvolutionBackwardFilterAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
...@@ -198,6 +211,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -198,6 +211,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
algo = prev_algo; algo = prev_algo;
} }
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name)) {
return 1;
};
// NB: This is printed only when algorithm is chosen at runtime.
if (reuse_algo)
fprintf(stderr, "(reused %s) ", algorithm_name);
else
fprintf(stderr, "(using %s) ", algorithm_name);
#endif
if (params->choose_once) { if (params->choose_once) {
reuse_algo = 1; reuse_algo = 1;
} else { } else {
...@@ -206,15 +230,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -206,15 +230,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
prev_top_dims[i] = PyGpuArray_DIM(output, i); prev_top_dims[i] = PyGpuArray_DIM(output, i);
} }
} }
#ifdef DEBUG
char algorithm_name[128];
if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name)) {
return 1;
};
// NB: This is printed only when algorithm is chosen at runtime.
fprintf(stderr, "(using %s) ", algorithm_name);
#endif
} }
// The FFT implementation does not support strides, 1x1 filters or inputs // The FFT implementation does not support strides, 1x1 filters or inputs
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论