提交 2597dcd2 authored 作者: --global's avatar --global

Add support for implementation timing

上级 f9b85e1e
...@@ -350,7 +350,7 @@ class GpuDnnConvDesc(GpuOp): ...@@ -350,7 +350,7 @@ class GpuDnnConvDesc(GpuOp):
AddConfigVar('dnn.conv.workmem', AddConfigVar('dnn.conv.workmem',
"Default value for the workmem attribute of cudnn convolutions.", "Default value for the workmem attribute of cudnn convolutions.",
EnumStr('small', 'none', 'large'), EnumStr('small', 'none', 'large', 'guess', 'time'),
in_c_key=False) in_c_key=False)
# scalar constants # scalar constants
...@@ -397,7 +397,8 @@ class GpuDnnConv(DnnBase, COp): ...@@ -397,7 +397,8 @@ class GpuDnnConv(DnnBase, COp):
self.inplace = inplace self.inplace = inplace
if self.inplace: if self.inplace:
self.destroy_map = {0: [2]} self.destroy_map = {0: [2]}
assert self.workmem in ['none', 'small', 'large'] assert self.workmem in ['none', 'small', 'large', 'fft', 'time',
'guess']
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__.update(d) self.__dict__.update(d)
...@@ -417,18 +418,37 @@ class GpuDnnConv(DnnBase, COp): ...@@ -417,18 +418,37 @@ class GpuDnnConv(DnnBase, COp):
if self.workmem == 'none': if self.workmem == 'none':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM' alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM'
choose_alg = '0' choose_alg = '0'
choose_alg_time = '0'
elif self.workmem == 'small': elif self.workmem == 'small':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM' alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
choose_alg = '0' choose_alg = '0'
choose_alg_time = '0'
elif self.workmem == 'large': elif self.workmem == 'large':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM' alg = 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM'
choose_alg = '0' choose_alg = '0'
choose_alg_time = '0'
elif self.workmem == 'fft':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT'
choose_alg = '0'
choose_alg_time = '0'
elif self.workmem == 'guess':
# The convolution implementation should be chosen according
# to a heuristic
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
choose_alg = '1'
choose_alg_time = '0'
elif self.workmem == 'time': elif self.workmem == 'time':
alg = "0" # The convolution implementation should be chosen by timing
# every available implementation
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
choose_alg = '1' choose_alg = '1'
choose_alg_time = '1'
alg_def = ('CONV_ALGO', alg) alg_def = ('CONV_ALGO', alg)
alg_choose_def = ('CHOOSE_ALGO', choose_alg) alg_choose_def = ('CHOOSE_ALGO', choose_alg)
return [alg_def, alg_choose_def] + inpl_def alg_choose_time_def = ('CHOOSE_ALGO_TIME', choose_alg_time)
return [alg_def, alg_choose_def, alg_choose_time_def] + inpl_def
def make_node(self, img, kern, output, desc, alpha=None, beta=None): def make_node(self, img, kern, output, desc, alpha=None, beta=None):
img = as_cuda_ndarray_variable(img) img = as_cuda_ndarray_variable(img)
......
...@@ -42,6 +42,7 @@ APPLY_SPECIFIC(previous_kerns_shape)[0] = 0; ...@@ -42,6 +42,7 @@ APPLY_SPECIFIC(previous_kerns_shape)[0] = 0;
APPLY_SPECIFIC(previous_kerns_shape)[1] = 0; APPLY_SPECIFIC(previous_kerns_shape)[1] = 0;
APPLY_SPECIFIC(previous_kerns_shape)[2] = 0; APPLY_SPECIFIC(previous_kerns_shape)[2] = 0;
APPLY_SPECIFIC(previous_kerns_shape)[3] = 0; APPLY_SPECIFIC(previous_kerns_shape)[3] = 0;
APPLY_SPECIFIC(previous_algo) = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
#section cleanup_code_struct #section cleanup_code_struct
......
...@@ -35,7 +35,8 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -35,7 +35,8 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
void *workspace; void *workspace;
cudnnConvolutionFwdAlgo_t chosen_algo; cudnnConvolutionFwdAlgo_t chosen_algo;
if (CHOOSE_ALGO){ if (CHOOSE_ALGO)
{
// Check if the input and the kernels have the same shape as they have // Check if the input and the kernels have the same shape as they have
// last time the apply node was executed // last time the apply node was executed
...@@ -48,7 +49,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -48,7 +49,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
APPLY_SPECIFIC(previous_kerns_shape)[i]); APPLY_SPECIFIC(previous_kerns_shape)[i]);
} }
if (same_shapes) if (!same_shapes)
{ {
// The shape of the inputs and/or the kernels is different from the // The shape of the inputs and/or the kernels is different from the
// last execution. Use the current shapes to infer the implementation // last execution. Use the current shapes to infer the implementation
...@@ -62,10 +63,32 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -62,10 +63,32 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
fprintf(stderr, fprintf(stderr,
"Error when trying to find the memory information" "Error when trying to find the memory information"
" on the GPU: %s\n", cudaGetErrorString(err2)); " on the GPU: %s\n", cudaGetErrorString(err2));
return 1;
} }
// Obtain a convolution algorithm appropriate for the input and kernel // Obtain a convolution algorithm appropriate for the input and kernel
// shapes // shapes. Either by choosing one according to heuristics or by making
// CuDNN time every implementation and choose the best one.
if (CHOOSE_ALGO_TIME)
{
// Time the different implementations to choose the best one
int requestedCount = 2;
int count;
cudnnConvolutionFwdAlgoPerf_t choosen_algo_perf;
err = cudnnFindConvolutionForwardAlgorithm(_handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
APPLY_SPECIFIC(output),
requestedCount,
&count,
&choosen_algo_perf);
chosen_algo = choosen_algo_perf.algo;
fprintf(stdout, "Choose algo %i\n", chosen_algo);
}
else
{
// Use heuristics to choose the implementation
err = cudnnGetConvolutionForwardAlgorithm(_handle, err = cudnnGetConvolutionForwardAlgorithm(_handle,
APPLY_SPECIFIC(input), APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(kerns),
...@@ -74,6 +97,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -74,6 +97,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
free, free,
&chosen_algo); &chosen_algo);
}
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论