Commit 89199807 authored by --global

Add 'time_once' and 'guess_once' options for DnnConv3d algo selection

Parent 680d2034
......@@ -341,7 +341,8 @@ class GpuDnnConvDesc(GpuOp):
# Register the config flag controlling which cuDNN forward-convolution
# algorithm GpuDnnConv uses by default.  In addition to the fixed choices
# ('small', 'none', 'large', 'fft'), 'guess'/'time' re-select an algorithm
# whenever the input shapes change, while the '*_once' variants select one
# only on the first execution of the apply node and then reuse it.
# The diff rendering had left both the old and the new EnumStr argument in
# place (two EnumStr calls); only the post-commit one is kept here.
AddConfigVar('dnn.conv.workmem',
             "Default value for the workmem attribute of cudnn convolutions.",
             EnumStr('small', 'none', 'large', 'fft', 'guess',
                     'guess_once', 'time', 'time_once'),
             in_c_key=False)
AddConfigVar('dnn.conv.workmem_bwd',
......@@ -399,8 +400,9 @@ class GpuDnnConv(DnnBase, COp):
def __init__(self, workmem=None, inplace=False):
"""
:param workmem: either 'none', 'small', 'large', 'fft', 'time' or
'guess'. Default is the value of :attr:`config.dnn.conv.workmem`.
:param workmem: either 'none', 'small', 'large', 'fft', 'time',
'time_once', 'guess' or 'guess_once'. Default is the value of
:attr:`config.dnn.conv.workmem`.
"""
COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_fwd.c"],
"APPLY_SPECIFIC(conv_fwd)")
......@@ -421,7 +423,7 @@ class GpuDnnConv(DnnBase, COp):
raise RuntimeError("CuDNN convolution timing requires CuDNN v3")
assert self.workmem in ['none', 'small', 'large', 'fft', 'time',
'guess']
'time_once', 'guess', 'guess_once']
def __setstate__(self, d):
self.__dict__.update(d)
......@@ -437,6 +439,7 @@ class GpuDnnConv(DnnBase, COp):
inpl_def = []
choose_alg = '0'
choose_alg_once = '0'
choose_alg_time = '0'
if version() == -1:
alg = "0"
......@@ -449,23 +452,29 @@ class GpuDnnConv(DnnBase, COp):
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM'
elif self.workmem == 'fft':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT'
elif self.workmem == 'guess':
elif self.workmem in ['guess', 'guess_once']:
# The convolution implementation should be choosen according
# to a heuristic
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
choose_alg = '1'
elif self.workmem == 'time':
if self.workmem == 'guess_once':
choose_alg_once = '1'
elif self.workmem in ['time', 'time_once']:
# The convolution implementation should be choosen by timing
# every available implementation
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
choose_alg = '1'
choose_alg_time = '1'
if self.workmem == 'time_once':
choose_alg_once = '1'
alg_def = ('CONV_ALGO', alg)
alg_choose_def = ('CHOOSE_ALGO', choose_alg)
alg_choose_once_def = ('CHOOSE_ALGO_ONCE', choose_alg_once)
alg_choose_time_def = ('CHOOSE_ALGO_TIME', choose_alg_time)
return [alg_def, alg_choose_def, alg_choose_time_def] + inpl_def
return [alg_def, alg_choose_def, alg_choose_once_def,
alg_choose_time_def] + inpl_def
def make_node(self, img, kern, output, desc, alpha=None, beta=None):
img = as_cuda_ndarray_variable(img)
......@@ -556,11 +565,13 @@ class GpuDnnConv3d(GpuDnnConv):
def __init__(self, workmem=None, inplace=False):
    """
    :param workmem: either 'none', 'time', 'time_once', 'guess' or
        'guess_once'.
        Default is the value of :attr:`config.dnn.conv.workmem`.
    :param inplace: whether the output may reuse the memory of the
        provided output buffer (forwarded to the parent constructor).
    """
    # The diff rendering had duplicated the pre-change docstring line and
    # the pre-change assert next to their replacements; only the
    # post-commit versions are kept here.
    # NOTE(review): the `workmem` argument is ignored and 'guess' is
    # always forwarded to the parent constructor, so the assert below can
    # never see 'time'/'time_once'/'guess_once' -- confirm whether
    # `workmem=workmem` was intended instead.
    super(GpuDnnConv3d, self).__init__(workmem='guess', inplace=inplace)
    assert self.workmem in ['none', 'time', 'time_once', 'guess',
                            'guess_once']
def make_node(self, img, kern, output, desc, alpha=None, beta=None):
......
......@@ -13,6 +13,7 @@ at V3.
int APPLY_SPECIFIC(previous_input_shape)[5];
int APPLY_SPECIFIC(previous_kerns_shape)[5];
int APPLY_SPECIFIC(previous_output_shape)[5];
bool APPLY_SPECIFIC(previous_algo_set);
cudnnConvolutionFwdAlgo_t APPLY_SPECIFIC(previous_algo);
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
......@@ -49,6 +50,8 @@ for (int i = 0; i < 5; i++)
APPLY_SPECIFIC(previous_output_shape)[i] = 0;
}
APPLY_SPECIFIC(previous_algo_set) = false;
// Select default implementations for the case where the convolution
// implementations should be selected based on the size of the data.
APPLY_SPECIFIC(previous_algo) = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
......
......@@ -41,34 +41,40 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
if (CHOOSE_ALGO)
{
// Check if the input and the kernels have the same shape as they have
// last time the apply node was executed
bool same_shapes = true;
for (int i = 0; (i < nb_dim) && same_shapes; i++)
// A new convolution implementation should be selected, based either on
// timing or heuristics if in one of the two following cases :
// - The implementation should only be chosen during the first execution
// of an apply node and this is the first execution of the apply node.
// - The implementation should be chosen as often as necessary and the
// shapes of the inputs differ from the last time an implementation
// was chosen.
bool reuse_previous_algo;
if (CHOOSE_ALGO_ONCE)
{
// Only choose a new implementation of none has been chosen before.
reuse_previous_algo = APPLY_SPECIFIC(previous_algo_set);
}
else
{
// Reuse the previous implementation if the inputs and the kernels
// have the same shapes as they had when the previous implementation
// was selected
bool same_shapes = true;
for (int i = 0; (i < nb_dim) && same_shapes; i++)
{
same_shapes &= (CudaNdarray_HOST_DIMS(input)[i] ==
APPLY_SPECIFIC(previous_input_shape)[i]);
same_shapes &= (CudaNdarray_HOST_DIMS(kerns)[i] ==
APPLY_SPECIFIC(previous_kerns_shape)[i]);
}
reuse_previous_algo = same_shapes;
}
if (!same_shapes)
// If the previously choosen implementation can't be reused, select a
// new one based on the shapes of the current inputs
if (!reuse_previous_algo)
{
// The shape of the inputs and/or the kernels is different from the
// last execution. Use the current shapes to infer the implementation
// to use from now on.
// Get the amount of available memory
size_t free = 0, total = 0;
cudaError_t err2 = cudaMemGetInfo(&free, &total);
if (err2 != cudaSuccess){
cudaGetLastError();
fprintf(stderr,
"Error when trying to find the memory information"
" on the GPU: %s\n", cudaGetErrorString(err2));
return 1;
}
// Obtain a convolution algorithm appropriate for the input and kernel
// shapes. Either by choosing one according to heuristics or by making
......@@ -100,6 +106,20 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
}
else
{
// The implementation should be chosen using heuristics based on the
// input shapes and the amount of memory available.
// Get the amount of available memory
size_t free = 0, total = 0;
cudaError_t err2 = cudaMemGetInfo(&free, &total);
if (err2 != cudaSuccess){
cudaGetLastError();
fprintf(stderr,
"Error when trying to find the memory information"
" on the GPU: %s\n", cudaGetErrorString(err2));
return 1;
}
// Use heuristics to choose the implementation
err = cudnnGetConvolutionForwardAlgorithm(_handle,
APPLY_SPECIFIC(input),
......@@ -131,9 +151,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
}
else
{
// The shapes of the inputs and the kernels are the same as for the
// last execution. The convolution algorithm used last time can also
// be used here
// Reuse the previously chosen convolution implementation
chosen_algo = APPLY_SPECIFIC(previous_algo);
}
}
......
Markdown formatting is supported
0%
You are adding 0 people to this discussion. Please proceed with care.
Please finish editing this comment first!
Register or sign in to post a comment