提交 c4df0b99 authored 作者: --global's avatar --global

Modify GpuDnnConvGradW to support timing feature

上级 a5fe8e38
...@@ -695,8 +695,8 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -695,8 +695,8 @@ class GpuDnnConvGradW(DnnBase, COp):
def __init__(self, inplace=False, workmem=None, algo=None): def __init__(self, inplace=False, workmem=None, algo=None):
""" """
:param workmem: *deprecated*, use param algo instead :param workmem: *deprecated*, use param algo instead
:param algo: either 'none', 'deterministic', 'fft', 'guess_once' or :param algo: either 'none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change'. 'guess_on_shape_change', 'time_once' or 'time_on_shape_change'.
Default is the value of :attr:`config.dnn.conv.algo_bwd`. Default is the value of :attr:`config.dnn.conv.algo_bwd`.
""" """
COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gw.c"], COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gw.c"],
...@@ -716,7 +716,8 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -716,7 +716,8 @@ class GpuDnnConvGradW(DnnBase, COp):
if self.inplace: if self.inplace:
self.destroy_map = {0: [2]} self.destroy_map = {0: [2]}
assert self.algo in ['none', 'deterministic', 'fft', 'guess_once', assert self.algo in ['none', 'deterministic', 'fft', 'guess_once',
'guess_on_shape_change'] 'guess_on_shape_change', 'time_once',
'time_on_shape_change']
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__.update(d) self.__dict__.update(d)
...@@ -752,29 +753,42 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -752,29 +753,42 @@ class GpuDnnConvGradW(DnnBase, COp):
else: else:
inplace_def = [] inplace_def = []
alg_choose_once_def = ('CHOOSE_ALGO_ONCE', '0') choose_alg = '0'
choose_alg_once = '0'
choose_alg_time = '0'
if version() == -1 or version() < (3000, 3000): if version() == -1 or version() < (3000, 3000):
alg_def = ('CONV_ALGO', '0') alg = "0"
alg_choose_def = ('CHOOSE_ALGO', '0')
else: else:
if self.algo == 'none': if self.algo == 'none':
alg_def = ('CONV_ALGO', 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0') alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0'
alg_choose_def = ('CHOOSE_ALGO', '0')
elif self.algo == 'deterministic': elif self.algo == 'deterministic':
alg_def = ('CONV_ALGO', 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1') alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1'
alg_choose_def = ('CHOOSE_ALGO', '0')
elif self.algo == 'fft': elif self.algo == 'fft':
alg_def = ('CONV_ALGO', 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT') alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT'
alg_choose_def = ('CHOOSE_ALGO', '0')
elif self.algo in ['guess_once', 'guess_on_shape_change']: elif self.algo in ['guess_once', 'guess_on_shape_change']:
# The convolution implementation should be choosen according # The convolution implementation should be chosen according
# to a heuristic # to a heuristic
alg_def = ('CONV_ALGO', 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0') alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0'
alg_choose_def = ('CHOOSE_ALGO', '1') choose_alg = '1'
if self.algo == 'guess_once': if self.algo == 'guess_once':
alg_choose_once_def = ('CHOOSE_ALGO_ONCE', '1') choose_alg_once = '1'
            elif self.algo in ['time_once', 'time_on_shape_change']:
# The convolution implementation should be chosen according
# to timing
alg = 'CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0'
choose_alg = '1'
choose_alg_time = '1'
if self.algo == 'time_once':
choose_alg_once = '1'
return inplace_def + [alg_def, alg_choose_def, alg_choose_once_def] alg_def = ('CONV_ALGO', alg)
alg_choose_def = ('CHOOSE_ALGO', choose_alg)
alg_choose_once_def = ('CHOOSE_ALGO_ONCE', choose_alg_once)
alg_choose_time_def = ('CHOOSE_ALGO_TIME', choose_alg_time)
return inplace_def + [alg_def, alg_choose_def, alg_choose_once_def,
alg_choose_time_def]
def make_node(self, img, topgrad, output, desc, alpha=None, beta=None): def make_node(self, img, topgrad, output, desc, alpha=None, beta=None):
img = as_cuda_ndarray_variable(img) img = as_cuda_ndarray_variable(img)
...@@ -828,7 +842,8 @@ class GpuDnnConv3dGradW(GpuDnnConvGradW): ...@@ -828,7 +842,8 @@ class GpuDnnConv3dGradW(GpuDnnConvGradW):
super(GpuDnnConv3dGradW, self).__init__(inplace=inplace, super(GpuDnnConv3dGradW, self).__init__(inplace=inplace,
algo='none') algo='none')
assert self.algo in ['none', 'guess_once', 'guess_on_shape_change'] assert self.algo in ['none', 'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']
def grad(self, inp, grads): def grad(self, inp, grads):
img, top, output, desc, alpha, beta = inp img, top, output, desc, alpha, beta = inp
......
...@@ -42,8 +42,8 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output, ...@@ -42,8 +42,8 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
if (CHOOSE_ALGO) if (CHOOSE_ALGO)
{ {
// A new convolution implementation should be selected, based on // A new convolution implementation should be selected, based either on
// heuristics, if in one of the two following cases : // timing or heuristics, if in one of the two following cases :
// - The implementation should only be chosen during the first execution // - The implementation should only be chosen during the first execution
// of an apply node and this is the first execution of the apply node. // of an apply node and this is the first execution of the apply node.
// - The implementation should be chosen as often as necessary and the // - The implementation should be chosen as often as necessary and the
...@@ -75,35 +75,64 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output, ...@@ -75,35 +75,64 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
// new one based on the shapes of the current inputs // new one based on the shapes of the current inputs
if (!reuse_previous_algo) if (!reuse_previous_algo)
{ {
// Choose the convolution implementation using heuristics based on the // Obtain a convolution algorithm appropriate for the input and output
// shapes of the inputs and the amount of memory available. // shapes. Either by choosing one according to heuristics or by making
// CuDNN time every implementation and choose the best one.
// Get the amount of available memory if (CHOOSE_ALGO_TIME)
size_t free = 0, total = 0; {
cudaError_t err2 = cudaMemGetInfo(&free, &total); // Time the different implementations to choose the best one
if (err2 != cudaSuccess){ int requestedCount = 1;
cudaGetLastError(); int count;
              fprintf(stderr,	cudnnConvolutionBwdFilterAlgoPerf_t chosen_algo_perf;
"Error when trying to find the memory information" err = cudnnFindConvolutionBackwardFilterAlgorithm(_handle,
" on the GPU: %s\n", cudaGetErrorString(err2)); APPLY_SPECIFIC(input),
return 1; APPLY_SPECIFIC(output),
desc,
APPLY_SPECIFIC(kerns),
requestedCount,
&count,
	&chosen_algo_perf);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConvGradW: error selecting convolution algo: "
"%s", cudnnGetErrorString(err));
return 1;
}
	chosen_algo = chosen_algo_perf.algo;
} }
else
// Use heuristics to choose the implementation {
err = cudnnGetConvolutionBackwardFilterAlgorithm(_handle, // Choose the convolution implementation using heuristics based on the
APPLY_SPECIFIC(input), // shapes of the inputs and the amount of memory available.
APPLY_SPECIFIC(output),
desc, // Get the amount of available memory
APPLY_SPECIFIC(kerns), size_t free = 0, total = 0;
CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, cudaError_t err2 = cudaMemGetInfo(&free, &total);
free, if (err2 != cudaSuccess){
&chosen_algo); cudaGetLastError();
fprintf(stderr,
if (err != CUDNN_STATUS_SUCCESS) { "Error when trying to find the memory information"
PyErr_Format(PyExc_RuntimeError, " on the GPU: %s\n", cudaGetErrorString(err2));
"GpuDnnConvGradW: error selecting convolution algo: %s", return 1;
cudnnGetErrorString(err)); }
return 1;
// Use heuristics to choose the implementation
err = cudnnGetConvolutionBackwardFilterAlgorithm(_handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(output),
desc,
APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_SPECIFY_WORKSPACE_LIMIT,
free,
&chosen_algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConvGradW: error selecting convolution algo: %s",
cudnnGetErrorString(err));
return 1;
}
} }
// Store the shapes of the inputs and kernels as well as the chosen // Store the shapes of the inputs and kernels as well as the chosen
...@@ -129,8 +158,8 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output, ...@@ -129,8 +158,8 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
chosen_algo = CONV_ALGO; chosen_algo = CONV_ALGO;
} }
// The FFT implementation does not support strides, 1x1 filters or // The FFT implementation (only in v3 and onward) does not support strides,
// inputs with a spatial dimension larger than 1024. // 1x1 filters or inputs with a spatial dimension larger than 1024.
// If the chosen implementation is FFT, validate that it can be used // If the chosen implementation is FFT, validate that it can be used
// on the current data and default on a safe implementation if it // on the current data and default on a safe implementation if it
// can't. // can't.
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论