提交 ad9646be authored 作者: --global's avatar --global

Implement implementation selection for GpuDnnConvGradI

上级 221fb064
...@@ -638,16 +638,25 @@ class GpuDnnConvGradI(DnnBase, COp): ...@@ -638,16 +638,25 @@ class GpuDnnConvGradI(DnnBase, COp):
:param descr: the convolution descriptor :param descr: the convolution descriptor
""" """
__props__ = ('inplace',) __props__ = ('workmem', 'inplace',)
__input_name__ = ('kernel', 'grad', 'output', __input_name__ = ('kernel', 'grad', 'output',
'descriptor', 'alpha', 'beta') 'descriptor', 'alpha', 'beta')
def __init__(self, inplace=False): def __init__(self, inplace=False, workmem=None):
COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gi.c"], COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_gi.c"],
"APPLY_SPECIFIC(conv_gi)") "APPLY_SPECIFIC(conv_gi)")
if workmem is None:
workmem = config.dnn.conv.workmem_bwd
self.workmem = workmem
self.inplace = inplace self.inplace = inplace
if self.inplace: if self.inplace:
self.destroy_map = {0: [2]} self.destroy_map = {0: [2]}
assert self.workmem in ['none', 'deterministic', 'fft', 'guess']
def __setstate__(self, d):
self.__dict__.update(d)
if not hasattr(self, 'workmem'):
self.workmem = 'none'
def grad(self, inp, grads): def grad(self, inp, grads):
kerns, top, output, desc, alpha, beta = inp kerns, top, output, desc, alpha, beta = inp
...@@ -669,9 +678,29 @@ class GpuDnnConvGradI(DnnBase, COp): ...@@ -669,9 +678,29 @@ class GpuDnnConvGradI(DnnBase, COp):
def get_op_params(self): def get_op_params(self):
if self.inplace: if self.inplace:
return [('CONV_INPLACE', '1')] inplace_def = [('CONV_INPLACE', '1')]
else: else:
return [] inplace_def = []
if version() == -1:
alg_def = ('CONV_ALGO', '0')
else:
if self.workmem == 'none':
alg_def = ('CONV_ALGO', 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0')
alg_choose_def = ('CHOOSE_ALGO', '0')
elif self.workmem == 'deterministic':
alg_def = ('CONV_ALGO', 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_1')
alg_choose_def = ('CHOOSE_ALGO', '0')
elif self.workmem == 'fft':
alg_def = ('CONV_ALGO', 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT')
alg_choose_def = ('CHOOSE_ALGO', '0')
elif self.workmem == 'guess':
# The convolution implementation should be choosen according
# to a heuristic
alg_def = ('CONV_ALGO', 'CUDNN_CONVOLUTION_BWD_DATA_ALGO_0')
alg_choose_def = ('CHOOSE_ALGO', '1')
return inplace_def + [alg_def, alg_choose_def]
def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None): def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None):
kern = as_cuda_ndarray_variable(kern) kern = as_cuda_ndarray_variable(kern)
......
...@@ -31,14 +31,156 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output, ...@@ -31,14 +31,156 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1)
return 1; return 1;
err = cudnnConvolutionBackwardData( {
_handle, size_t worksize;
(void *)&alpha, void *workspace;
APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns), cudnnConvolutionBwdDataAlgo_t chosen_algo;
APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
desc, if (CHOOSE_ALGO)
(void *)&beta, {
APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(*input)); // Check if the kernels and the output have the same shape as they have
// last time the apply node was executed
bool same_shapes = true;
for (int i = 0; (i < 4) && same_shapes; i++)
{
same_shapes &= (CudaNdarray_HOST_DIMS(kerns)[i] !=
APPLY_SPECIFIC(previous_kerns_shape)[i]);
same_shapes &= (CudaNdarray_HOST_DIMS(output)[i] !=
APPLY_SPECIFIC(previous_output_shape)[i]);
}
if (!same_shapes)
{
// The shape of the kernels and/or the output is different from the
// last execution. Use the current shapes to infer the implementation
// to use from now on.
// Get the amount of available memory
size_t free = 0, total = 0;
cudaError_t err2 = cudaMemGetInfo(&free, &total);
if (err2 != cudaSuccess){
cudaGetLastError();
fprintf(stderr,
"Error when trying to find the memory information"
" on the GPU: %s\n", cudaGetErrorString(err2));
return 1;
}
// Use heuristics to choose the implementation
err = cudnnGetConvolutionBackwardDataAlgorithm(_handle,
APPLY_SPECIFIC(kerns),
APPLY_SPECIFIC(output),
desc,
APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
free,
&chosen_algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConvGradI: error selecting convolution algo: %s",
cudnnGetErrorString(err));
return 1;
}
// Store the shapes of the kernels and output as well as the chosen
// algorithm for future use.
APPLY_SPECIFIC(previous_bwd_d_algo) = chosen_algo;
for (int i = 0; i < 4; i++)
{
APPLY_SPECIFIC(previous_kerns_shape)[i] =
CudaNdarray_HOST_DIMS(kerns)[i];
APPLY_SPECIFIC(previous_output_shape)[i] =
CudaNdarray_HOST_DIMS(output)[i];
}
}
else
{
// The shapes of the kernels and the output are the same as for the
// last execution. The convolution algorithm used last time can also
// be used here
chosen_algo = APPLY_SPECIFIC(previous_bwd_d_algo);
}
}
else
{
chosen_algo = CONV_ALGO;
}
// The FFT implementation does not support strides, 1x1 filters or
// inputs with a spatial dimension larger than 1024.
// If the chosen implementation is FFT, validate that it can be used
// on the current data and default on a safe implementation if it
// can't.
if (chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)
{
// Extract the properties of the convolution descriptor
int pad_h, pad_w, stride_v, stride_h, upscale_x, upscale_y;
cudnnConvolutionMode_t mode;
err = cudnnGetConvolution2dDescriptor(desc, &pad_h, &pad_w,
&stride_v, &stride_h,
&upscale_x, &upscale_y,
&mode);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConvGradI: error getting convolution properties: %s",
cudnnGetErrorString(err));
return 1;
}
// Extract the spatial size of the filters
int filter_h = CudaNdarray_HOST_DIMS(kerns)[3];
int filter_w = CudaNdarray_HOST_DIMS(kerns)[4];
// Extract the spatial size of the input
int input_h = CudaNdarray_HOST_DIMS(*input)[3];
int input_w = CudaNdarray_HOST_DIMS(*input)[4];
// Ensure that the selected implementation supports the requested
// convolution. Fall back to a safe implementation otherwise.
if (stride_v != 1 || stride_h != 1 || input_h > 1024 ||
input_w > 1024 || (filter_h == 1 && filter_w == 1))
{
chosen_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
}
}
// Infer required workspace size from the chosen implementation
err = cudnnGetConvolutionBackwardDataWorkspaceSize(_handle,
APPLY_SPECIFIC(kerns),
APPLY_SPECIFIC(output),
desc,
APPLY_SPECIFIC(input),
chosen_algo,
&worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConvGradI: error getting worksize: %s",
cudnnGetErrorString(err));
return 1;
}
// Allocate workspace for the convolution
workspace = get_work_mem(worksize);
if (workspace == NULL && worksize != 0)
return 1;
// Perform the convolution
err = cudnnConvolutionBackwardData_v3(
_handle,
(void *)&alpha,
APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
desc,
chosen_algo,
workspace, worksize,
(void *)&beta,
APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(*input));
}
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradI: error doing operation: %s", PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradI: error doing operation: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论