Commit f9b85e1e, authored by --global

Implement cudnn implementation selection for FWD pass

上级 2b83b6ac
......@@ -416,12 +416,19 @@ class GpuDnnConv(DnnBase, COp):
else:
if self.workmem == 'none':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM'
choose_alg = '0'
elif self.workmem == 'small':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
choose_alg = '0'
elif self.workmem == 'large':
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM'
choose_alg = '0'
elif self.workmem == 'time':
alg = "0"
choose_alg = '1'
alg_def = ('CONV_ALGO', alg)
return [alg_def] + inpl_def
alg_choose_def = ('CHOOSE_ALGO', choose_alg)
return [alg_def, alg_choose_def] + inpl_def
def make_node(self, img, kern, output, desc, alpha=None, beta=None):
img = as_cuda_ndarray_variable(img)
......
......@@ -3,6 +3,15 @@ cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);
/* Keep track, from one execution to another, of the dimension of the inputs
and the algorithm, if any, that was selected according to these dimensions
and the amount of memory available at that time.
*/
int APPLY_SPECIFIC(previous_input_shape)[4];
int APPLY_SPECIFIC(previous_kerns_shape)[4];
cudnnConvolutionFwdAlgo_t APPLY_SPECIFIC(previous_algo);
#section init_code_struct
cudnnStatus_t APPLY_SPECIFIC(err);
......@@ -20,11 +29,20 @@ if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output)))
FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate filter descriptor: %s",
PyErr_Format(PyExc_MemoryError, "could not allocate filter descriptor: %s",
cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
APPLY_SPECIFIC(previous_input_shape)[0] = 0;
APPLY_SPECIFIC(previous_input_shape)[1] = 0;
APPLY_SPECIFIC(previous_input_shape)[2] = 0;
APPLY_SPECIFIC(previous_input_shape)[3] = 0;
APPLY_SPECIFIC(previous_kerns_shape)[0] = 0;
APPLY_SPECIFIC(previous_kerns_shape)[1] = 0;
APPLY_SPECIFIC(previous_kerns_shape)[2] = 0;
APPLY_SPECIFIC(previous_kerns_shape)[3] = 0;
#section cleanup_code_struct
if (APPLY_SPECIFIC(input) != NULL)
......
......@@ -33,13 +33,86 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
{
size_t worksize;
void *workspace;
cudnnConvolutionFwdAlgo_t chosen_algo;

if (CHOOSE_ALGO){

  // Detect whether the input and/or kernel shapes differ from those seen
  // on the previous execution of this apply node.
  //
  // BUG FIX: the previous code initialized `same_shapes = true` and
  // and-ed it with `!=` comparisons. That inverted the flag's meaning
  // (true actually meant "different") and, because of the `&=`, the flag
  // only stayed true when EVERY dimension differed — a change in a single
  // dimension would silently reuse an algorithm cached for other shapes.
  // Accumulating with `|=` reports a change as soon as ANY dimension of
  // either the input or the kernels differs.
  bool shapes_changed = false;
  for (int i = 0; i < 4; i++)
  {
    shapes_changed |= (CudaNdarray_HOST_DIMS(input)[i] !=
                       APPLY_SPECIFIC(previous_input_shape)[i]);
    shapes_changed |= (CudaNdarray_HOST_DIMS(kerns)[i] !=
                       APPLY_SPECIFIC(previous_kerns_shape)[i]);
  }

  if (shapes_changed)
  {
    // The shape of the inputs and/or the kernels is different from the
    // last execution (or this is the first execution: the cached shapes
    // are zero-initialized in init_code_struct). Use the current shapes
    // to infer the implementation to use from now on.

    // Query the amount of free device memory so the selected algorithm's
    // workspace is guaranteed to fit. Renamed from `free`/`total` to
    // avoid shadowing the C library function free(3).
    size_t free_mem = 0, total_mem = 0;
    cudaError_t err2 = cudaMemGetInfo(&free_mem, &total_mem);
    if (err2 != cudaSuccess){
      // Clear the sticky CUDA error so later calls are not poisoned;
      // fall through with free_mem == 0, which makes cuDNN pick an
      // algorithm that needs no workspace.
      cudaGetLastError();
      fprintf(stderr,
              "Error when trying to find the memory information"
              " on the GPU: %s\n", cudaGetErrorString(err2));
    }

    // Ask cuDNN for a forward algorithm suited to these descriptors
    // whose workspace fits within the currently free device memory.
    err = cudnnGetConvolutionForwardAlgorithm(_handle,
                                              APPLY_SPECIFIC(input),
                                              APPLY_SPECIFIC(kerns),
                                              desc,
                                              APPLY_SPECIFIC(output),
                                              CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
                                              free_mem,
                                              &chosen_algo);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "GpuDnnConv: error selecting convolution algo: %s",
                   cudnnGetErrorString(err));
      return 1;
    }

    // Cache the shapes and the chosen algorithm so the next execution
    // with identical shapes can skip the (expensive) selection above.
    APPLY_SPECIFIC(previous_algo) = chosen_algo;
    for (int i = 0; i < 4; i++)
    {
      APPLY_SPECIFIC(previous_input_shape)[i] =
                                          CudaNdarray_HOST_DIMS(input)[i];
      APPLY_SPECIFIC(previous_kerns_shape)[i] =
                                          CudaNdarray_HOST_DIMS(kerns)[i];
    }
  }
  else
  {
    // Same shapes as the last execution: the cached algorithm is still
    // valid for these descriptors and can be reused as-is.
    chosen_algo = APPLY_SPECIFIC(previous_algo);
  }
}
else
{
  // Algorithm selection was fixed at compile time by the op's `workmem`
  // parameter (CHOOSE_ALGO == 0): use the hard-coded CONV_ALGO.
  chosen_algo = CONV_ALGO;
}
err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns),
desc,
APPLY_SPECIFIC(output),
CONV_ALGO,
chosen_algo,
&worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
......@@ -58,7 +131,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns),
desc,
CONV_ALGO,
chosen_algo,
workspace, worksize,
(void *)&beta,
APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(*output));
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论