提交 4ad36ddc authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #3788 from nouiz/carriepl-v4

Rebased cudnn v4
...@@ -219,30 +219,91 @@ AddConfigVar('gpuarray.sync', ...@@ -219,30 +219,91 @@ AddConfigVar('gpuarray.sync',
BoolParam(False), BoolParam(False),
in_c_key=True) in_c_key=True)
def safe_no_dnn_workmem(workmem):
    """
    Reject any attempt to set the removed `dnn.conv.workmem` flag.

    Acts as a config filter: returns True when the flag is unset
    (empty/falsy) and raises otherwise.
    """
    # An empty value means the deprecated flag was left untouched.
    if not workmem:
        return True
    raise RuntimeError(
        'The option `dnn.conv.workmem` has been removed and should '
        'not be used anymore. Please use the option '
        '`dnn.conv.algo_fwd` instead.')
AddConfigVar('dnn.conv.workmem', AddConfigVar('dnn.conv.workmem',
"This flag is deprecated; use dnn.conv.algo_fwd.", "This flag is deprecated; use dnn.conv.algo_fwd.",
EnumStr(''), ConfigParam('', allow_override=False, filter=safe_no_dnn_workmem),
in_c_key=False) in_c_key=False)
def safe_no_dnn_workmem_bwd(workmem):
    """
    Reject any attempt to set the removed `dnn.conv.workmem_bwd` flag.

    Acts as a config filter: returns True when the flag is unset
    (empty/falsy) and raises otherwise.
    """
    # An empty value means the deprecated flag was left untouched.
    if not workmem:
        return True
    raise RuntimeError(
        'The option `dnn.conv.workmem_bwd` has been removed and '
        'should not be used anymore. Please use the options '
        '`dnn.conv.algo_bwd_filter` and `dnn.conv.algo_bwd_data` instead.')
AddConfigVar('dnn.conv.workmem_bwd', AddConfigVar('dnn.conv.workmem_bwd',
"This flag is deprecated; use dnn.conv.algo_bwd.", "This flag is deprecated; use dnn.conv.algo_bwd.",
EnumStr(''), ConfigParam('', allow_override=False,
filter=safe_no_dnn_workmem_bwd),
in_c_key=False)
def safe_no_dnn_algo_bwd(algo):
    """
    Reject any attempt to set the removed `dnn.conv.algo_bwd` flag.

    Acts as a config filter: returns True when the flag is unset
    (empty/falsy) and raises otherwise.
    """
    # An empty value means the deprecated flag was left untouched.
    if not algo:
        return True
    raise RuntimeError(
        'The option `dnn.conv.algo_bwd` has been removed and '
        'should not be used anymore. Please use the options '
        '`dnn.conv.algo_bwd_filter` and `dnn.conv.algo_bwd_data` instead.')
AddConfigVar('dnn.conv.algo_bwd',
"This flag is deprecated; use dnn.conv.algo_bwd_data and "
"dnn.conv.algo_bwd_filter.",
ConfigParam('', allow_override=False,
filter=safe_no_dnn_algo_bwd),
in_c_key=False) in_c_key=False)
AddConfigVar('dnn.conv.algo_fwd', AddConfigVar('dnn.conv.algo_fwd',
"Default implementation to use for CuDNN forward convolution.", "Default implementation to use for CuDNN forward convolution.",
EnumStr('small', 'none', 'large', 'fft', 'guess_once', EnumStr('small', 'none', 'large', 'fft', 'fft_tiling',
'guess_on_shape_change', 'time_once', 'guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change'),
in_c_key=False)
AddConfigVar('dnn.conv.algo_bwd_data',
"Default implementation to use for CuDNN backward convolution to "
"get the gradients of the convolution with regard to the inputs.",
EnumStr('none', 'deterministic', 'fft', 'fft_tiling',
'guess_once', 'guess_on_shape_change', 'time_once',
'time_on_shape_change'), 'time_on_shape_change'),
in_c_key=False) in_c_key=False)
AddConfigVar('dnn.conv.algo_bwd', AddConfigVar('dnn.conv.algo_bwd_filter',
"Default implementation to use for CuDNN backward convolution.", "Default implementation to use for CuDNN backward convolution to "
EnumStr('none', 'deterministic', 'fft', 'guess_once', "get the gradients of the convolution with regard to the "
"filters.",
EnumStr('none', 'deterministic', 'fft', 'small', 'guess_once',
'guess_on_shape_change', 'time_once', 'guess_on_shape_change', 'time_once',
'time_on_shape_change'), 'time_on_shape_change'),
in_c_key=False) in_c_key=False)
AddConfigVar('dnn.conv.precision',
"Default data precision to use for the computation in CuDNN "
"convolutions (defaults to the same dtype as the inputs of the "
"convolutions).",
EnumStr('as_input', 'float16', 'float32', 'float64'),
in_c_key=False)
def default_dnn_path(suffix): def default_dnn_path(suffix):
def f(suffix=suffix): def f(suffix=suffix):
......
...@@ -3,6 +3,15 @@ ...@@ -3,6 +3,15 @@
#include <cudnn.h> #include <cudnn.h>
// If needed, define element of the V4 interface in terms of elements of
// previous versions
#if defined(CUDNN_VERSION) && CUDNN_VERSION < 4000
#define CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING 5
#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING 3
#endif
#ifndef CUDNN_VERSION #ifndef CUDNN_VERSION
#include <assert.h> #include <assert.h>
......
差异被折叠。
...@@ -15,11 +15,8 @@ int APPLY_SPECIFIC(previous_kerns_shape)[5]; ...@@ -15,11 +15,8 @@ int APPLY_SPECIFIC(previous_kerns_shape)[5];
int APPLY_SPECIFIC(previous_output_shape)[5]; int APPLY_SPECIFIC(previous_output_shape)[5];
bool APPLY_SPECIFIC(previous_algo_set); bool APPLY_SPECIFIC(previous_algo_set);
cudnnConvolutionFwdAlgo_t APPLY_SPECIFIC(previous_algo); cudnnConvolutionFwdAlgo_t APPLY_SPECIFIC(previous_algo);
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
cudnnConvolutionBwdFilterAlgo_t APPLY_SPECIFIC(previous_bwd_f_algo); cudnnConvolutionBwdFilterAlgo_t APPLY_SPECIFIC(previous_bwd_f_algo);
cudnnConvolutionBwdDataAlgo_t APPLY_SPECIFIC(previous_bwd_d_algo); cudnnConvolutionBwdDataAlgo_t APPLY_SPECIFIC(previous_bwd_d_algo);
#endif
#section init_code_struct #section init_code_struct
...@@ -55,10 +52,8 @@ APPLY_SPECIFIC(previous_algo_set) = false; ...@@ -55,10 +52,8 @@ APPLY_SPECIFIC(previous_algo_set) = false;
// Select default implementations for the case where the convolution // Select default implementations for the case where the convolution
// implementations should be selected based on the size of the data. // implementations should be selected based on the size of the data.
APPLY_SPECIFIC(previous_algo) = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; APPLY_SPECIFIC(previous_algo) = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
APPLY_SPECIFIC(previous_bwd_f_algo) = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0; APPLY_SPECIFIC(previous_bwd_f_algo) = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
APPLY_SPECIFIC(previous_bwd_d_algo) = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0; APPLY_SPECIFIC(previous_bwd_d_algo) = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
#endif
#section cleanup_code_struct #section cleanup_code_struct
......
...@@ -81,7 +81,6 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -81,7 +81,6 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
// CuDNN time every implementation and choose the best one. // CuDNN time every implementation and choose the best one.
if (CHOOSE_ALGO_TIME) if (CHOOSE_ALGO_TIME)
{ {
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
// Time the different implementations to choose the best one // Time the different implementations to choose the best one
int requestedCount = 1; int requestedCount = 1;
int count; int count;
...@@ -102,7 +101,6 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -102,7 +101,6 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
} }
chosen_algo = choosen_algo_perf.algo; chosen_algo = choosen_algo_perf.algo;
#endif
} }
else else
{ {
...@@ -161,24 +159,28 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -161,24 +159,28 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
chosen_algo = CONV_ALGO; chosen_algo = CONV_ALGO;
} }
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
// The FFT implementation (only in V3 and onward) does not support strides, // The FFT implementation (only in V3 and onward) does not support strides,
// 1x1 filters or inputs with a spatial dimension larger than 1024. // 1x1 filters or inputs with a spatial dimension larger than 1024.
// If the chosen implementation is FFT, validate that it can be used // The tiled-FFT implementation (only in V4 onward) does not support
// on the current data and default on a safe implementation if it // strides.
// If the chosen implementation is FFT or tiled-FFT, validate that it can
// be used on the current data and default on a safe implementation if it
// can't. // can't.
// Following code is 2d-specific, but it is fine as ftt is defined only for // Following code is 2d-specific, but it is fine as FFT and tiled-FFT are
// 2d-filters // defined only for 2d-filters
if (chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT && nb_dim == 4) if ((chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && nb_dim == 4)
{ {
// Extract the properties of the convolution descriptor // Extract the properties of the convolution descriptor
int pad_h, pad_w, stride_v, stride_h, upscale_x, upscale_y; int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode; cudnnConvolutionMode_t mode;
err = cudnnGetConvolution2dDescriptor(desc, &pad_h, &pad_w, cudnnDataType_t data_type;
&stride_v, &stride_h, err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
&upscale_x, &upscale_y, upscale, &mode, &data_type);
&mode);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
...@@ -197,36 +199,23 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -197,36 +199,23 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
// Ensure that the selected implementation supports the requested // Ensure that the selected implementation supports the requested
// convolution. Fall back to a safe implementation otherwise. // convolution. Fall back to a safe implementation otherwise.
if (stride_v != 1 || stride_h != 1 || input_h > 1024 || if (chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT)
{
if (stride[0] != 1 || stride[1] != 1 || input_h > 1024 ||
input_w > 1024 || (filter_h == 1 && filter_w == 1)) input_w > 1024 || (filter_h == 1 && filter_w == 1))
{ {
chosen_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; chosen_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
} }
} }
#endif else
{
#if defined(CUDNN_VERSION) && CUDNN_VERSION < 3000 // chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
// In versions before V3, CuDNN did not support kernels larger than the if (stride[0] != 1 || stride[1] != 1)
// inputs in any spatial dimension, even if padding was used such that the {
// padded inputs were larger than the kernels. If the kernels are larger chosen_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
// then the inputs, raise an error message. }
bool shape_mismatch = false;
for (int i=2; i < nb_dim; i++){
shape_mismatch = shape_mismatch || (CudaNdarray_HOST_DIMS(kerns)[i] >
CudaNdarray_HOST_DIMS(input)[i]);
} }
if (shape_mismatch){
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv: the current version of CuDNN does not support "
"kernels larger than the inputs in any spatial dimension, "
"even if the inputs are padded such that the padded inputs "
"are larger than the kernels. Update your installation of "
"CuDNN to V3 or more recent to solve the issue.");
return 1;
} }
#endif
err = cudnnGetConvolutionForwardWorkspaceSize(_handle, err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
APPLY_SPECIFIC(input), APPLY_SPECIFIC(input),
......
...@@ -33,7 +33,6 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output, ...@@ -33,7 +33,6 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
return 1; return 1;
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
{ {
size_t worksize; size_t worksize;
void *workspace; void *workspace;
...@@ -159,21 +158,28 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output, ...@@ -159,21 +158,28 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
chosen_algo = CONV_ALGO; chosen_algo = CONV_ALGO;
} }
// The FFT implementation (only in v3 and onward) does not support strides, // The FFT implementation (only in V3 and onward) does not support strides,
// 1x1 filters or inputs with a spatial dimension larger than 1024. // 1x1 filters or inputs with a spatial dimension larger than 1024.
// If the chosen implementation is FFT, validate that it can be used // The tiled-FFT implementation (only in V4 onward) does not support
// on the current data and default on a safe implementation if it // strides.
// If the chosen implementation is FFT or tiled-FFT, validate that it can
// be used on the current data and default on a safe implementation if it
// can't. // can't.
if (chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT && nb_dim == 4) // Following code is 2d-specific, but it is fine as FFT and tiled-FFT are
// defined only for 2d-filters
if ((chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) && nb_dim == 4)
{ {
// Extract the properties of the convolution descriptor // Extract the properties of the convolution descriptor
int pad_h, pad_w, stride_v, stride_h, upscale_x, upscale_y; int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode; cudnnConvolutionMode_t mode;
err = cudnnGetConvolution2dDescriptor(desc, &pad_h, &pad_w, cudnnDataType_t data_type;
&stride_v, &stride_h, err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
&upscale_x, &upscale_y, upscale, &mode, &data_type);
&mode);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
...@@ -192,12 +198,23 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output, ...@@ -192,12 +198,23 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
// Ensure that the selected implementation supports the requested // Ensure that the selected implementation supports the requested
// convolution. Fall back to a safe implementation otherwise. // convolution. Fall back to a safe implementation otherwise.
if (stride_v != 1 || stride_h != 1 || input_h > 1024 || if (chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)
{
if (stride[0] != 1 || stride[1] != 1 || input_h > 1024 ||
input_w > 1024 || (filter_h == 1 && filter_w == 1)) input_w > 1024 || (filter_h == 1 && filter_w == 1))
{ {
chosen_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0; chosen_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
} }
} }
else
{
// chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
if (stride[0] != 1 || stride[1] != 1)
{
chosen_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
}
}
}
// Infer required workspace size from the chosen implementation // Infer required workspace size from the chosen implementation
err = cudnnGetConvolutionBackwardDataWorkspaceSize(_handle, err = cudnnGetConvolutionBackwardDataWorkspaceSize(_handle,
...@@ -231,16 +248,6 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output, ...@@ -231,16 +248,6 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
(void *)&beta, (void *)&beta,
APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(*input)); APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(*input));
} }
#else
err = cudnnConvolutionBackwardData(
_handle,
(void *)&alpha,
APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
desc,
(void *)&beta,
APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(*input));
#endif
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradI: error doing operation: %s", PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradI: error doing operation: %s",
......
...@@ -33,7 +33,6 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output, ...@@ -33,7 +33,6 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
if (c_set_filterNd(*kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filterNd(*kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1; return 1;
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
{ {
size_t worksize; size_t worksize;
void *workspace; void *workspace;
...@@ -168,12 +167,14 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output, ...@@ -168,12 +167,14 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
{ {
// Extract the properties of the convolution descriptor // Extract the properties of the convolution descriptor
int pad_h, pad_w, stride_v, stride_h, upscale_x, upscale_y; int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode; cudnnConvolutionMode_t mode;
err = cudnnGetConvolution2dDescriptor(desc, &pad_h, &pad_w, cudnnDataType_t data_type;
&stride_v, &stride_h, err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
&upscale_x, &upscale_y, upscale, &mode, &data_type);
&mode);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
...@@ -192,7 +193,7 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output, ...@@ -192,7 +193,7 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
// Ensure that the selected implementation supports the requested // Ensure that the selected implementation supports the requested
// convolution. Fall back to a safe implementation otherwise. // convolution. Fall back to a safe implementation otherwise.
if (stride_v != 1 || stride_h != 1 || input_h > 1024 || if (stride[0] != 1 || stride[1] != 1 || input_h > 1024 ||
input_w > 1024 || (filter_h == 1 && filter_w == 1)) input_w > 1024 || (filter_h == 1 && filter_w == 1))
{ {
chosen_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0; chosen_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
...@@ -232,16 +233,6 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output, ...@@ -232,16 +233,6 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(*kerns)); APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(*kerns));
} }
#else
err = cudnnConvolutionBackwardFilter(
_handle,
(void *)&alpha,
APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(input),
APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
desc,
(void *)&beta,
APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(*kerns));
#endif
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradW: error doing operation: %s", PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradW: error doing operation: %s",
......
...@@ -29,7 +29,7 @@ int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp, ...@@ -29,7 +29,7 @@ int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
return -1; return -1;
} }
err = cudnnSetConvolutionNdDescriptor(*desc, NB_DIMS, pad, strides, upscale, err = cudnnSetConvolutionNdDescriptor_v3(*desc, NB_DIMS, pad, strides,
CONV_MODE); upscale, CONV_MODE, PRECISION);
return 0; return 0;
} }
...@@ -13,99 +13,12 @@ static inline int cudnnGetVersion() { ...@@ -13,99 +13,12 @@ static inline int cudnnGetVersion() {
#include <assert.h> #include <assert.h>
#if CUDNN_VERSION < 3000 // If needed, define element of the V4 interface in terms of elements of
// Here we define the R3 API in terms of functions in the R2 interface // previous versions
// This is only for what we use #if defined(CUDNN_VERSION) && CUDNN_VERSION < 4000
typedef int cudnnConvolutionBwdDataAlgo_t; #define CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING 5
#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING 3
#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 0
#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 1
#define CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT 2
static cudnnStatus_t cudnnGetConvolutionBackwardDataWorkspaceSize(
cudnnHandle_t handle,
const cudnnFilterDescriptor_t filterDesc,
const cudnnTensorDescriptor_t diffDesc,
const cudnnConvolutionDescriptor_t convDesc,
const cudnnTensorDescriptor_t gradDesc,
cudnnConvolutionBwdDataAlgo_t algo,
size_t *sizeInBytes) {
*sizeInBytes = 0;
return CUDNN_STATUS_SUCCESS;
}
static cudnnStatus_t cudnnConvolutionBackwardData_v3(
cudnnHandle_t handle,
const void *alpha,
const cudnnFilterDescriptor_t filterDesc,
const void *filterData,
const cudnnTensorDescriptor_t diffDesc,
const void *diffData,
const cudnnConvolutionDescriptor_t convDesc,
cudnnConvolutionBwdDataAlgo_t algo,
void *workspace,
size_t workspaceSizeInBytes,
const void *beta,
const cudnnTensorDescriptor_t gradDesc,
void *gradData) {
return cudnnConvolutionBackwardData(
handle,
alpha,
filterDesc,
filterData,
diffDesc,
diffData,
convDesc,
beta,
gradDesc,
gradData);
}
typedef int cudnnConvolutionBwdFilterAlgo_t;
#define CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 0
#define CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 1
#define CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT 2
static cudnnStatus_t cudnnGetConvolutionBackwardFilterWorkspaceSize(
cudnnHandle_t handle,
const cudnnTensorDescriptor_t filterDesc,
const cudnnTensorDescriptor_t diffDesc,
const cudnnConvolutionDescriptor_t convDesc,
const cudnnFilterDescriptor_t gradDesc,
cudnnConvolutionBwdDataAlgo_t algo,
size_t *sizeInBytes) {
*sizeInBytes = 0;
return CUDNN_STATUS_SUCCESS;
}
static cudnnStatus_t cudnnConvolutionBackwardFilter_v3(
cudnnHandle_t handle,
const void *alpha,
const cudnnTensorDescriptor_t srcDesc,
const void *srcData,
const cudnnTensorDescriptor_t diffDesc,
const void *diffData,
const cudnnConvolutionDescriptor_t convDesc,
cudnnConvolutionBwdFilterAlgo_t algo,
void *workspace,
size_t workspaceSizeInBytes,
const void *beta,
const cudnnFilterDescriptor_t gradDesc,
void *gradData) {
return cudnnConvolutionBackwardFilter(
handle,
alpha,
srcDesc,
srcData,
diffDesc,
diffData,
convDesc,
beta,
gradDesc,
gradData);
}
#endif #endif
......
...@@ -136,15 +136,26 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -136,15 +136,26 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM)) algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM))
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
#if CUDNN_VERSION > 3000 // The FFT implementation does not support strides, 1x1 filters or inputs
if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) { // with a spatial dimension larger than 1024. The tiled-FFT implementation
// does not support strides.
// If the chosen implementation is FFT or tiled-FFT, validate that it can
// be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && PyGpuArray_NDIM(input) == 4) {
// Extract the properties of the convolution descriptor
int nd; int nd;
int pad[2]; int pad[2];
int stride[2]; int stride[2];
int upscale[2]; int upscale[2];
cudnnConvolutionMode_t mode; cudnnConvolutionMode_t mode;
err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, cudnnDataType_t data_type;
upscale, &mode); err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
upscale, &mode, &data_type);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error getting convolution properties: %s", "error getting convolution properties: %s",
...@@ -153,30 +164,24 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -153,30 +164,24 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1; return 1;
} }
if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT)
{
if (stride[0] != 1 || stride[1] != 1 || if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 || PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
(PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) { (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; {
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
} }
} }
#endif else
{
#if CUDNN_VERSION < 3000 // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
/* cuDNN before v3 does not support kernels larger than input even if (stride[0] != 1 || stride[1] != 1)
* if appropriate padding is selected. */ {
for (unsigned int i = 2; i < PyGpuArray_NDIM(input); i++) { algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
if (PyGpuArray_DIM(kerns, i) > PyGpuArray_DIM(input, i)) { }
PyErr_SetString(PyExc_RuntimeError, "the current version "
"of CuDNN does not support kernels larger than the "
"inputs in any spatial dimension, even if the inputs "
"are padded such that the padded inputs are larger "
"than the kernels. Update your installation of CuDNN "
"to V3 or more recent to solve the issue.");
cuda_exit(c->ctx);
return 1;
} }
} }
#endif
{ {
size_t worksize; size_t worksize;
......
...@@ -128,15 +128,26 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -128,15 +128,26 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
#endif #endif
#if CUDNN_VERSION > 3000 // The FFT implementation does not support strides, 1x1 filters or inputs
if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) { // with a spatial dimension larger than 1024. The tiled-FFT implementation
// does not support strides.
// If the chosen implementation is FFT or tiled-FFT, validate that it can
// be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
if ((algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) && PyGpuArray_NDIM(kerns) == 4) {
// Extract the properties of the convolution descriptor
int nd; int nd;
int pad[2]; int pad[2];
int stride[2]; int stride[2];
int upscale[2]; int upscale[2];
cudnnConvolutionMode_t mode; cudnnConvolutionMode_t mode;
err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, cudnnDataType_t data_type;
upscale, &mode); err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
upscale, &mode, &data_type);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error getting convolution properties: %s", "error getting convolution properties: %s",
...@@ -145,13 +156,24 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -145,13 +156,24 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 1; return 1;
} }
if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)
{
if (stride[0] != 1 || stride[1] != 1 || if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(*input, 2) > 1024 || PyGpuArray_DIM(*input, 3) > 1024 || PyGpuArray_DIM(*input, 2) > 1024 || PyGpuArray_DIM(*input, 3) > 1024 ||
(PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) { (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
{
algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0; algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
} }
} }
#endif else
{
// algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
if (stride[0] != 1 || stride[1] != 1)
{
algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
}
}
}
size_t worksize; size_t worksize;
gpudata *workspace; gpudata *workspace;
......
...@@ -130,15 +130,24 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -130,15 +130,24 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
#endif #endif
#if CUDNN_VERSION > 3000 // The FFT implementation does not support strides, 1x1 filters or inputs
if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT) { // with a spatial dimension larger than 1024.
// If the chosen implementation is FFT, validate that it can
// be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT &&
PyGpuArray_NDIM(input) == 4) {
// Extract the properties of the convolution descriptor
int nd; int nd;
int pad[2]; int pad[2];
int stride[2]; int stride[2];
int upscale[2]; int upscale[2];
cudnnConvolutionMode_t mode; cudnnConvolutionMode_t mode;
err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, cudnnDataType_t data_type;
upscale, &mode); err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride,
upscale, &mode, &data_type);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error getting convolution properties: %s", "error getting convolution properties: %s",
...@@ -153,7 +162,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -153,7 +162,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0; algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
} }
} }
#endif
size_t worksize; size_t worksize;
gpudata *workspace; gpudata *workspace;
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论