提交 8cb9d50e authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #3245 from carriepl/v3

CuDNN v3
...@@ -36,12 +36,61 @@ To get an error if Theano can not use cuDNN, use this Theano flag: ...@@ -36,12 +36,61 @@ To get an error if Theano can not use cuDNN, use this Theano flag:
.. note:: .. note::
CuDNN v2 is now released, if you used any v2 release candidate, we CuDNN v3 has now been released. CuDNN v2 remains supported but CuDNN v3 is
strongly suggest that you update it to the final version. From now faster and offers many more options. We recommend that everybody update to
on, we only support the final release. v3.
.. note::
Starting in CuDNN v3, multiple convolution implementations are offered and
it is possible to use heuristics to automatically choose a convolution
implementation well suited to the parameters of the convolution.
The Theano flag ``dnn.conv.algo_fwd`` allows to specify the CuDNN
convolution implementation that Theano should use for forward convolutions.
Possible values include :
* ``small`` (default) : use a convolution implementation with small memory
usage
* ``none`` : use a slower implementation with minimal memory usage
* ``large`` : use a faster implementation with large memory usage
* ``fft`` : use the Fast Fourier Transform implementation of convolution
(very high memory usage)
* ``guess_once`` : the first time a convolution is executed, the
implementation to use is chosen according to CuDNN's heuristics and reused
for every subsequent execution of the convolution.
* ``guess_on_shape_change`` : like ``guess_once`` but a new convolution
implementation selected every time the shapes of the inputs and kernels
don't match the shapes from the last execution.
* ``time_once`` : the first time a convolution is executed, every convolution
implementation offered by CuDNN is executed and timed. The fastest is
reused for every subsequent execution of the convolution.
* ``time_on_shape_change`` : like ``time_once`` but a new convolution
implementation selected every time the shapes of the inputs and kernels
don't match the shapes from the last execution.
The Theano flag ``dnn.conv.algo_bwd`` allows to specify the CuDNN
convolution implementation that Theano should use for gradient convolutions.
Possible values include :
* ``none`` (default) : use the default non-deterministic convolution
implementation
* ``deterministic`` : use a slower but deterministic implementation
* ``fft`` : use the Fast Fourier Transform implementation of convolution
(very high memory usage)
* ``guess_once`` : the first time a convolution is executed, the
implementation to use is chosen according to CuDNN's heuristics and reused
for every subsequent execution of the convolution.
* ``guess_on_shape_change`` : like ``guess_once`` but a new convolution
implementation selected every time the shapes of the inputs and kernels
don't match the shapes from the last execution.
* ``time_once`` : the first time a convolution is executed, every convolution
implementation offered by CuDNN is executed and timed. The fastest is
reused for every subsequent execution of the convolution.
* ``time_on_shape_change`` : like ``time_once`` but a new convolution
implementation selected every time the shapes of the inputs and kernels
don't match the shapes from the last execution.
CuDNN v2 is much faster than v1. We recommend that everybody
updates to v2.
.. note:: .. note::
...@@ -51,13 +100,16 @@ To get an error if Theano can not use cuDNN, use this Theano flag: ...@@ -51,13 +100,16 @@ To get an error if Theano can not use cuDNN, use this Theano flag:
.. note:: .. note::
The documentation of CUDNN R1 and R2 tells that, for the following The documentation of CUDNN tells that, for the 2 following operations, the
2 operations, the reproducibility is not guaranteed: reproducibility is not guaranteed with the default implementation:
`cudnnConvolutionBackwardFilter` and `cudnnConvolutionBackwardData`. `cudnnConvolutionBackwardFilter` and `cudnnConvolutionBackwardData`.
Those correspond to the gradient wrt the weights and the gradient wrt the Those correspond to the gradient wrt the weights and the gradient wrt the
input of the convolution. They are also used sometimes in the forward input of the convolution. They are also used sometimes in the forward
pass, when they give a speed up. pass, when they give a speed up.
The Theano flag ``dnn.conv.algo_bwd`` can be use to force the use of a
slower but deterministic convolution implementation.
.. note:: .. note::
There is a problem we do not understand yet when cudnn paths are There is a problem we do not understand yet when cudnn paths are
...@@ -79,7 +131,8 @@ Convolution Ops ...@@ -79,7 +131,8 @@ Convolution Ops
=============== ===============
.. automodule:: theano.sandbox.cuda.dnn .. automodule:: theano.sandbox.cuda.dnn
:members: GpuDnnConvDesc, GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI :members: GpuDnnConvDesc, GpuDnnConv, GpuDnnConv3d, GpuDnnConvGradW,
GpuDnnConv3dGradW, GpuDnnConvGradI, GpuDnnConv3dGradI
Pooling Ops Pooling Ops
=========== ===========
......
...@@ -41,6 +41,20 @@ static inline const char *cudnnGetErrorString(cudnnStatus_t err) { ...@@ -41,6 +41,20 @@ static inline const char *cudnnGetErrorString(cudnnStatus_t err) {
typedef cudnnTensor4dDescriptor_t cudnnTensorDescriptor_t; typedef cudnnTensor4dDescriptor_t cudnnTensorDescriptor_t;
/* Compatibility shim for cuDNN releases that predate the Nd tensor API:
 * forwards a 4d request to cudnnSetTensor4dDescriptorEx.  Only nbDims == 4
 * is representable through the old 4d API, so any other rank is rejected
 * with CUDNN_STATUS_NOT_SUPPORTED. */
static inline cudnnStatus_t
cudnnSetTensorNdDescriptor(
        cudnnTensorDescriptor_t tensorDesc,
        cudnnDataType_t dataType,
        int nbDims,
        const int dimA[],
        const int strideA[]) {
  /* Fix: the parameter is named nbDims; the original tested the
   * undeclared identifier `ndDims`, which cannot compile. */
  if (nbDims != 4) return CUDNN_STATUS_NOT_SUPPORTED;
  return cudnnSetTensor4dDescriptorEx(
          tensorDesc, dataType,
          dimA[0], dimA[1], dimA[2], dimA[3],
          strideA[0], strideA[1], strideA[2], strideA[3]);
}
static inline cudnnStatus_t static inline cudnnStatus_t
cudnnGetConvolution2dForwardOutputDim( cudnnGetConvolution2dForwardOutputDim(
const cudnnConvolutionDescriptor_t convDesc, const cudnnConvolutionDescriptor_t convDesc,
...@@ -183,6 +197,85 @@ cudnnConvolutionBackwardData_v2( ...@@ -183,6 +197,85 @@ cudnnConvolutionBackwardData_v2(
#define cudnnConvolutionBackwardData cudnnConvolutionBackwardData_v2 #define cudnnConvolutionBackwardData cudnnConvolutionBackwardData_v2
/* Compatibility shim for cuDNN releases that predate the Nd pooling API:
 * forwards a 2d request to cudnnSetPoolingDescriptor.  The old API supports
 * neither higher-rank pooling windows nor padding, so both cases are
 * rejected with CUDNN_STATUS_NOT_SUPPORTED. */
static inline cudnnStatus_t
cudnnSetPoolingNdDescriptor(
        cudnnPoolingDescriptor_t poolingDesc,
        const cudnnPoolingMode_t mode,
        int nbDims,
        const int windowDimA[],
        const int paddingA[],
        const int strideA[]) {  /* fix: was `const in strideA[]` (typo) */
  if (nbDims != 2) return CUDNN_STATUS_NOT_SUPPORTED;
  if (paddingA[0] != 0 || paddingA[1] != 0) return CUDNN_STATUS_NOT_SUPPORTED;
  return cudnnSetPoolingDescriptor(poolingDesc, mode,
                                   windowDimA[0], windowDimA[1],
                                   strideA[0], strideA[1]);
}
/* Compatibility shim for cuDNN releases that predate the Nd pooling API:
 * answers an Nd query from the 2d cudnnGetPoolingDescriptor.  The old API
 * has no padding, so zero padding is always reported.  Fails with
 * CUDNN_STATUS_NOT_SUPPORTED when the caller cannot accept 2 dimensions,
 * and propagates any error from the underlying query. */
static inline cudnnStatus_t
cudnnGetPoolingNdDescriptor(
        const cudnnPoolingDescriptor_t poolingDesc,
        const int nbDimsRequested,
        cudnnPoolingMode_t *mode,
        int *nbDims,
        int windowA[],
        int paddingA[],
        int strideA[]) {
  int win0, win1, str0, str1;
  cudnnStatus_t err;

  /* Fix: the parameter is named nbDimsRequested; the original tested the
   * undeclared identifier `ndDimsRequested`, which cannot compile. */
  if (nbDimsRequested < 2) return CUDNN_STATUS_NOT_SUPPORTED;

  err = cudnnGetPoolingDescriptor(poolingDesc, mode, &win0, &win1,
                                  &str0, &str1);
  if (err != CUDNN_STATUS_SUCCESS) return err;

  *nbDims = 2;
  paddingA[0] = 0;  /* the 2d API has no padding: always report zero */
  paddingA[1] = 0;
  windowA[0] = win0;
  windowA[1] = win1;
  strideA[0] = str0;
  strideA[1] = str1;
  return CUDNN_STATUS_SUCCESS;
}
/* Compatibility wrapper adding the v2 alpha/beta scaling parameters on top
 * of the old cudnnPoolingForward.  The old entry point performs no scaling,
 * so only the identity case (alpha == 1, beta == 0) can be emulated; any
 * other scaling request is rejected with CUDNN_STATUS_NOT_SUPPORTED. */
static inline cudnnStatus_t
cudnnPoolingForward_v2(
        cudnnHandle_t handle,
        const cudnnPoolingDescriptor_t poolingDesc,
        const void *alpha,
        const cudnnTensorDescriptor_t srcDesc,
        const void *srcData,
        const void *beta,
        const cudnnTensorDescriptor_t destDesc,
        void *destData) {
  const float scale_in = *(float *)alpha;
  const float scale_out = *(float *)beta;
  if (scale_in != 1.0 || scale_out != 0.0)
    return CUDNN_STATUS_NOT_SUPPORTED;
  return cudnnPoolingForward(handle, poolingDesc,
                             srcDesc, srcData,
                             destDesc, destData);
}
#define cudnnPoolingForward cudnnPoolingForward_v2
/* Compatibility wrapper adding the v2 alpha/beta scaling parameters on top
 * of the old cudnnPoolingBackward.  As with the forward wrapper, only the
 * identity scaling (alpha == 1, beta == 0) can be forwarded; anything else
 * is rejected with CUDNN_STATUS_NOT_SUPPORTED. */
static inline cudnnStatus_t
cudnnPoolingBackward_v2(
        cudnnHandle_t handle,
        const cudnnPoolingDescriptor_t poolingDesc,
        const void *alpha,
        const cudnnTensorDescriptor_t srcDesc,
        const void *srcData,
        const cudnnTensorDescriptor_t srcDiffDesc,
        const void *srcDiffData,
        const cudnnTensorDescriptor_t destDesc,
        const void *destData,
        const void *beta,
        const cudnnTensorDescriptor_t destDiffDesc,
        void *destDiffData) {
  const float scale_in = *(float *)alpha;
  const float scale_out = *(float *)beta;
  if (scale_in != 1.0 || scale_out != 0.0)
    return CUDNN_STATUS_NOT_SUPPORTED;
  return cudnnPoolingBackward(handle, poolingDesc,
                              srcDesc, srcData,
                              srcDiffDesc, srcDiffData,
                              destDesc, destData,
                              destDiffDesc, destDiffData);
}
#define cudnnPoolingBackward cudnnPoolingBackward_v2
//Needed for R2 rc2 //Needed for R2 rc2
# define CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING CUDNN_POOLING_AVERAGE # define CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING CUDNN_POOLING_AVERAGE
#else #else
......
差异被折叠。
#section support_code #section support_code
static cudnnHandle_t _handle = NULL; static cudnnHandle_t _handle = NULL;
static int static int
c_set_tensor4d(CudaNdarray *var, cudnnTensorDescriptor_t desc) { c_set_tensorNd(CudaNdarray *var, cudnnTensorDescriptor_t desc) {
cudnnStatus_t err = cudnnSetTensor4dDescriptorEx(
desc, CUDNN_DATA_FLOAT,
CudaNdarray_HOST_DIMS(var)[0], int dim = CudaNdarray_NDIM(var);
CudaNdarray_HOST_DIMS(var)[1], int strides[dim];
CudaNdarray_HOST_DIMS(var)[2], int default_str = 1;
CudaNdarray_HOST_DIMS(var)[3],
CudaNdarray_HOST_STRIDES(var)[0]?CudaNdarray_HOST_STRIDES(var)[0]:CudaNdarray_HOST_DIMS(var)[2]*CudaNdarray_HOST_DIMS(var)[3]*CudaNdarray_HOST_DIMS(var)[1], for (int i = dim-1; i >= 0; i--)
CudaNdarray_HOST_STRIDES(var)[1]?CudaNdarray_HOST_STRIDES(var)[1]:CudaNdarray_HOST_DIMS(var)[2]*CudaNdarray_HOST_DIMS(var)[3], {
CudaNdarray_HOST_STRIDES(var)[2]?CudaNdarray_HOST_STRIDES(var)[2]:CudaNdarray_HOST_DIMS(var)[3], if (CudaNdarray_HOST_STRIDES(var)[i])
CudaNdarray_HOST_STRIDES(var)[3]?CudaNdarray_HOST_STRIDES(var)[3]:1 strides[i] = CudaNdarray_HOST_STRIDES(var)[i];
); else
strides[i] = default_str;
default_str *= CudaNdarray_HOST_DIMS(var)[i];
}
cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, CUDNN_DATA_FLOAT, dim,
CudaNdarray_HOST_DIMS(var),
strides);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"Could not set tensor4d descriptor: %s" "Could not set tensorNd descriptor: %s"
"shapes=%d %d %d %d strides=%d %d %d %d", "dim=%d",
cudnnGetErrorString(err), cudnnGetErrorString(err), dim);
CudaNdarray_HOST_DIMS(var)[0],
CudaNdarray_HOST_DIMS(var)[1],
CudaNdarray_HOST_DIMS(var)[2],
CudaNdarray_HOST_DIMS(var)[3],
CudaNdarray_HOST_STRIDES(var)[0]?CudaNdarray_HOST_STRIDES(var)[0]:CudaNdarray_HOST_DIMS(var)[2]*CudaNdarray_HOST_DIMS(var)[3]*CudaNdarray_HOST_DIMS(var)[1],
CudaNdarray_HOST_STRIDES(var)[1]?CudaNdarray_HOST_STRIDES(var)[1]:CudaNdarray_HOST_DIMS(var)[2]*CudaNdarray_HOST_DIMS(var)[3],
CudaNdarray_HOST_STRIDES(var)[2]?CudaNdarray_HOST_STRIDES(var)[2]:CudaNdarray_HOST_DIMS(var)[3],
CudaNdarray_HOST_STRIDES(var)[3]?CudaNdarray_HOST_STRIDES(var)[3]:1
);
return -1; return -1;
} }
return 0; return 0;
} }
static int static int
c_set_filter(CudaNdarray *var, cudnnFilterDescriptor_t desc) { c_set_filterNd(CudaNdarray *var, cudnnFilterDescriptor_t desc) {
if (!CudaNdarray_is_c_contiguous(var)) { if (!CudaNdarray_is_c_contiguous(var)) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"Only contiguous filters (kernels) are supported."); "Only contiguous filters (kernels) are supported.");
return -1; return -1;
} }
cudnnStatus_t err = cudnnSetFilter4dDescriptor( int dim = CudaNdarray_NDIM(var);
desc, CUDNN_DATA_FLOAT, cudnnStatus_t err = cudnnSetFilterNdDescriptor(desc, CUDNN_DATA_FLOAT, dim,
CudaNdarray_HOST_DIMS(var)[0], CudaNdarray_HOST_DIMS(var));
CudaNdarray_HOST_DIMS(var)[1],
CudaNdarray_HOST_DIMS(var)[2],
CudaNdarray_HOST_DIMS(var)[3]
);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"Could not set filter descriptor: %s." "Could not set filter descriptor: %s."
" dims= %d %d %d %d", " dims= %d",
cudnnGetErrorString(err), cudnnGetErrorString(err), dim);
CudaNdarray_HOST_DIMS(var)[0],
CudaNdarray_HOST_DIMS(var)[1],
CudaNdarray_HOST_DIMS(var)[2],
CudaNdarray_HOST_DIMS(var)[3]);
return -1; return -1;
} }
return 0; return 0;
......
...@@ -3,6 +3,24 @@ cudnnTensorDescriptor_t APPLY_SPECIFIC(input); ...@@ -3,6 +3,24 @@ cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output); cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns); cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);
/* Keep track, from one execution to another, of the dimension of the data
and the algorithms, if any, that were selected according to these dimensions
and according to the amount of memory available at that time.
Note : Implementation selection for backward convolution only exists starting
at V3.
*/
int APPLY_SPECIFIC(previous_input_shape)[5];
int APPLY_SPECIFIC(previous_kerns_shape)[5];
int APPLY_SPECIFIC(previous_output_shape)[5];
bool APPLY_SPECIFIC(previous_algo_set);
cudnnConvolutionFwdAlgo_t APPLY_SPECIFIC(previous_algo);
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
cudnnConvolutionBwdFilterAlgo_t APPLY_SPECIFIC(previous_bwd_f_algo);
cudnnConvolutionBwdDataAlgo_t APPLY_SPECIFIC(previous_bwd_d_algo);
#endif
#section init_code_struct #section init_code_struct
cudnnStatus_t APPLY_SPECIFIC(err); cudnnStatus_t APPLY_SPECIFIC(err);
...@@ -10,21 +28,38 @@ APPLY_SPECIFIC(input) = NULL; ...@@ -10,21 +28,38 @@ APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL; APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(kerns) = NULL; APPLY_SPECIFIC(kerns) = NULL;
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) { if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor " PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err))); "(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL; FAIL;
} }
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) { if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor " PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err))); "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL; FAIL;
} }
if ((APPLY_SPECIFIC(err) = cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns))) != CUDNN_STATUS_SUCCESS) { if ((APPLY_SPECIFIC(err) = cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate filter descriptor: %s", PyErr_Format(PyExc_MemoryError, "could not allocate filter descriptor: %s",
cudnnGetErrorString(APPLY_SPECIFIC(err))); cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL; FAIL;
} }
for (int i = 0; i < 5; i++)
{
APPLY_SPECIFIC(previous_input_shape)[i] = 0;
APPLY_SPECIFIC(previous_kerns_shape)[i] = 0;
APPLY_SPECIFIC(previous_output_shape)[i] = 0;
}
APPLY_SPECIFIC(previous_algo_set) = false;
// Select default implementations for the case where the convolution
// implementations should be selected based on the size of the data.
APPLY_SPECIFIC(previous_algo) = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
APPLY_SPECIFIC(previous_bwd_f_algo) = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
APPLY_SPECIFIC(previous_bwd_d_algo) = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
#endif
#section cleanup_code_struct #section cleanup_code_struct
if (APPLY_SPECIFIC(input) != NULL) if (APPLY_SPECIFIC(input) != NULL)
......
...@@ -12,25 +12,225 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output, ...@@ -12,25 +12,225 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
return 1; return 1;
} }
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1)
return 1; return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filterNd(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1; return 1;
int nb_dim = CudaNdarray_NDIM(output);
#ifdef CONV_INPLACE #ifdef CONV_INPLACE
Py_XDECREF(*input); Py_XDECREF(*input);
*input = im; *input = im;
Py_INCREF(*input); Py_INCREF(*input);
#else #else
if (CudaNdarray_prep_output(input, 4, CudaNdarray_HOST_DIMS(im)) != 0) if (CudaNdarray_prep_output(input, nb_dim, CudaNdarray_HOST_DIMS(im)) != 0)
return 1; return 1;
if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*input, im)) if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*input, im))
return 1; return 1;
#endif #endif
if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1)
return 1; return 1;
#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 3000
{
size_t worksize;
void *workspace;
cudnnConvolutionBwdDataAlgo_t chosen_algo;
if (CHOOSE_ALGO)
{
// A new convolution implementation should be selected, based either on
// timing or heuristics, if in one of the two following cases :
// - The implementation should only be chosen during the first execution
// of an apply node and this is the first execution of the apply node.
// - The implementation should be chosen as often as necessary and the
// shapes of the inputs differ from the last time an implementation
// was chosen.
bool reuse_previous_algo;
if (CHOOSE_ALGO_ONCE)
{
      // Only choose a new implementation if none has been chosen before.
reuse_previous_algo = APPLY_SPECIFIC(previous_algo_set);
}
else
{
// Reuse the previous implementation if the the kernels and the outputs
// have the same shapes as they had when the previous implementation
// was selected
bool same_shapes = true;
for (int i = 0; (i < nb_dim) && same_shapes; i++)
{
same_shapes &= (CudaNdarray_HOST_DIMS(kerns)[i] ==
APPLY_SPECIFIC(previous_kerns_shape)[i]);
same_shapes &= (CudaNdarray_HOST_DIMS(output)[i] ==
APPLY_SPECIFIC(previous_output_shape)[i]);
}
reuse_previous_algo = same_shapes;
}
// If the previously choosen implementation can't be reused, select a
// new one based on the shapes of the current inputs
if (!reuse_previous_algo)
{
// Obtain a convolution algorithm appropriate for the kernel and output
// shapes. Either by choosing one according to heuristics or by making
// CuDNN time every implementation and choose the best one.
if (CHOOSE_ALGO_TIME)
{
// Time the different implementations to choose the best one
int requestedCount = 1;
int count;
cudnnConvolutionBwdDataAlgoPerf_t choosen_algo_perf;
err = cudnnFindConvolutionBackwardDataAlgorithm(_handle,
APPLY_SPECIFIC(kerns),
APPLY_SPECIFIC(output),
desc,
APPLY_SPECIFIC(input),
requestedCount,
&count,
&choosen_algo_perf);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConvGradI: error selecting convolution algo: "
"%s", cudnnGetErrorString(err));
return 1;
}
chosen_algo = choosen_algo_perf.algo;
}
else
{
// Choose the convolution implementation using heuristics based on the
// shapes of the inputs and the amount of memory available.
// Get the amount of available memory
size_t free = 0, total = 0;
cudaError_t err2 = cudaMemGetInfo(&free, &total);
if (err2 != cudaSuccess){
cudaGetLastError();
fprintf(stderr,
"Error when trying to find the memory information"
" on the GPU: %s\n", cudaGetErrorString(err2));
return 1;
}
// Use heuristics to choose the implementation
err = cudnnGetConvolutionBackwardDataAlgorithm(_handle,
APPLY_SPECIFIC(kerns),
APPLY_SPECIFIC(output),
desc,
APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
free,
&chosen_algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConvGradI: error selecting convolution algo: %s",
cudnnGetErrorString(err));
return 1;
}
}
// Store the shapes of the kernels and output as well as the chosen
// algorithm for future use.
APPLY_SPECIFIC(previous_bwd_d_algo) = chosen_algo;
for (int i = 0; i < nb_dim; i++)
{
APPLY_SPECIFIC(previous_kerns_shape)[i] =
CudaNdarray_HOST_DIMS(kerns)[i];
APPLY_SPECIFIC(previous_output_shape)[i] =
CudaNdarray_HOST_DIMS(output)[i];
}
}
else
{
      // Reuse the previously chosen convolution implementation
chosen_algo = APPLY_SPECIFIC(previous_bwd_d_algo);
}
}
else
{
chosen_algo = CONV_ALGO;
}
// The FFT implementation (only in v3 and onward) does not support strides,
// 1x1 filters or inputs with a spatial dimension larger than 1024.
// If the chosen implementation is FFT, validate that it can be used
// on the current data and default on a safe implementation if it
// can't.
if (chosen_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT && nb_dim == 4)
{
// Extract the properties of the convolution descriptor
int pad_h, pad_w, stride_v, stride_h, upscale_x, upscale_y;
cudnnConvolutionMode_t mode;
err = cudnnGetConvolution2dDescriptor(desc, &pad_h, &pad_w,
&stride_v, &stride_h,
&upscale_x, &upscale_y,
&mode);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConvGradI: error getting convolution properties: %s",
cudnnGetErrorString(err));
return 1;
}
// Extract the spatial size of the filters
int filter_h = CudaNdarray_HOST_DIMS(kerns)[3];
int filter_w = CudaNdarray_HOST_DIMS(kerns)[4];
// Extract the spatial size of the input
int input_h = CudaNdarray_HOST_DIMS(*input)[3];
int input_w = CudaNdarray_HOST_DIMS(*input)[4];
// Ensure that the selected implementation supports the requested
// convolution. Fall back to a safe implementation otherwise.
if (stride_v != 1 || stride_h != 1 || input_h > 1024 ||
input_w > 1024 || (filter_h == 1 && filter_w == 1))
{
chosen_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
}
}
// Infer required workspace size from the chosen implementation
err = cudnnGetConvolutionBackwardDataWorkspaceSize(_handle,
APPLY_SPECIFIC(kerns),
APPLY_SPECIFIC(output),
desc,
APPLY_SPECIFIC(input),
chosen_algo,
&worksize);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConvGradI: error getting worksize: %s",
cudnnGetErrorString(err));
return 1;
}
// Allocate workspace for the convolution
workspace = get_work_mem(worksize);
if (workspace == NULL && worksize != 0)
return 1;
// Perform the convolution
err = cudnnConvolutionBackwardData_v3(
_handle,
(void *)&alpha,
APPLY_SPECIFIC(kerns), CudaNdarray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(output),
desc,
chosen_algo,
workspace, worksize,
(void *)&beta,
APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(*input));
}
#else
err = cudnnConvolutionBackwardData( err = cudnnConvolutionBackwardData(
_handle, _handle,
(void *)&alpha, (void *)&alpha,
...@@ -39,6 +239,8 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output, ...@@ -39,6 +239,8 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
desc, desc,
(void *)&beta, (void *)&beta,
APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(*input)); APPLY_SPECIFIC(input), CudaNdarray_DEV_DATA(*input));
#endif
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradI: error doing operation: %s", PyErr_Format(PyExc_RuntimeError, "GpuDnnConvGradI: error doing operation: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
......
...@@ -452,7 +452,8 @@ def test_default_conv(): ...@@ -452,7 +452,8 @@ def test_default_conv():
for a in f.maker.fgraph.apply_nodes]) for a in f.maker.fgraph.apply_nodes])
def _test_full(cls, mode=None, version=[-1], extra_shapes=[]): def _test_full(cls, mode=None, version=[-1], extra_shapes=[],
test_bigger_kernels=True):
seed_rng() seed_rng()
shapes = get_basic_shapes() shapes = get_basic_shapes()
shapes += get_shapes2() shapes += get_shapes2()
...@@ -481,14 +482,18 @@ def _test_full(cls, mode=None, version=[-1], extra_shapes=[]): ...@@ -481,14 +482,18 @@ def _test_full(cls, mode=None, version=[-1], extra_shapes=[]):
, ((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1)) # a big one , ((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1)) # a big one
, ((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)) # MNIST LeNET layer 1 , ((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)) # MNIST LeNET layer 1
, ((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1)) # layer 1 backprop to weights , ((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1)) # layer 1 backprop to weights
]
# other test if test_bigger_kernels:
, ((3, 1, 1, 1), (2, 1, 5, 3), (1, 1), (1, 1), (1, 1)) # kernel bigger then image # Shapes where the kernel is larger than the image in some dimension
shapes += [
((3, 1, 1, 1), (2, 1, 5, 3), (1, 1), (1, 1), (1, 1))
, ((3, 2, 1, 1), (4, 2, 1, 1), (1, 1), (1, 1), (1, 1)) , ((3, 2, 1, 1), (4, 2, 1, 1), (1, 1), (1, 1), (1, 1))
, ((3, 2, 4, 4), (4, 2, 2, 6), (1, 1), (1, 1), (1, 1)) , ((3, 2, 4, 4), (4, 2, 2, 6), (1, 1), (1, 1), (1, 1))
, ((3, 2, 4, 4), (4, 2, 8, 6), (1, 1), (1, 1), (1, 1)) # kernel bigger then image , ((3, 2, 4, 4), (4, 2, 8, 6), (1, 1), (1, 1), (1, 1))
, ((4, 2, 10, 10), (3, 2, 2, 12), (1, 1), (1, 1), (1, 1)) , ((4, 2, 10, 10), (3, 2, 2, 12), (1, 1), (1, 1), (1, 1))
] ]
shapes += [ shapes += [
# ((60,1,28,28),(20,1,5,5), (1, 1), (1, 1), (1, 1))#test_lenet_28 1 layers # ((60,1,28,28),(20,1,5,5), (1, 1), (1, 1), (1, 1))#test_lenet_28 1 layers
# , ((60,20,12,12),(30,20,5,5), (1, 1), (1, 1), (1, 1))#test_lenet_28 2 layers # , ((60,20,12,12),(30,20,5,5), (1, 1), (1, 1), (1, 1))#test_lenet_28 2 layers
...@@ -516,9 +521,16 @@ def _test_full(cls, mode=None, version=[-1], extra_shapes=[]): ...@@ -516,9 +521,16 @@ def _test_full(cls, mode=None, version=[-1], extra_shapes=[]):
def test_full(): def test_full():
for t in _test_full(None,
mode=theano_mode, # If using CuDNN version before v3, only run the tests where the
version=[-1]): # kernels are not larger than the input in any spatial dimension.
if cuda.dnn.dnn_available() and cuda.dnn.version() < (3000, 3000):
test_bigger_kernels = False
else:
test_bigger_kernels = True
for t in _test_full(None, mode=theano_mode, version=[-1],
test_bigger_kernels=test_bigger_kernels):
yield t yield t
...@@ -531,7 +543,16 @@ def test_gemm_full(): ...@@ -531,7 +543,16 @@ def test_gemm_full():
def test_dnn_full(): def test_dnn_full():
if not cuda.dnn.dnn_available(): if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg) raise SkipTest(cuda.dnn.dnn_available.msg)
for t in _test_full(DnnBase, mode=theano_mode.including("cudnn")):
# If using CuDNN version before v3, only run the tests where the
# kernels are not larger than the input in any spatial dimension.
if cuda.dnn.version() < (3000, 3000):
test_bigger_kernels = False
else:
test_bigger_kernels = True
for t in _test_full(DnnBase, mode=theano_mode.including("cudnn"),
test_bigger_kernels=test_bigger_kernels):
yield t yield t
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论