Commit b998dc61 authored by Frédéric Bastien, committed by GitHub

Merge pull request #6302 from borisfom/tensor_op

Tensor op, cache
#section support_code_apply #section support_code_apply
/** Record the group count on a convolution descriptor.
    On cuDNN >= 7 this forwards to cudnnSetConvolutionGroupCount; on older
    cuDNN versions grouping is emulated by the callers, so this is a no-op.
    Returns 0 on success, -1 (with a Python RuntimeError set) on failure. */
static int c_set_groups_for_conv(cudnnConvolutionDescriptor_t desc, int groups) {
#if CUDNN_MAJOR >= 7
  const cudnnStatus_t status = cudnnSetConvolutionGroupCount(desc, groups);
  if (status != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "error setting groups for convolution : %s",
                 cudnnGetErrorString(status));
    return -1;
  }
#endif
  return 0;
}
int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp, int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
cudnnConvolutionDescriptor_t *desc, cudnnConvolutionDescriptor_t *desc,
PARAMS_TYPE* params) { PARAMS_TYPE* params) {
...@@ -43,5 +56,7 @@ int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp, ...@@ -43,5 +56,7 @@ int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
"descriptor: %s", cudnnGetErrorString(err)); "descriptor: %s", cudnnGetErrorString(err));
return -1; return -1;
} }
if (c_set_groups_for_conv(*desc, params->num_groups) == -1)
return -1;
return 0; return 0;
} }
...@@ -11,6 +11,14 @@ static inline int cudnnGetVersion() { ...@@ -11,6 +11,14 @@ static inline int cudnnGetVersion() {
} }
#endif #endif
#if CUDNN_MAJOR < 7
/* cuDNN < 7 does not define cudnnMathType_t; provide a stand-in with the
   same constant values so code storing a math type still compiles.
   NOTE(review): relies on the file being compiled as C++, where the enum
   tag is usable as a type name — confirm against the build setup. */
enum cudnnMathType_t { CUDNN_DEFAULT_MATH=0, CUDNN_TENSOR_OP_MATH = 1 };
#endif
/* a common struct for all 3 CUDNN enums */
/* One cached algorithm choice: the algorithm enum value (stored as int so
   fwd/bwd-data/bwd-filter enums can share the struct), the workspace size
   it was measured with, and the math type selected for it. */
struct AlgoRec {
  int algo;              /* cudnnConvolution*Algo_t value, cast to int */
  size_t wsSize;         /* workspace bytes required by `algo` */
  cudnnMathType_t mathType;  /* tensor-op vs default math */
};
#endif #endif
...@@ -3,6 +3,43 @@ cudnnTensorDescriptor_t APPLY_SPECIFIC(input); ...@@ -3,6 +3,43 @@ cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output); cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns); cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);
/** Validate `groups` against the group count stored in the descriptor and
    return the number of per-group convolution calls the caller must make:
    1 on cuDNN >= 7 (cuDNN applies grouping internally), or `groups` on older
    versions (the caller loops over groups manually).
    Returns -1 with a Python error set on failure. */
static int c_get_groups_for_conv(cudnnConvolutionDescriptor_t desc, int groups) {
#if CUDNN_MAJOR >= 7
  int desc_groups;
  if (groups > 1) {
    cudnnStatus_t err = cudnnGetConvolutionGroupCount(desc, &desc_groups);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError,
                   "error getting groups for convolution : %s",
                   cudnnGetErrorString(err));
      return -1;
    }
    if (groups != desc_groups) {
      // A group-count mismatch is a bad parameter value, not an allocation
      // failure: raise ValueError (was incorrectly MemoryError).
      PyErr_SetString(PyExc_ValueError,
                      "groups specified different from convolution descriptor");
      return -1;
    }
  }
  return 1;
#else
  return groups;
#endif
}
/** Select the math mode (default vs tensor-op) on a convolution descriptor.
    This is a no-op before cuDNN 7, which has no math-type selection.
    Returns 0 on success, -1 (with a Python RuntimeError set) on failure. */
static int c_set_math_type_for_conv(cudnnConvolutionDescriptor_t desc, cudnnMathType_t mathtype) {
#if CUDNN_MAJOR >= 7
  // CUDNN7: need to set math type
  const cudnnStatus_t status = cudnnSetConvolutionMathType(desc, mathtype);
  if (status == CUDNN_STATUS_SUCCESS)
    return 0;
  PyErr_Format(PyExc_RuntimeError,
               "error setting math type for convolution : %s",
               cudnnGetErrorString(status));
  return -1;
#else
  return 0;
#endif
}
#section init_code_struct #section init_code_struct
cudnnStatus_t APPLY_SPECIFIC(err); cudnnStatus_t APPLY_SPECIFIC(err);
...@@ -20,7 +57,7 @@ if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) ...@@ -20,7 +57,7 @@ if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output)))
FAIL; FAIL;
} }
if ((APPLY_SPECIFIC(err) = cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns))) != CUDNN_STATUS_SUCCESS) { if ((APPLY_SPECIFIC(err) = cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate filter descriptor: %s", PyErr_Format(PyExc_MemoryError, "could not allocate filter descriptor: %s",
cudnnGetErrorString(APPLY_SPECIFIC(err))); cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL; FAIL;
} }
...@@ -33,3 +70,220 @@ if (APPLY_SPECIFIC(output) != NULL) ...@@ -33,3 +70,220 @@ if (APPLY_SPECIFIC(output) != NULL)
cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output)); cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
if (APPLY_SPECIFIC(kerns) != NULL) if (APPLY_SPECIFIC(kerns) != NULL)
cudnnDestroyFilterDescriptor(APPLY_SPECIFIC(kerns)); cudnnDestroyFilterDescriptor(APPLY_SPECIFIC(kerns));
#section support_code
#include <sstream>
#include <string>
#if __cplusplus < 201103L
#include <tr1/unordered_map>
typedef std::tr1::unordered_map<std::string, AlgoRec> AlgoCache;
#else
#include <unordered_map>
typedef std::unordered_map<std::string, AlgoRec> AlgoCache;
#endif
#include "pthread.h"
#line 87 "dnn_conv_base.c"
/* Statically allocated mutexes must be initialized with
   PTHREAD_MUTEX_INITIALIZER; POSIX does not guarantee that a
   zero-filled pthread_mutex_t is a valid, unlocked mutex. */
pthread_mutex_t algoMutex = PTHREAD_MUTEX_INITIALIZER;
AlgoCache algoCache;  /* conv-shape hash -> chosen algorithm, guarded by algoMutex */
/** Set a Python RuntimeError ("CUDNN Error: <msg>: <cudnn detail>") when
    `err` indicates failure. Always passes `err` back so calls can be
    chained; callers detect failure via PyErr_Occurred() or the status. */
static cudnnStatus_t checkCudnnStatus(cudnnStatus_t err, const char* msg)
{
  if (err == CUDNN_STATUS_SUCCESS)
    return err;
  PyErr_Format(PyExc_RuntimeError, "CUDNN Error: %s: %s",
               msg, cudnnGetErrorString(err));
  return err;
}
/** Return the size in bytes of the largest free GPU memory block, falling
    back to a 4Mb guess when the backend reports nothing.
    On a backend error a Python RuntimeError is set (and the guess is still
    returned); callers are expected to check PyErr_Occurred(). */
static size_t
c_get_largest_free_block_size(PyGpuContextObject *c)
{
  size_t maxfree = 0;
  if (gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &maxfree) != GA_NO_ERROR) {
    PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
                 "memory information on the GPU");
  }
  // Guess 4Mb if the info is not available
  if (maxfree == 0)
    maxfree = 4 * 1024 * 1024;
  return maxfree;
}
/** Check if convolution output tensor has expected dimensions
    depending on given inputs and number of groups.
    return 0 if everything is ok, non-0 on error (Python error set).
**/
static int dnn_check_convolution_output(cudnnConvolutionDescriptor_t convDesc,
                                        cudnnTensorDescriptor_t inputDesc,
                                        cudnnFilterDescriptor_t filterDesc,
                                        size_t tensorNdim,
                                        PyGpuArrayObject* output,
                                        int groups) {
  int expected_output_dims[5] = {0};
  cudnnStatus_t err = cudnnGetConvolutionNdForwardOutputDim(convDesc, inputDesc, filterDesc,
                                                            tensorNdim, expected_output_dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
                 cudnnGetErrorString(err));
    return 1;
  }
  // cuDNN reports the per-group output channel count, hence the `/ groups`
  // on the channel dimension (index 1) only.
  if (tensorNdim == 4) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] / groups != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
      // %zu: PyGpuArray_DIMS() entries are size_t; %ld was only correct on
      // LP64 platforms (PyErr_Format supports %zu).
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %dx%dx%dx%d"
                   " but received %zux%zux%zux%zu",
                   expected_output_dims[0], expected_output_dims[1] * groups,
                   expected_output_dims[2], expected_output_dims[3],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3]);
      return 1;
    }
  } else if (tensorNdim == 5) {
    if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
        (PyGpuArray_DIMS(output)[1] / groups != expected_output_dims[1]) ||
        (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
        (PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
        (PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
      PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %dx%dx%dx%dx%d"
                   " but received %zux%zux%zux%zux%zu",
                   expected_output_dims[0], expected_output_dims[1] * groups,
                   expected_output_dims[2], expected_output_dims[3],
                   expected_output_dims[4],
                   PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
                   PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3],
                   PyGpuArray_DIMS(output)[4]);
      return 1;
    }
  }
  return 0;
}
/** Render `res[0..size-1]` as a comma-separated list ("1,2,3");
    returns the empty string when size <= 0. */
static std::string shape(int* res, int size)
{
  std::ostringstream out;
  for (int i = 0; i < size; ++i) {
    if (i)
      out << ',';
    out << res[i];
  }
  return out.str();
}
/** Serialize a tensor descriptor as "dims,strides" (two comma-separated
    int lists) for use in the algorithm-cache hash key.
    Returns "" after checkCudnnStatus set a Python error on failure. */
static std::string shape(cudnnTensorDescriptor_t t)
{
  // cuDNN can handle up to CUDNN_DIM_MAX dimensions.
  int res[CUDNN_DIM_MAX];
  int stride[CUDNN_DIM_MAX];
  int nbDims;
  cudnnDataType_t type;  // queried but not part of the key here
  checkCudnnStatus(cudnnGetTensorNdDescriptor(t, CUDNN_DIM_MAX, &type, &nbDims, res, stride),
                   "error getting tensor description");
  // Empty string signals the caller to abort; the Python error is already set.
  if (PyErr_Occurred()) return "";
  return shape(res, nbDims) + "," + shape(stride, nbDims);
};
/** Serialize a filter descriptor's dimensions as a comma-separated list and
    report its data type through `type` (used by the caller for alignment).
    Returns "" after a Python error on failure; NOTE(review): `*type` is then
    left unset, so callers must not read it in that case — verify callers. */
static std::string shape(cudnnFilterDescriptor_t t, cudnnDataType_t* type)
{
  cudnnTensorFormat_t format;
  int res[CUDNN_DIM_MAX];
  int outDims;
  checkCudnnStatus(cudnnGetFilterNdDescriptor(t, CUDNN_DIM_MAX, type, &format, &outDims, res),
                   "error getting filter description");
  if (PyErr_Occurred()) return "";
  return shape(res, outDims);
};
/** Serialize a convolution descriptor (mode, padding, strides, dilation)
    into the textual form used in the algorithm-cache hash key.
    Returns "" after a Python error on failure. */
static std::string shape(cudnnConvolutionDescriptor_t convDesc)
{
  int nbDims;
  cudnnConvolutionMode_t convMode;
  cudnnDataType_t accType;
  int pad[5];
  int stride[5];
  int dilation[5];
  checkCudnnStatus(
      cudnnGetConvolutionNdDescriptor(convDesc, 5,
                                      &nbDims,
                                      pad,
                                      stride,
                                      dilation,
                                      &convMode,
                                      &accType),
      "error getting convolution description");
  if (PyErr_Occurred())
    return "";
  std::ostringstream out;
  out << "-mode " << (convMode == CUDNN_CONVOLUTION ? "conv" : "cross")
      << " -pad " << shape(pad, nbDims)
      << " -subsample " << shape(stride, nbDims)
      << " -dilation " << shape(dilation, nbDims);
  return out.str();
}
/** True when all three device pointers meet the alignment that lets cuDNN
    pick its fastest paths (128 bytes for half, 256 otherwise).
    The cache needs distinct entries for aligned and unaligned cases, so the
    result feeds into the hash key rather than being an error. */
static bool all_aligned(cudnnDataType_t type, void* in, void* out, void* filter)
{
  const size_t alignMask = (type == CUDNN_DATA_HALF) ? 0x7F : 0xFF;
  const size_t combined = (size_t)in | (size_t)out | (size_t)filter;
  return (combined & alignMask) == 0;
}
/** Build the hash key describing this convolution (groups, input shape and
    strides, filter shape, conv parameters, pointer alignment) used to look
    up / store cached algorithm choices.
    Returns "" with a Python error set on failure. */
static std::string dnn_conv_shape(cudnnTensorDescriptor_t inputDesc, PyGpuArrayObject* input,
                                  cudnnFilterDescriptor_t filterDesc, PyGpuArrayObject* filter,
                                  cudnnConvolutionDescriptor_t convDesc,
                                  PyGpuArrayObject* output, int groups)
{
  cudnnDataType_t dType;
  std::ostringstream s;
  // (removed unused local `expected_output_dims`; the output-shape check
  // below owns that computation)
  if (dnn_check_convolution_output(convDesc, inputDesc, filterDesc, PyGpuArray_NDIM(filter), output, groups) != 0)
    return "";
  std::string shapeInput = shape(inputDesc);
  std::string shapeFilter = shape(filterDesc, &dType);
  std::string shapeConvDesc = shape(convDesc);
  if (shapeInput.empty() || shapeFilter.empty() || shapeConvDesc.empty())
    return "";
  s << "-g " << groups << " -dim " << shapeInput << " -filt " <<
    shapeFilter << " " << shapeConvDesc;
  // there have to be entries for both aligned and not.
  if (!all_aligned(dType, PyGpuArray_DEV_DATA(input), PyGpuArray_DEV_DATA(output), PyGpuArray_DEV_DATA(filter)))
  {
    s << " [unaligned]";
  }
  return s.str();
}
/** Insert or overwrite the cached algorithm record for this convolution
    shape. The cache is global to the process, so access is serialized
    with `algoMutex`. */
static void dnn_conv_update_cache(const std::string& hash, const AlgoRec& rec)
{
  pthread_mutex_lock(&algoMutex);
  algoCache[hash] = rec;
  pthread_mutex_unlock(&algoMutex);
}
/** Look up a cached algorithm record by its shape hash; NULL if absent.
    The returned pointer aliases the map entry. Entries are never erased and
    unordered_map insertions do not invalidate element references, so the
    pointer stays valid after the mutex is released. */
static const AlgoRec* dnn_conv_check_cache(const std::string& hash)
{
  const AlgoRec* found = 0;
  pthread_mutex_lock(&algoMutex);
  AlgoCache::const_iterator it = algoCache.find(hash);
  if (it != algoCache.end())
    found = &it->second;
  pthread_mutex_unlock(&algoMutex);
  return found;
}
#section init_code_struct #section init_code_struct
prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH;
reuse_algo = 0; reuse_algo = 0;
prev_algo = PARAMS->conv_algo; hash_prefix = std::string("FWD|GPU#");
memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
#section support_code_struct #section support_code_struct
#line 9 "dnn_fwd.c"
int reuse_algo;
AlgoRec prev_algo;
std::string hash_prefix;
#ifdef DEBUG
char algorithm_name[128];
#endif
/** Check given algorithm against inputs and convolution descriptor,
    change algorithm inplace to a fallback algorithm if checkings fail.
    Return 0 on success, non-0 on error (Python error set).
    (Rebuilt: the previous text had stale diff residue — leftover
    `reuse_algo`/`prev_algo`/`prev_*_dims` declarations interleaved with
    the real code — which made it unparseable.) **/
int dnn_conv_fwd_fallback(cudnnConvolutionFwdAlgo_t* _algo,
                          const PyGpuArrayObject* input,
                          const PyGpuArrayObject* kerns,
                          cudnnConvolutionDescriptor_t desc) {
  cudnnConvolutionFwdAlgo_t algo = *_algo;

  /* Only these algos are supported for 3d conv with cuDNN >= V5.1. */
  if (PyGpuArray_NDIM(input) == 5 &&
      !(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM ||
        algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
        algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING))
  {
#ifdef DEBUG
    if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
      return 1;
    fprintf(stderr, "(%s unsupported for 3D: fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n", algorithm_name);
#endif
    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
  }

  // Algo `small` does not work for a batch size > 2^16, with cuDNN >= V5.1.
  // Issue should be resolved for cuDNN > V6.0.
  // NB: In cuDNN V7, issue is resolved for 2D convolutions only.
  if ((cudnnGetVersion() < 6100 || PyGpuArray_NDIM(input) == 5) &&
      algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM &&
      PyGpuArray_DIM(input, 0) > 65536)
  {
#ifdef DEBUG
    fprintf(stderr, "(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM "
            "will fail with batch size > 2^16, fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n");
#endif
    algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
  }

  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024. The tiled-FFT implementation
  // does not support strides.
  // If the chosen implementation is FFT or tiled-FFT, validate that it can
  // be used on the current data and default to a safe implementation if it
  // can't.
  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
  // defined only for 2d filters
  /* NB:
     TODO: These checkings seems outdated for FFT algorithms with cuDNN >= 5.1.
     New conditions apply and may depend on number of dimensions (2D or 3D)
     e.g. for FFT_TILING.
     TODO: More globally, how to handle CUDNN_STATUS_NOT_SUPPORTED with unsupported algorithms?
  */
  if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
       algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && PyGpuArray_NDIM(input) == 4) {
    // Extract the properties of the convolution descriptor
    int nd;
    int pad[2];
    int stride[2];
    int dilation[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    cudnnStatus_t err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, dilation, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      return 1;
    }
    if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
      if (stride[0] != 1 || stride[1] != 1 ||
          PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
          (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
      {
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
#ifdef DEBUG
        fprintf(stderr, "(replacing fwd algo fft with none)\n");
#endif
      }
    } else {
      // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
      if (stride[0] != 1 || stride[1] != 1) {
        algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
#ifdef DEBUG
        fprintf(stderr, "(replacing fwd algo fft_tiling with none)\n");
#endif
      }
    }
  }
  *_algo = algo;
  return 0;
}
int int
APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
...@@ -24,6 +117,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -24,6 +117,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
void *beta_p; void *beta_p;
float af = alpha, bf = beta; float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
bool use_cached = 0;
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) { if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
...@@ -73,65 +167,76 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -73,65 +167,76 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 0; return 0;
} }
if (c_set_tensor_for_conv(input, APPLY_SPECIFIC(input), params->num_groups) == -1) int groups = c_get_groups_for_conv(desc, params->num_groups);
if (groups == -1)
return 1;
if (c_set_tensor_for_conv(input, APPLY_SPECIFIC(input), groups) == -1)
return 1; return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns), params->num_groups) == -1) if (c_set_filter(kerns, APPLY_SPECIFIC(kerns), groups) == -1)
return 1; return 1;
if (c_set_tensor_for_conv(*output, APPLY_SPECIFIC(output), params->num_groups) == -1) if (c_set_tensor_for_conv(*output, APPLY_SPECIFIC(output), groups) == -1)
return 1; return 1;
size_t input_offset = PyGpuArray_STRIDE(input, 0) / params->num_groups; size_t input_offset = PyGpuArray_STRIDE(input, 0) / groups;
size_t kern_offset = PyGpuArray_STRIDE(kerns, 0) * PyGpuArray_DIM(kerns, 0) / params->num_groups; size_t kern_offset = PyGpuArray_STRIDE(kerns, 0) * PyGpuArray_DIM(kerns, 0) / groups;
size_t output_offset = PyGpuArray_STRIDE(*output, 0) / params->num_groups; size_t output_offset = PyGpuArray_STRIDE(*output, 0) / groups;
cudnnConvolutionFwdAlgo_t algo = params->conv_algo; cudnnConvolutionFwdAlgo_t algo = params->conv_algo;
#ifdef DEBUG size_t worksize = 0;
char algorithm_name[128]; cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH;
#endif
std::string hashkey;
cuda_enter(c->ctx); cuda_enter(c->ctx);
size_t maxfree = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) return 1;
if (params->choose_algo) { if (params->choose_algo) {
if (!params->choose_once) {
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(input, i) == prev_img_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
}
}
if (!reuse_algo) { if (!reuse_algo) {
size_t free; char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free); // check out cache
if (err2 != GA_NO_ERROR) { hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), input, APPLY_SPECIFIC(kerns), kerns, desc, *output, groups);
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the " if (hashkey.empty()) {
"memory information on the GPU");
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
hashkey = hash_prefix + pci_id + (params->choose_time ? " -t " : " ") + hashkey;
const AlgoRec* cached = dnn_conv_check_cache(hashkey);
if (cached) {
prev_algo = *cached;
use_cached = 1;
}
}
// Guess 4Mb if the info is not available if (reuse_algo || use_cached) {
if (free == 0) free = 4 * 1024 * 1024; algo = (cudnnConvolutionFwdAlgo_t)prev_algo.algo;
worksize = prev_algo.wsSize;
mathtype = prev_algo.mathType;
} else {
if (params->choose_time) { if (params->choose_time) {
int count; int count;
cudnnConvolutionFwdAlgoPerf_t choice; cudnnConvolutionFwdAlgoPerf_t choice;
gpudata *tmpmem; gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL); tmpmem = gpudata_alloc(c->ctx, maxfree, NULL, 0, NULL);
if (tmpmem == NULL) { if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory"); PyErr_SetString(PyExc_MemoryError, "Could not allocate GPU memory for FindEx");
cuda_exit(c->ctx);
return -1; return -1;
} }
// set the 'tensor math ok' flag
c_set_math_type_for_conv(desc, CUDNN_TENSOR_OP_MATH);
// We don't sync the buffer as we don't care about the values. // We don't sync the buffer as we don't care about the values.
err = cudnnFindConvolutionForwardAlgorithmEx( err = cudnnFindConvolutionForwardAlgorithmEx(
params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output), desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output),
1, &count, &choice, *(void **)tmpmem, 1, &count, &choice, *(void **)tmpmem,
free); maxfree);
gpudata_release(tmpmem); gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -141,138 +246,56 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -141,138 +246,56 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
algo = choice.algo;
#ifdef DEBUG #ifdef DEBUG
if (count == 0) { if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv fwd algorithm found"); PyErr_SetString(PyExc_RuntimeError, "No best-timed conv fwd algorithm found");
cuda_exit(c->ctx);
return 1; return 1;
} else if (choice.status != CUDNN_STATUS_SUCCESS) { } else if (choice.status != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error getting best-timed FWD algo: %s", "error getting best-timed FWD algo: %s",
cudnnGetErrorString(choice.status)); cudnnGetErrorString(choice.status));
cuda_exit(c->ctx);
return 1; return 1;
} // Else, count is necessarly 1 for current implementation. } // Else, count is necessarly 1 for current implementation.
#endif #endif
algo = choice.algo;
prev_algo.algo = (int)algo;
prev_algo.wsSize = worksize = choice.memory;
#if CUDNN_MAJOR >= 7
prev_algo.mathType = mathtype = choice.mathType;
#endif
} else { } else {
err = cudnnGetConvolutionForwardAlgorithm( err = cudnnGetConvolutionForwardAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo); CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s", "error selecting convolution algo: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
} prev_algo.algo = algo;
prev_algo = algo; // no tensor_op returned from Get()
} else { prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH;
algo = prev_algo;
}
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
return 1;
// NB: This is printed only when algorithm is chosen at runtime.
if (reuse_algo)
fprintf(stderr, "(reused %s)\n", algorithm_name);
else
fprintf(stderr, "(using %s)\n", algorithm_name);
#endif
if (params->choose_once) {
reuse_algo = 1;
} else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
} }
} }
} }
/* Only these algos are supported for 3d conv with cuDNN >= V5.1. */ if (c_set_math_type_for_conv(desc, mathtype) == -1 ||
if (PyGpuArray_NDIM(input) == 5 && dnn_conv_fwd_fallback(&algo, input, kerns, desc) != 0) {
!(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM || cuda_exit(c->ctx);
algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM || return 1;
algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING))
{
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
return 1;
fprintf(stderr, "(%s unsupported for 3D: fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n", algorithm_name);
#endif
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
// Algo `small` does not work for a batch size > 2^16, with cuDNN >= V5.1.
// Issue should be resolved for cuDNN > V6.0.
// NB: In cuDNN V7, issue is resolved for 2D convolutionss only.
if ((cudnnGetVersion() < 6100 || PyGpuArray_NDIM(input) == 5) &&
algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM &&
PyGpuArray_DIM(input, 0) > 65536)
{
#ifdef DEBUG
fprintf(stderr, "(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM "
"will fail with batch size > 2^16, fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n");
#endif
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation
// does not support strides.
// If the chosen implementation is FFT or tiled-FFT, validate that it can
// be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
/* NB:
TODO: These checkings seems outdated for FFT algorithms with cuDNN >= 5.1.
New conditions apply and may depend on number of dimensions (2D or 3D)
e.g. for FFT_TILING.
TODO: More globally, how to handle CUDNN_STATUS_NOT_SUPPORTED with unsupported algorithms?
*/
if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT ||
algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && PyGpuArray_NDIM(input) == 4) {
// Extract the properties of the convolution descriptor
int nd;
int pad[2];
int stride[2];
int dilation[2];
cudnnConvolutionMode_t mode;
cudnnDataType_t data_type;
err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
dilation, &mode, &data_type);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting convolution properties: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
(PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
{
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
} else {
// algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING
if (stride[0] != 1 || stride[1] != 1) {
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
}
} }
// if FindEx was used (choose_time), workspace size is set.
if (!(reuse_algo || use_cached || params->choose_time))
{ {
size_t worksize;
gpudata *workspace;
err = cudnnGetConvolutionForwardWorkspaceSize(params->handle, err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
APPLY_SPECIFIC(input), APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(kerns),
...@@ -280,19 +303,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -280,19 +303,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
APPLY_SPECIFIC(output), APPLY_SPECIFIC(output),
algo, algo,
&worksize); &worksize);
if (err == CUDNN_STATUS_NOT_SUPPORTED) { if (err == CUDNN_STATUS_NOT_SUPPORTED) {
// Fallback to none algo if not supported // Fallback to none algo if not supported
#ifdef DEBUG #ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) {
cuda_exit(c->ctx);
return 1; return 1;
fprintf(stderr, "(%s error getting worksize: " }
"fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n", algorithm_name); fprintf(stderr, "(error getting worksize for %s: failing back to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n",
algorithm_name);
#endif #endif
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
err = cudnnGetConvolutionForwardWorkspaceSize(params->handle, err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
APPLY_SPECIFIC(input), APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(kerns),
...@@ -303,13 +324,47 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -303,13 +324,47 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
} }
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
"error getting worksize: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
}
if (params->choose_algo && (!params->choose_once || !reuse_algo)) {
// algo may have changed due to fallback, we must update it.
prev_algo.algo = algo;
// save worksize for next time/cache
prev_algo.wsSize = worksize;
// Add to the cache if we choose on shape change, or first time if we choose once.
dnn_conv_update_cache(hashkey, prev_algo);
}
#ifdef DEBUG
if (params->choose_algo) {
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) {
cuda_exit(c->ctx);
return 1;
}
fprintf(stderr, "(using %s%s %s%s%s, ws:%ld, hash:%s)\n",
algorithm_name,
mathtype == CUDNN_TENSOR_OP_MATH ? "[T]" : "",
params->choose_time ? "(timed)": "" ,
reuse_algo ? "(reused)" : "",
use_cached ? "(cache)": "",
worksize,
hashkey.c_str()
);
}
#endif
if (params->choose_once) {
reuse_algo = 1;
}
{
gpudata *workspace = 0;
/* /*
* This is less than ideal since we need to free it after (which * This is less than ideal since we need to free it after (which
* introduces a synchronization point. But we don't have a module * introduces a synchronization point. But we don't have a module
...@@ -318,8 +373,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -318,8 +373,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
if (worksize != 0) { if (worksize != 0) {
workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL); workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) { if (workspace == NULL) {
PyErr_SetString(PyExc_RuntimeError, PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory");
"Could not allocate working memory");
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
...@@ -329,16 +383,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -329,16 +383,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
for ( int g = 0; g < params->num_groups; g++) { for ( int g = 0; g < groups; g++) {
err = cudnnConvolutionForward( err = cudnnConvolutionForward(
params->handle, params->handle,
alpha_p, alpha_p,
APPLY_SPECIFIC(input), ((char *)PyGpuArray_DEV_DATA(input)) + input_offset * g, APPLY_SPECIFIC(input), ((char *)PyGpuArray_DEV_DATA(input)) + input_offset * g,
APPLY_SPECIFIC(kerns), ((char *)PyGpuArray_DEV_DATA(kerns)) + kern_offset * g, APPLY_SPECIFIC(kerns), ((char *)PyGpuArray_DEV_DATA(kerns)) + kern_offset * g,
desc, algo, desc, algo,
worksize == 0 ? NULL : *(void **)workspace, worksize, worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p, beta_p,
APPLY_SPECIFIC(output), ((char *)PyGpuArray_DEV_DATA(*output)) + output_offset * g); APPLY_SPECIFIC(output), ((char *)PyGpuArray_DEV_DATA(*output)) + output_offset * g);
} }
if (worksize != 0) if (worksize != 0)
...@@ -348,12 +402,15 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -348,12 +402,15 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
} }
cuda_exit(c->ctx); cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error doing operation: %s", PyErr_Format(PyExc_RuntimeError, "error doing cuDNN conv FWD operation: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
} }
return 0; return 0;
} }
#section init_code_struct #section init_code_struct
prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH;
reuse_algo = 0; reuse_algo = 0;
prev_algo = PARAMS->conv_algo; hash_prefix = std::string("GI|GPU#");
memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims));
#section support_code_struct #section support_code_struct
#line 9 "dnn_gi.c"
int reuse_algo;
AlgoRec prev_algo;
std::string hash_prefix;
#ifdef DEBUG
char algorithm_name[128];
#endif
/** Check given algorithm against inputs and convolution descriptor,
    change algorithm inplace to a fallback algorithm if checkings fail.
    Return 0 on success, non-0 on error (Python error set).
    (Rebuilt: the previous text had stale diff residue — leftover
    `reuse_algo`/`prev_algo`/`prev_*_dims` declarations interleaved with
    the real code — which made it unparseable.) **/
int dnn_conv_gi_fallback(cudnnConvolutionBwdDataAlgo_t* _algo,
                         const PyGpuArrayObject* input,
                         const PyGpuArrayObject* kerns,
                         cudnnConvolutionDescriptor_t desc) {
  cudnnConvolutionBwdDataAlgo_t algo = *_algo;

  // The FFT implementation does not support strides, 1x1 filters or inputs
  // with a spatial dimension larger than 1024. The tiled-FFT implementation
  // does not support strides.
  // If the chosen implementation is FFT or tiled-FFT, validate that it can
  // be used on the current data and default to a safe implementation if it
  // can't.
  // The following code is 2d-specific but it is fine as FFT and tiled-FFT are
  // defined only for 2d filters
  if ((algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
       algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) && PyGpuArray_NDIM(kerns) == 4) {
    // Extract the properties of the convolution descriptor
    int nd;
    int pad[2];
    int stride[2];
    int upscale[2];
    cudnnConvolutionMode_t mode;
    cudnnDataType_t data_type;
    cudnnStatus_t err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, upscale, &mode, &data_type);
    if (err != CUDNN_STATUS_SUCCESS) {
      PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s",
                   cudnnGetErrorString(err));
      return 1;
    }
    if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) {
      if (stride[0] != 1 || stride[1] != 1 ||
          PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
          (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1))
      {
        algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
#ifdef DEBUG
        fprintf(stderr, "(replacing gradinput algo fft with none)\n");
#endif
      }
    } else {
      // algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
      if (stride[0] != 1 || stride[1] != 1) {
        algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
#ifdef DEBUG
        fprintf(stderr, "(replacing gradinput algo fft_tiling with none)\n");
#endif
      }
    }
  }
  *_algo = algo;
  return 0;
}
int int
APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
...@@ -23,6 +82,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -23,6 +82,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
void *beta_p; void *beta_p;
float af = alpha, bf = beta; float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
bool use_cached = 0;
if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) { if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) {
PyErr_SetString(PyExc_ValueError, "images and kernel must have the same " PyErr_SetString(PyExc_ValueError, "images and kernel must have the same "
...@@ -72,233 +132,200 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -72,233 +132,200 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 0; return 0;
} }
if (c_set_tensor_for_conv(output, APPLY_SPECIFIC(output), params->num_groups) == -1) int groups = c_get_groups_for_conv(desc, params->num_groups);
if (groups == -1)
return 1; return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns), params->num_groups) == -1) if (c_set_tensor_for_conv(output, APPLY_SPECIFIC(output), groups) == -1)
return 1; return 1;
if (c_set_tensor_for_conv(*input, APPLY_SPECIFIC(input), params->num_groups) == -1) if (c_set_filter(kerns, APPLY_SPECIFIC(kerns), groups) == -1)
return 1; return 1;
size_t input_offset = PyGpuArray_STRIDE(*input, 0) / params->num_groups; if (c_set_tensor_for_conv(*input, APPLY_SPECIFIC(input), groups) == -1)
size_t kern_offset = PyGpuArray_STRIDE(kerns, 0) * PyGpuArray_DIM(kerns, 0) / params->num_groups;
size_t output_offset = PyGpuArray_STRIDE(output, 0) / params->num_groups;
cudnnConvolutionBwdDataAlgo_t algo = params->conv_algo;
#ifdef DEBUG
char algorithm_name[128];
#endif
cuda_enter(c->ctx);
int expected_output_dims[5] = {0};
err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
PyGpuArray_NDIM(im), expected_output_dims);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1; return 1;
}
if (PyGpuArray_NDIM(im) == 4) {
if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
(PyGpuArray_DIMS(output)[1] / params->num_groups != expected_output_dims[1]) ||
(PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
(PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
" but received gradient with shape %ldx%ldx%ldx%ld",
expected_output_dims[0], expected_output_dims[1],
expected_output_dims[2], expected_output_dims[3],
PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3]);
cuda_exit(c->ctx);
return 1;
}
} else if (PyGpuArray_NDIM(im) == 5) {
if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
(PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
(PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
(PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
(PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
" but received gradient with shape %ldx%ldx%ldx%ldx%ld",
expected_output_dims[0], expected_output_dims[1],
expected_output_dims[2], expected_output_dims[3],
expected_output_dims[4],
PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3],
PyGpuArray_DIMS(output)[4]);
cuda_exit(c->ctx);
return 1;
}
}
if (params->choose_algo) { if (0 != dnn_check_convolution_output(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
if (!params->choose_once) { PyGpuArray_NDIM(kerns), output, groups))
reuse_algo = 1; return 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]);
}
}
if (!reuse_algo) { size_t input_offset = PyGpuArray_STRIDE(*input, 0) / groups;
size_t free; size_t kern_offset = PyGpuArray_STRIDE(kerns, 0) * PyGpuArray_DIM(kerns, 0) / groups;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free); size_t output_offset = PyGpuArray_STRIDE(output, 0) / groups;
if (err2 != GA_NO_ERROR) { cudnnConvolutionBwdDataAlgo_t algo = params->conv_algo;
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the " size_t worksize = 0;
"memory information on the GPU"); cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH;
cuda_exit(c->ctx);
return 1;
}
// Guess 4Mb if the info is not available std::string hashkey;
if (free == 0) free = 4 * 1024 * 1024;
if (params->choose_time) { size_t maxfree = c_get_largest_free_block_size(c);
int count; if (PyErr_Occurred()) return 1;
cudnnConvolutionBwdDataAlgoPerf_t choice;
gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL); cuda_enter(c->ctx);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1;
}
err = cudnnFindConvolutionBackwardDataAlgorithmEx( if (params->choose_algo) {
params->handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input),
1, &count, &choice, *(void **)tmpmem, free);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) { if (!reuse_algo) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", char pci_id[16];
cudnnGetErrorString(err)); gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
// check out cache
hashkey=dnn_conv_shape(APPLY_SPECIFIC(input), *input, APPLY_SPECIFIC(kerns), kerns, desc, output, groups);
if (hashkey.empty()) {
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
hashkey = hash_prefix + pci_id + (params->choose_time ? " -t " : " ") + hashkey;
const AlgoRec* cached = dnn_conv_check_cache(hashkey);
if (cached) {
prev_algo = *cached;
use_cached = 1;
}
}
algo = choice.algo; if (reuse_algo || use_cached) {
algo = (cudnnConvolutionBwdDataAlgo_t)prev_algo.algo;
#ifdef DEBUG worksize = prev_algo.wsSize;
if (count == 0) { mathtype = prev_algo.mathType;
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradinput algorithm found"); } else {
if (params->choose_time) {
int count;
cudnnConvolutionBwdDataAlgoPerf_t choice;
gpudata *tmpmem;
// set the 'tensor math ok' flag
c_set_math_type_for_conv(desc, CUDNN_TENSOR_OP_MATH);
tmpmem = gpudata_alloc(c->ctx, maxfree, NULL, 0, NULL);
if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
cuda_exit(c->ctx);
return -1;
}
err = cudnnFindConvolutionBackwardDataAlgorithmEx(
params->handle, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input),
1, &count, &choice, *(void **)tmpmem, maxfree);
gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1; return 1;
} else if (choice.status != CUDNN_STATUS_SUCCESS) { }
PyErr_Format(PyExc_RuntimeError,
"error getting best-timed gradinput algo: %s", #ifdef DEBUG
cudnnGetErrorString(choice.status)); if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradinput algorithm found");
cuda_exit(c->ctx);
return 1;
} else if (choice.status != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting best-timed gradinput algo: %s",
cudnnGetErrorString(choice.status));
cuda_exit(c->ctx);
return 1;
} // Else, count is necessarly 1 for current implementation.
#endif
algo = choice.algo;
prev_algo.algo = (int)algo;
prev_algo.wsSize = worksize = choice.memory;
#if CUDNN_MAJOR >= 7
prev_algo.mathType = mathtype = choice.mathType;
#endif
} else {
err = cudnnGetConvolutionBackwardDataAlgorithm(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1; return 1;
} // Else, count is necessarly 1 for current implementation. }
#endif prev_algo.algo = algo;
// no tensor_op returned from Get()
} else { prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH;
err = cudnnGetConvolutionBackwardDataAlgorithm(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(input),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
} }
} }
prev_algo = algo; }
} else {
algo = prev_algo; if (c_set_math_type_for_conv(desc, mathtype) == -1 ||
} dnn_conv_gi_fallback(&algo, *input, kerns, desc) != 0) {
cuda_exit(c->ctx);
return 1;
}
#ifdef DEBUG // if FindEx was used (choose_time), workspace size is set.
char algorithm_name[128]; if (!(reuse_algo || use_cached || params->choose_time))
if (0 != theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name)) {
err = cudnnGetConvolutionBackwardDataWorkspaceSize(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), algo, &worksize);
if (err == CUDNN_STATUS_NOT_SUPPORTED) {
// Fallback to none algo if not supported
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name)) {
cuda_exit(c->ctx);
return 1; return 1;
// NB: This is printed only when algorithm is chosen at runtime.
if (reuse_algo)
fprintf(stderr, "(reused %s)\n", algorithm_name);
else
fprintf(stderr, "(using %s)\n", algorithm_name);
#endif
if (params->choose_once) {
reuse_algo = 1;
} else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
} }
fprintf(stderr, "(error getting worksize for %s: failing back to CUDNN_CONVOLUTION_BWD_DATA_ALGO_0)\n",
algorithm_name);
#endif
algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
err = cudnnGetConvolutionBackwardDataWorkspaceSize(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), algo, &worksize);
} }
}
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation
// does not support strides.
// If the chosen implementation is FFT or tiled-FFT, validate that it can
// be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
if ((algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) && PyGpuArray_NDIM(kerns) == 4) {
// Extract the properties of the convolution descriptor
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
cudnnDataType_t data_type;
err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
upscale, &mode, &data_type);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
"error getting convolution properties: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
} // !(reuse_algo || use_cached || params->choose_time)
if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) if (params->choose_algo && (!params->choose_once || !reuse_algo)) {
{ // algo may have changed due to fallback, we must update it.
if (stride[0] != 1 || stride[1] != 1 || prev_algo.algo = algo;
PyGpuArray_DIM(*input, 2) > 1024 || PyGpuArray_DIM(*input, 3) > 1024 || // save worksize for next time/cache
(PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) prev_algo.wsSize = worksize;
{
algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
}
}
else
{
// algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
if (stride[0] != 1 || stride[1] != 1)
{
algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
}
}
}
size_t worksize; // Add to the cache
gpudata *workspace; dnn_conv_update_cache(hashkey, prev_algo);
}
err = cudnnGetConvolutionBackwardDataWorkspaceSize( #ifdef DEBUG
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc, if (params->choose_algo) {
APPLY_SPECIFIC(input), algo, &worksize); if (0 != theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name)) {
cuda_exit(c->ctx);
return 1;
}
// NB: This is printed only when algorithm is chosen at runtime.
fprintf(stderr, "(using %s %s%s%s%s, ws:%ld, hash:%s)\n",
algorithm_name,
params->choose_time ? "(timed)": "" ,
reuse_algo ? "(reused)" : "",
use_cached ? "(cache)": "",
mathtype == CUDNN_TENSOR_OP_MATH ? "(tensor op)" : "",
worksize,
hashkey.c_str()
);
}
#endif
if (err != CUDNN_STATUS_SUCCESS) { if (params->choose_once) {
PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s", reuse_algo = 1;
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
} }
gpudata *workspace = 0;
if (worksize != 0) { if (worksize != 0) {
workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL); workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) { if (workspace == NULL) {
PyErr_SetString(PyExc_RuntimeError, PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory");
"Could not allocate working memory");
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
...@@ -308,8 +335,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -308,8 +335,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
for ( int g = 0; g < params->num_groups; g++) for ( int g = 0; g < groups; g++) {
{
err = cudnnConvolutionBackwardData( err = cudnnConvolutionBackwardData(
params->handle, params->handle,
alpha_p, alpha_p,
...@@ -330,7 +356,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -330,7 +356,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_exit(c->ctx); cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error doing operation: %s", PyErr_Format(PyExc_RuntimeError, "error doing cuDNN conv gradinput operation: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
} }
......
#section init_code_struct #section init_code_struct
prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH;
reuse_algo = 0; reuse_algo = 0;
prev_algo = PARAMS->conv_algo; hash_prefix = std::string("GW|GPU#");
memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims));
#section support_code_struct #section support_code_struct
#line 9 "dnn_gw.c"
int reuse_algo;
AlgoRec prev_algo;
std::string hash_prefix;
#ifdef DEBUG
char algorithm_name[128];
#endif
/** Check given algorithm against inputs and convolution descriptor,
change algorithm inplace to a fallback algorithm if checkings fail.
Return 0 on success, non-0 on error. **/
int dnn_conv_gw_fallback(cudnnConvolutionBwdFilterAlgo_t* _algo,
const PyGpuArrayObject* input,
const PyGpuArrayObject* kerns,
cudnnConvolutionDescriptor_t desc) {
cudnnConvolutionBwdFilterAlgo_t algo = *_algo;
int reuse_algo; // The FFT implementation does not support strides, 1x1 filters or inputs
cudnnConvolutionBwdFilterAlgo_t prev_algo; // with a spatial dimension larger than 1024.
size_t prev_img_dims[5]; // If the chosen implementation is FFT, validate that it can
size_t prev_top_dims[5]; // be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT &&
PyGpuArray_NDIM(input) == 4) {
// Extract the properties of the convolution descriptor
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
cudnnDataType_t data_type;
cudnnStatus_t err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, upscale, &mode, &data_type);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s",
cudnnGetErrorString(err));
return 1;
}
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
(PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) {
algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
#ifdef DEBUG
fprintf(stderr, "(replacing gradweight algo fft with none)\n");
#endif
}
}
*_algo = algo;
return 0;
}
int int
APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
...@@ -23,6 +69,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -23,6 +69,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
void *beta_p; void *beta_p;
float af = alpha, bf = beta; float af = alpha, bf = beta;
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
bool use_cached = 0;
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1] * params->num_groups) { if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1] * params->num_groups) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
...@@ -72,99 +119,71 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -72,99 +119,71 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return 0; return 0;
} }
if (c_set_tensor_for_conv(input, APPLY_SPECIFIC(input), params->num_groups) == -1) int groups = c_get_groups_for_conv(desc, params->num_groups);
if (groups == -1)
return 1; return 1;
if (c_set_tensor_for_conv(output, APPLY_SPECIFIC(output), params->num_groups) == -1) if (c_set_tensor_for_conv(input, APPLY_SPECIFIC(input), groups) == -1)
return 1; return 1;
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns), params->num_groups) == -1) if (c_set_tensor_for_conv(output, APPLY_SPECIFIC(output), groups) == -1)
return 1;
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns), groups) == -1)
return 1; return 1;
size_t input_offset = PyGpuArray_STRIDE(input, 0) / params->num_groups; if (0 != dnn_check_convolution_output(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
size_t kern_offset = PyGpuArray_STRIDE(*kerns, 0) * PyGpuArray_DIM(*kerns, 0) / params->num_groups; PyGpuArray_NDIM(*kerns), output, groups))
size_t output_offset = PyGpuArray_STRIDE(output, 0) / params->num_groups; return 1;
size_t input_offset = PyGpuArray_STRIDE(input, 0) / groups;
size_t kern_offset = PyGpuArray_STRIDE(*kerns, 0) * PyGpuArray_DIM(*kerns, 0) / groups;
size_t output_offset = PyGpuArray_STRIDE(output, 0) / groups;
cudnnConvolutionBwdFilterAlgo_t algo = params->conv_algo; cudnnConvolutionBwdFilterAlgo_t algo = params->conv_algo;
#ifdef DEBUG size_t worksize = 0;
char algorithm_name[128]; cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH;
#endif
cuda_enter(c->ctx); std::string hashkey ;
int expected_output_dims[5] = {0}; size_t maxfree = c_get_largest_free_block_size(c);
err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), if (PyErr_Occurred()) return 1;
PyGpuArray_NDIM(input), expected_output_dims);
if (err != CUDNN_STATUS_SUCCESS) { cuda_enter(c->ctx);
PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
if (PyGpuArray_NDIM(input) == 4) {
if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
(PyGpuArray_DIMS(output)[1] / params->num_groups != expected_output_dims[1]) ||
(PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
(PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%dx%ld"
" but received gradient with shape %ldx%ldx%dx%ld",
expected_output_dims[0], expected_output_dims[1],
expected_output_dims[2], expected_output_dims[3],
PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3]);
cuda_exit(c->ctx);
return 1;
}
} else if (PyGpuArray_NDIM(input) == 5) {
if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
(PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
(PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
(PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
(PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
" but received gradient with shape %ldx%ldx%ldx%ldx%ld",
expected_output_dims[0], expected_output_dims[1],
expected_output_dims[2], expected_output_dims[3],
expected_output_dims[4],
PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3],
PyGpuArray_DIMS(output)[4]);
cuda_exit(c->ctx);
return 1;
}
}
if (params->choose_algo) { if (params->choose_algo) {
if (!params->choose_once) {
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(input, i) == prev_img_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]);
}
}
if (!reuse_algo) { if (!reuse_algo) {
size_t free; char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free); // check out cache
if (err2 != GA_NO_ERROR) { hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), input, APPLY_SPECIFIC(kerns), *kerns, desc, output, groups);
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the " if (hashkey.empty()) {
"memory information on the GPU");
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
hashkey = hash_prefix + pci_id + (params->choose_time ? " -t " : " ") + hashkey;
const AlgoRec* cached = dnn_conv_check_cache(hashkey);
if (cached) {
prev_algo = *cached;
use_cached = 1;
}
}
// Guess 4Mb if the info is not available if (reuse_algo || use_cached) {
if (free == 0) free = 4 * 1024 * 1024; algo = (cudnnConvolutionBwdFilterAlgo_t)prev_algo.algo;
worksize = prev_algo.wsSize;
mathtype = prev_algo.mathType;
} else {
if (params->choose_time) { if (params->choose_time) {
int count; int count;
cudnnConvolutionBwdFilterAlgoPerf_t choice; cudnnConvolutionBwdFilterAlgoPerf_t choice;
gpudata *tmpmem; gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL); // set the 'tensor math ok' flag
c_set_math_type_for_conv(desc, CUDNN_TENSOR_OP_MATH);
tmpmem = gpudata_alloc(c->ctx, maxfree, NULL, 0, NULL);
if (tmpmem == NULL) { if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory"); PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
cuda_exit(c->ctx);
return -1; return -1;
} }
...@@ -172,7 +191,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -172,7 +191,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc,
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns),
1, &count, &choice, *(void **)tmpmem, free); 1, &count, &choice, *(void **)tmpmem, maxfree);
gpudata_release(tmpmem); gpudata_release(tmpmem);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -183,25 +202,32 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -183,25 +202,32 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return 1; return 1;
} }
algo = choice.algo;
#ifdef DEBUG #ifdef DEBUG
if (count == 0) { if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradweight algorithm found"); PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradweight algorithm found");
cuda_exit(c->ctx);
return 1; return 1;
} else if (choice.status != CUDNN_STATUS_SUCCESS) { } else if (choice.status != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error getting best-timed gradweight algo: %s", "error getting best-timed gradweight algo: %s",
cudnnGetErrorString(choice.status)); cudnnGetErrorString(choice.status));
cuda_exit(c->ctx);
return 1; return 1;
} // Else, count is necessarly 1 for current implementation. } // Else, count is necessarly 1 for current implementation.
#endif #endif
algo = choice.algo;
prev_algo.algo = (int)algo;
prev_algo.wsSize = worksize = choice.memory;
#if CUDNN_MAJOR >= 7
prev_algo.mathType = mathtype = choice.mathType;
#endif
} else { } else {
err = cudnnGetConvolutionBackwardFilterAlgorithm( err = cudnnGetConvolutionBackwardFilterAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
desc, APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(kerns),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo); CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, maxfree, &algo);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s", "error selecting convolution algo: %s",
...@@ -209,79 +235,84 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -209,79 +235,84 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
prev_algo.algo = algo;
// no tensor_op returned from Get()
prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH;
} }
prev_algo = algo;
} else {
algo = prev_algo;
} }
} /* choose_algo */
if (c_set_math_type_for_conv(desc, mathtype) == -1 ||
dnn_conv_gw_fallback(&algo, input, *kerns, desc) != 0) {
cuda_exit(c->ctx);
return 1;
}
#ifdef DEBUG // if FindEx was used (choose_time), workspace size is set.
if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name)) if (!(reuse_algo || use_cached || params->choose_time))
{
err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), algo, &worksize);
if (err == CUDNN_STATUS_NOT_SUPPORTED) {
// Fallback to none algo if not supported
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name)) {
cuda_exit(c->ctx);
return 1; return 1;
// NB: This is printed only when algorithm is chosen at runtime.
if (reuse_algo)
fprintf(stderr, "(reused %s)\n", algorithm_name);
else
fprintf(stderr, "(using %s)\n", algorithm_name);
#endif
if (params->choose_once) {
reuse_algo = 1;
} else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
} }
fprintf(stderr, "(error getting worksize for %s: falling back to CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0)\n",
algorithm_name);
#endif
algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), algo, &worksize);
} }
}
// The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024.
// If the chosen implementation is FFT, validate that it can
// be used on the current data and default to a safe implementation if it
// can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are
// defined only for 2d filters
if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT &&
PyGpuArray_NDIM(input) == 4) {
// Extract the properties of the convolution descriptor
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
cudnnDataType_t data_type;
err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
upscale, &mode, &data_type);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
"error getting convolution properties: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 ||
(PyGpuArray_DIM(*kerns, 2) == 1 && PyGpuArray_DIM(*kerns, 3) == 1)) {
algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
}
} }
size_t worksize; if (params->choose_algo && (!params->choose_once || !reuse_algo)) {
gpudata *workspace; // algo may have changed due to fallback, we must update it.
prev_algo.algo = algo;
// save worksize for next time/cache
prev_algo.wsSize = worksize;
err = cudnnGetConvolutionBackwardFilterWorkspaceSize( // Add to the cache
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, dnn_conv_update_cache(hashkey, prev_algo);
APPLY_SPECIFIC(kerns), algo, &worksize); }
if (err != CUDNN_STATUS_SUCCESS) { #ifdef DEBUG
PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s", if (params->choose_algo) {
cudnnGetErrorString(err)); if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name)) {
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
}
// NB: This is printed only when algorithm is chosen at runtime.
fprintf(stderr, "(using %s %s%s%s%s, ws:%ld, hash:%s)\n",
algorithm_name,
params->choose_time ? "(timed)": "" ,
reuse_algo ? "(reused)" : "",
use_cached ? "(cache)": "",
mathtype == CUDNN_TENSOR_OP_MATH ? "(tensor op)" : "",
worksize,
hashkey.c_str()
);
}
#endif
if (params->choose_once) {
reuse_algo = 1;
} }
gpudata *workspace = 0;
if (worksize != 0) { if (worksize != 0) {
workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL); workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) { if (workspace == NULL) {
...@@ -295,9 +326,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -295,9 +326,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
for ( int g = 0; g < params->num_groups; g++) for ( int g = 0; g < groups; g++) {
{
err = cudnnConvolutionBackwardFilter( err = cudnnConvolutionBackwardFilter(
params->handle, params->handle,
alpha_p, alpha_p,
...@@ -318,7 +347,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -318,7 +347,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_exit(c->ctx); cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error doing operation: %s", PyErr_Format(PyExc_RuntimeError, "error doing cuDNN conv gradweight operation: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
} }
......
...@@ -399,7 +399,7 @@ class DnnBase(COp): ...@@ -399,7 +399,7 @@ class DnnBase(COp):
return [] return []
def c_code_cache_version(self): def c_code_cache_version(self):
return (super(DnnBase, self).c_code_cache_version(), version(), 1) return (super(DnnBase, self).c_code_cache_version(), version(), 4)
class GpuDnnConvDesc(COp): class GpuDnnConvDesc(COp):
...@@ -412,7 +412,8 @@ class GpuDnnConvDesc(COp): ...@@ -412,7 +412,8 @@ class GpuDnnConvDesc(COp):
""" """
__props__ = ('border_mode', 'subsample', 'dilation', 'conv_mode', 'precision') __props__ = ('border_mode', 'subsample', 'dilation', 'conv_mode',
'precision', 'num_groups')
params_type = ParamsType(pad0=int_t, pad1=int_t, pad2=int_t, params_type = ParamsType(pad0=int_t, pad1=int_t, pad2=int_t,
sub0=int_t, sub1=int_t, sub2=int_t, sub0=int_t, sub1=int_t, sub2=int_t,
dil0=int_t, dil1=int_t, dil2=int_t, dil0=int_t, dil1=int_t, dil2=int_t,
...@@ -421,7 +422,8 @@ class GpuDnnConvDesc(COp): ...@@ -421,7 +422,8 @@ class GpuDnnConvDesc(COp):
('BORDER_MODE_VALID', 'valid'), ('BORDER_MODE_VALID', 'valid'),
('BORDER_MODE_HALF', 'half')), ('BORDER_MODE_HALF', 'half')),
conv_mode=cudnn.cudnnConvolutionMode_t, conv_mode=cudnn.cudnnConvolutionMode_t,
precision=cudnn.cudnnDataType_t) precision=cudnn.cudnnDataType_t,
num_groups=int_t)
def c_headers(self): def c_headers(self):
return ['cudnn.h', 'cudnn_helper.h'] return ['cudnn.h', 'cudnn_helper.h']
...@@ -448,7 +450,7 @@ class GpuDnnConvDesc(COp): ...@@ -448,7 +450,7 @@ class GpuDnnConvDesc(COp):
return False return False
def __init__(self, border_mode, subsample=(1, 1), dilation=(1, 1), conv_mode='conv', def __init__(self, border_mode, subsample=(1, 1), dilation=(1, 1), conv_mode='conv',
precision="float32"): precision="float32", num_groups=1):
COp.__init__(self, ["c_code/conv_desc.c"], "APPLY_SPECIFIC(conv_desc)") COp.__init__(self, ["c_code/conv_desc.c"], "APPLY_SPECIFIC(conv_desc)")
if version() < 6000 and any([d != 1 for d in dilation]): if version() < 6000 and any([d != 1 for d in dilation]):
...@@ -470,6 +472,7 @@ class GpuDnnConvDesc(COp): ...@@ -470,6 +472,7 @@ class GpuDnnConvDesc(COp):
self.subsample = subsample self.subsample = subsample
assert cudnn.cudnnConvolutionMode_t.has_alias(conv_mode) assert cudnn.cudnnConvolutionMode_t.has_alias(conv_mode)
self.conv_mode = conv_mode self.conv_mode = conv_mode
self.num_groups = num_groups
assert len(dilation) == len(subsample) assert len(dilation) == len(subsample)
self.dilation = dilation self.dilation = dilation
...@@ -514,6 +517,8 @@ class GpuDnnConvDesc(COp): ...@@ -514,6 +517,8 @@ class GpuDnnConvDesc(COp):
self.__dict__.update(d) self.__dict__.update(d)
if not hasattr(self, "dilation"): if not hasattr(self, "dilation"):
self.dilation = (1,) * len(self.subsample) self.dilation = (1,) * len(self.subsample)
if not hasattr(self, "num_groups"):
self.num_groups = 1
# scalar constants # scalar constants
...@@ -622,8 +627,6 @@ class GpuDnnConv(DnnBase): ...@@ -622,8 +627,6 @@ class GpuDnnConv(DnnBase):
SUPPORTED_DNN_CONV_ALGO_RUNTIME): SUPPORTED_DNN_CONV_ALGO_RUNTIME):
raise ValueError("convolution algo %s can't be used for " raise ValueError("convolution algo %s can't be used for "
"3d convolutions", (self.algo,)) "3d convolutions", (self.algo,))
if img.type.ndim == 5 and self.num_groups != 1:
raise ValueError("Grouped convolutions not implemented for 3D convolutions")
if (not isinstance(desc.type, CDataType) or if (not isinstance(desc.type, CDataType) or
desc.type.ctype != 'cudnnConvolutionDescriptor_t'): desc.type.ctype != 'cudnnConvolutionDescriptor_t'):
...@@ -854,7 +857,6 @@ class GpuDnnConvGradI(DnnBase): ...@@ -854,7 +857,6 @@ class GpuDnnConvGradI(DnnBase):
if algo is None: if algo is None:
algo = config.dnn.conv.algo_bwd_data algo = config.dnn.conv.algo_bwd_data
self.algo = algo self.algo = algo
assert cudnn.cudnnConvolutionBwdDataAlgo_t.has_alias(self.algo) or self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME assert cudnn.cudnnConvolutionBwdDataAlgo_t.has_alias(self.algo) or self.algo in SUPPORTED_DNN_CONV_ALGO_RUNTIME
self.conv_algo = cudnn.cudnnConvolutionBwdDataAlgo_t.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 self.conv_algo = cudnn.cudnnConvolutionBwdDataAlgo_t.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0
...@@ -1039,7 +1041,8 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), dilation=(1, 1), ...@@ -1039,7 +1041,8 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), dilation=(1, 1),
img = gpu_contiguous(img) img = gpu_contiguous(img)
kerns = gpu_contiguous(kerns) kerns = gpu_contiguous(kerns)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation, desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation,
conv_mode=conv_mode, precision=precision)(kerns.shape) conv_mode=conv_mode, precision=precision,
num_groups=num_groups)(kerns.shape)
desc_op = desc.owner.op desc_op = desc.owner.op
# We can use Shape_i and bypass the infer_shape here as this is on # We can use Shape_i and bypass the infer_shape here as this is on
# the input of node and it will always be present. # the input of node and it will always be present.
...@@ -1056,7 +1059,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), dilation=(1, 1), ...@@ -1056,7 +1059,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), dilation=(1, 1),
def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1, 1, 1), def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1, 1, 1),
conv_mode='conv', direction_hint=None, conv_mode='conv', direction_hint=None,
algo=None, precision=None): algo=None, precision=None, num_groups=1):
""" """
GPU convolution using cuDNN from NVIDIA. GPU convolution using cuDNN from NVIDIA.
...@@ -1099,6 +1102,9 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1 ...@@ -1099,6 +1102,9 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1
should be done. Possible values are 'as_input', 'float16', 'float32' should be done. Possible values are 'as_input', 'float16', 'float32'
and 'float64'. Default is the value of and 'float64'. Default is the value of
:attr:`config.dnn.conv.precision`. :attr:`config.dnn.conv.precision`.
num_groups :
Divides the image, kernel and output tensors into num_groups
separate groups. Each which carry out convolutions separately
.. warning:: The cuDNN library only works with GPUs that have a compute .. warning:: The cuDNN library only works with GPUs that have a compute
...@@ -1113,7 +1119,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1 ...@@ -1113,7 +1119,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1
fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None) fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None)
ctx_name = infer_context_name(img, kerns) ctx_name = infer_context_name(img, kerns)
if (border_mode == 'valid' and subsample == (1, 1, 1) and dilation == (1, 1, 1) and if (border_mode == 'valid' and subsample == (1, 1, 1) and dilation == (1, 1, 1) and
direction_hint == 'bprop weights'): direction_hint == 'bprop weights' and num_groups == 1):
# Special case: We are asked to use GpuDnnConvGradW. We need to set # Special case: We are asked to use GpuDnnConvGradW. We need to set
# up a suitable 'fake' convolution to compute the gradient for. # up a suitable 'fake' convolution to compute the gradient for.
img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4)) img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4))
...@@ -1135,7 +1141,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1 ...@@ -1135,7 +1141,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1
return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3, 4), ctx_name) return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3, 4), ctx_name)
elif (border_mode == 'full' and subsample == (1, 1, 1) and elif (border_mode == 'full' and subsample == (1, 1, 1) and
direction_hint != 'forward!'): direction_hint != 'forward!' and num_groups == 1):
# Special case: We can be faster by using GpuDnnConvGradI to compute # Special case: We can be faster by using GpuDnnConvGradI to compute
# the full convolution as the backward pass of a valid convolution. # the full convolution as the backward pass of a valid convolution.
# We just need to set up a suitable 'fake' valid convolution. # We just need to set up a suitable 'fake' valid convolution.
...@@ -1159,7 +1165,8 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1 ...@@ -1159,7 +1165,8 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1
img = gpu_contiguous(img) img = gpu_contiguous(img)
kerns = gpu_contiguous(kerns) kerns = gpu_contiguous(kerns)
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation, desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation,
conv_mode=conv_mode, precision=precision)(kerns.shape) conv_mode=conv_mode, precision=precision,
num_groups=num_groups)(kerns.shape)
desc_op = desc.owner.op desc_op = desc.owner.op
# We can use Shape_i and bypass the infer_shape here as this is on # We can use Shape_i and bypass the infer_shape here as this is on
# the input of node and it will always be present. # the input of node and it will always be present.
...@@ -1171,7 +1178,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1 ...@@ -1171,7 +1178,7 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), dilation=(1
filter_dilation=dilation) filter_dilation=dilation)
out_shp = assert_conv_shape(out_shp) out_shp = assert_conv_shape(out_shp)
out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp) out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
return GpuDnnConv(algo=algo)(img, kerns, out, desc) return GpuDnnConv(algo=algo, num_groups=num_groups)(img, kerns, out, desc)
def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid', def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
...@@ -1189,18 +1196,21 @@ def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid', ...@@ -1189,18 +1196,21 @@ def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
precision = get_precision(precision, [img, topgrad]) precision = get_precision(precision, [img, topgrad])
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation, desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation,
conv_mode=conv_mode, precision=precision)(kerns_shp) conv_mode=conv_mode, precision=precision,
num_groups=num_groups)(kerns_shp)
out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*kerns_shp) out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*kerns_shp)
return GpuDnnConvGradW(algo=algo, num_groups=num_groups)(img, topgrad, out, desc) return GpuDnnConvGradW(algo=algo, num_groups=num_groups)(img, topgrad, out, desc)
def dnn_gradweight3d(img, topgrad, kerns_shp, border_mode='valid', def dnn_gradweight3d(img, topgrad, kerns_shp, border_mode='valid',
subsample=(1, 1, 1), dilation=(1, 1, 1), conv_mode='conv', precision=None): subsample=(1, 1, 1), dilation=(1, 1, 1), conv_mode='conv',
precision=None, algo=None, num_groups=1):
""" """
3d version of dnn_gradweight 3d version of dnn_gradweight
""" """
return dnn_gradweight(img, topgrad, kerns_shp, border_mode, return dnn_gradweight(img, topgrad, kerns_shp, border_mode,
subsample, dilation, conv_mode, precision) subsample, dilation, conv_mode, precision,
algo, num_groups)
def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid', def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
...@@ -1218,18 +1228,21 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid', ...@@ -1218,18 +1228,21 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
precision = get_precision(precision, [kerns, topgrad]) precision = get_precision(precision, [kerns, topgrad])
desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation, desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample, dilation=dilation,
conv_mode=conv_mode, precision=precision)(kerns.shape) conv_mode=conv_mode, precision=precision,
num_groups=num_groups)(kerns.shape)
out = GpuAllocEmpty(dtype=kerns.dtype, context_name=ctx_name)(*img_shp) out = GpuAllocEmpty(dtype=kerns.dtype, context_name=ctx_name)(*img_shp)
return GpuDnnConvGradI(algo=algo, num_groups=num_groups)(kerns, topgrad, out, desc) return GpuDnnConvGradI(algo=algo, num_groups=num_groups)(kerns, topgrad, out, desc)
def dnn_gradinput3d(kerns, topgrad, img_shp, border_mode='valid', def dnn_gradinput3d(kerns, topgrad, img_shp, border_mode='valid',
subsample=(1, 1, 1), dilation=(1, 1, 1), conv_mode='conv', precision=None): subsample=(1, 1, 1), dilation=(1, 1, 1), conv_mode='conv',
precision=None, algo=None, num_groups=1):
""" """
3d version of `dnn_gradinput`. 3d version of `dnn_gradinput`.
""" """
return dnn_gradinput(kerns, topgrad, img_shp, border_mode, subsample, return dnn_gradinput(kerns, topgrad, img_shp, border_mode, subsample,
dilation, conv_mode, precision) dilation, conv_mode, precision, algo,
num_groups)
class GpuDnnPoolDesc(Op): class GpuDnnPoolDesc(Op):
...@@ -3020,8 +3033,6 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs): ...@@ -3020,8 +3033,6 @@ def local_abstractconv_cudnn_graph(op, context_name, inputs, outputs):
if version(raises=False) < 6000 and op.filter_dilation != (1, 1): if version(raises=False) < 6000 and op.filter_dilation != (1, 1):
return None return None
if op.num_groups > 1:
return None
inp1 = inputs[0] inp1 = inputs[0]
inp2 = inputs[1] inp2 = inputs[1]
...@@ -3071,8 +3082,6 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs): ...@@ -3071,8 +3082,6 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
if version(raises=False) < 6000 and op.filter_dilation != (1, 1, 1): if version(raises=False) < 6000 and op.filter_dilation != (1, 1, 1):
return None return None
if op.num_groups > 1:
return None
inp1 = inputs[0] inp1 = inputs[0]
inp2 = inputs[1] inp2 = inputs[1]
...@@ -3091,7 +3100,8 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs): ...@@ -3091,7 +3100,8 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
subsample=op.subsample, subsample=op.subsample,
dilation=op.filter_dilation, dilation=op.filter_dilation,
direction_hint='forward!', direction_hint='forward!',
conv_mode=conv_mode) conv_mode=conv_mode,
num_groups=op.num_groups)
elif isinstance(op, AbstractConv3d_gradWeights): elif isinstance(op, AbstractConv3d_gradWeights):
shape = (inp2.shape[1], inp1.shape[1], shape = (inp2.shape[1], inp1.shape[1],
inputs[2][0], inputs[2][1], inputs[2][2]) inputs[2][0], inputs[2][1], inputs[2][2])
...@@ -3099,7 +3109,8 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs): ...@@ -3099,7 +3109,8 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
border_mode=op.border_mode, border_mode=op.border_mode,
subsample=op.subsample, subsample=op.subsample,
dilation=op.filter_dilation, dilation=op.filter_dilation,
conv_mode=conv_mode) conv_mode=conv_mode,
num_groups=op.num_groups)
elif isinstance(op, AbstractConv3d_gradInputs): elif isinstance(op, AbstractConv3d_gradInputs):
shape = (inp2.shape[0], inp1.shape[1], shape = (inp2.shape[0], inp1.shape[1],
inputs[2][0], inputs[2][1], inputs[2][2]) inputs[2][0], inputs[2][1], inputs[2][2])
...@@ -3107,7 +3118,8 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs): ...@@ -3107,7 +3118,8 @@ def local_abstractconv3d_cudnn_graph(op, context_name, inputs, outputs):
border_mode=op.border_mode, border_mode=op.border_mode,
subsample=op.subsample, subsample=op.subsample,
dilation=op.filter_dilation, dilation=op.filter_dilation,
conv_mode=conv_mode) conv_mode=conv_mode,
num_groups=op.num_groups)
return [rval] return [rval]
......
...@@ -26,6 +26,7 @@ from .rnn_support import Model, GRU, LSTM, WrapperLayer ...@@ -26,6 +26,7 @@ from .rnn_support import Model, GRU, LSTM, WrapperLayer
from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_FWD from theano.configdefaults import SUPPORTED_DNN_CONV_ALGO_FWD
from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv_noOptim
from theano.tensor.nnet.tests.test_abstract_conv import Grouped_conv3d_noOptim
try: try:
import pygpu import pygpu
...@@ -2264,7 +2265,7 @@ def test_dnn_rnn_lstm_grad_c(): ...@@ -2264,7 +2265,7 @@ def test_dnn_rnn_lstm_grad_c():
utt.assert_allclose(ref_grads_layer[j], g) utt.assert_allclose(ref_grads_layer[j], g)
def dconv2d(border_mode, subsample, filter_dilation, num_groups): def dconvfwd(border_mode, subsample, filter_dilation, num_groups):
def dconv(img, kern): def dconv(img, kern):
return dnn.dnn_conv(img, kern, border_mode=border_mode, subsample=subsample, dilation=filter_dilation, return dnn.dnn_conv(img, kern, border_mode=border_mode, subsample=subsample, dilation=filter_dilation,
conv_mode='conv', direction_hint='forward', workmem=None, conv_mode='conv', direction_hint='forward', workmem=None,
...@@ -2272,14 +2273,14 @@ def dconv2d(border_mode, subsample, filter_dilation, num_groups): ...@@ -2272,14 +2273,14 @@ def dconv2d(border_mode, subsample, filter_dilation, num_groups):
return dconv return dconv
def dconvgw(border_mode, subsample, filter_dilation, num_groups):
    """Build a closure computing the cuDNN gradient w.r.t. the filters.

    The returned callable has the ``(img, topgrad, kshp)`` signature the
    grouped-convolution test harness expects; all convolution settings are
    captured from the enclosing scope.
    """
    def dconvw(img, topgrad, kshp):
        return dnn.dnn_gradweight(
            img, topgrad, kshp,
            border_mode=border_mode,
            subsample=subsample,
            dilation=filter_dilation,
            conv_mode='conv',
            precision=None,
            algo=None,
            num_groups=num_groups)
    return dconvw
def dconv2di(border_mode, subsample, filter_dilation, num_groups): def dconvgi(border_mode, subsample, filter_dilation, num_groups):
def dconvi(kern, topgrad, imshp): def dconvi(kern, topgrad, imshp):
return dnn.dnn_gradinput(kern, topgrad, imshp, border_mode=border_mode, subsample=subsample, dilation=filter_dilation, return dnn.dnn_gradinput(kern, topgrad, imshp, border_mode=border_mode, subsample=subsample, dilation=filter_dilation,
conv_mode='conv', precision=None, algo=None, num_groups=num_groups) conv_mode='conv', precision=None, algo=None, num_groups=num_groups)
...@@ -2288,9 +2289,21 @@ def dconv2di(border_mode, subsample, filter_dilation, num_groups): ...@@ -2288,9 +2289,21 @@ def dconv2di(border_mode, subsample, filter_dilation, num_groups):
class Cudnn_grouped_conv(Grouped_conv_noOptim):
    """2D grouped-convolution tests run through the cuDNN Ops.

    Reuses the generic grouped-convolution test suite, plugging in the
    cuDNN forward/gradWeight/gradInput builders and Op classes.
    """
    mode = mode_with_gpu
    conv = staticmethod(dconvfwd)
    conv_gradw = staticmethod(dconvgw)
    conv_gradi = staticmethod(dconvgi)
    conv_op = dnn.GpuDnnConv
    conv_gradw_op = dnn.GpuDnnConvGradW
    conv_gradi_op = dnn.GpuDnnConvGradI
    # cuDNN performs correlation, so the reference must not flip filters.
    flip_filter = False
    is_dnn = True
class Cudnn_grouped_conv3d(Grouped_conv3d_noOptim):
    """3D grouped-convolution tests run through the cuDNN Ops."""
    mode = mode_with_gpu
    conv = staticmethod(dconvfwd)
    conv_gradw = staticmethod(dconvgw)
    conv_gradi = staticmethod(dconvgi)
    conv_op = dnn.GpuDnnConv
    conv_gradw_op = dnn.GpuDnnConvGradW
    conv_gradi_op = dnn.GpuDnnConvGradI
...@@ -2519,3 +2532,151 @@ def test_dnn_spatialtf_grad(): ...@@ -2519,3 +2532,151 @@ def test_dnn_spatialtf_grad():
utt.verify_grad(grad_functor, [inputs_val, theta_val], mode=mode_with_gpu, utt.verify_grad(grad_functor, [inputs_val, theta_val], mode=mode_with_gpu,
abs_tol=atol, rel_tol=rtol) abs_tol=atol, rel_tol=rtol)
class TestDnnConv2DRuntimeAlgorithms(object):
    # Exercises the cuDNN convolution Ops when the algorithm is selected at
    # runtime ('time_once', 'guess_once', 'time_on_shape_change',
    # 'guess_on_shape_change'), comparing GPU results against a CPU
    # CorrMM reference and re-running repeated shapes so the per-shape
    # algorithm cache introduced by this PR gets hit.
    ndim = 2
    cpu_conv_class = theano.tensor.nnet.corr.CorrMM
    # (repeat_count, [input_shape, filter_shape]) pairs. Entries repeated
    # later in the list should be served from the algorithm cache instead
    # of triggering a fresh runtime selection.
    runtime_shapes = [
        (3, [(2, 3, 10, 9), (5, 3, 7, 7)]),
        (1, [(1, 1, 100, 200), (1, 1, 50, 200)]),
        (1, [(4, 2, 20, 20), (2, 2, 20, 19)]),
        (3, [(2, 3, 10, 9), (5, 3, 7, 7)]),  # cache should be used
        (1, [(2, 2, 50, 50), (5, 2, 25, 31)]),
        (1, [(1, 1, 100, 200), (1, 1, 50, 200)]),  # cache should be used
        (1, [(4, 2, 20, 20), (2, 2, 20, 19)]),  # cache should be used
        (1, [(1, 2, 3, 4), (6, 2, 2, 1)])
    ]

    def __init__(self):
        # Skip the whole class when cuDNN is not available on the test
        # context (nose instantiates the class per generated test).
        if not dnn.dnn_available(test_ctx_name):
            raise SkipTest(dnn.dnn_available.msg)
        utt.seed_rng()
        self.runtime_algorithms = ('time_once', 'guess_once', 'time_on_shape_change', 'guess_on_shape_change')

    def test_fwd_runtime_algorithms(self):
        # Nose test generator: yields one forward-convolution check per
        # runtime algorithm.
        dtype = 'float32'
        unit_shape = (1,) * self.ndim
        _broadcastable = [False] * (2 + self.ndim)

        def run_fwd_runtime_algorithm(algo):
            inputs = theano.tensor.TensorType(dtype, _broadcastable)()
            filters = theano.tensor.TensorType(dtype, _broadcastable)()
            # Scale down the input values to prevent very large absolute errors
            # due to float rounding
            lower_inputs = inputs / 10
            lower_filters = filters / 10
            conv = dnn.dnn_conv(img=lower_inputs, kerns=lower_filters, algo=algo, precision=dtype,
                                subsample=unit_shape, dilation=unit_shape)
            f = theano.function([inputs, filters], conv, mode=mode_with_gpu)
            # cuDNN computes correlation; flip the filters so the CorrMM
            # reference computes the same thing.
            if self.ndim == 3:
                flipped_filters = lower_filters[:, :, ::-1, ::-1, ::-1]
            else:
                flipped_filters = lower_filters[:, :, ::-1, ::-1]
            conv_ref = self.cpu_conv_class(subsample=unit_shape)(ref_cast(lower_inputs), flipped_filters)
            f_ref = theano.function([inputs, filters], conv_ref, mode='FAST_RUN')
            runtime_shapes = self.runtime_shapes
            if algo in ('time_once', 'guess_once'):
                # *_once algorithms pick once and reuse: run a single shape
                # several times instead of changing shapes.
                runtime_shapes = [list(runtime_shapes[0])]
                runtime_shapes[0][0] = 5
            for ntimes, (inputs_shape, filters_shape) in runtime_shapes:
                for i in range(ntimes):
                    inputs_val = np.random.random(inputs_shape).astype(dtype)
                    filters_val = np.random.random(filters_shape).astype(dtype)
                    gpu_res = f(inputs_val, filters_val)
                    cpu_res = f_ref(inputs_val, filters_val)
                    utt.assert_allclose(cpu_res, np.asarray(gpu_res))

        for algo in self.runtime_algorithms:
            yield (run_fwd_runtime_algorithm, algo)

    def test_gradinput_runtime_algorithms(self):
        # Nose test generator: gradient w.r.t. inputs (GpuDnnConvGradI)
        # with the backward-data algorithm chosen at runtime.
        dtype = 'float32'
        unit_shape = (1,) * self.ndim
        _broadcastable = [False] * (2 + self.ndim)

        def run_gradinput_runtime_algorithm(algo):
            # NOTE(review): mutates global config without restoring it —
            # later tests in the process see the last value set here.
            theano.config.dnn.conv.algo_bwd_data = algo
            inputs = theano.tensor.TensorType(dtype, _broadcastable)()
            filters = theano.tensor.TensorType(dtype, _broadcastable)()
            conv = dnn.dnn_conv(img=inputs, kerns=filters, algo=algo, precision=dtype,
                                subsample=unit_shape, dilation=unit_shape)
            grad_i = theano.tensor.grad(conv.sum(), [inputs])
            f = theano.function([inputs, filters], grad_i, mode=mode_with_gpu)
            # The optimizer must have produced exactly one GradI node and
            # no forward/GradW nodes.
            assert 1 == len([node for node in f.maker.fgraph.apply_nodes if isinstance(node.op, dnn.GpuDnnConvGradI)])
            assert not any(isinstance(node.op, dnn.GpuDnnConv) for node in f.maker.fgraph.apply_nodes)
            assert not any(isinstance(node.op, dnn.GpuDnnConvGradW) for node in f.maker.fgraph.apply_nodes)
            if self.ndim == 3:
                flipped_filters = filters[:, :, ::-1, ::-1, ::-1]
            else:
                flipped_filters = filters[:, :, ::-1, ::-1]
            conv_ref = self.cpu_conv_class(subsample=unit_shape)(ref_cast(inputs), flipped_filters)
            grad_i_ref = theano.tensor.grad(conv_ref.sum(), [inputs])
            f_ref = theano.function([inputs, filters], grad_i_ref, mode='FAST_RUN')
            runtime_shapes = self.runtime_shapes
            if algo in ('time_once', 'guess_once'):
                runtime_shapes = [list(runtime_shapes[0])]
                runtime_shapes[0][0] = 5
            for ntimes, (inputs_shape, filters_shape) in runtime_shapes:
                for i in range(ntimes):
                    inputs_val = np.random.random(inputs_shape).astype(dtype)
                    filters_val = np.random.random(filters_shape).astype(dtype)
                    gpu_res = f(inputs_val, filters_val)
                    cpu_res = f_ref(inputs_val, filters_val)
                    utt.assert_allclose(cpu_res, np.asarray(gpu_res))

        for algo in self.runtime_algorithms:
            yield (run_gradinput_runtime_algorithm, algo)

    def test_gradweight_runtime_algorithms(self):
        # Nose test generator: gradient w.r.t. weights (GpuDnnConvGradW)
        # with the backward-filter algorithm chosen at runtime.
        dtype = 'float32'
        unit_shape = (1,) * self.ndim
        _broadcastable = [False] * (2 + self.ndim)

        def run_gradweight_runtime_algorithm(algo):
            # NOTE(review): same global-config mutation caveat as the
            # gradinput test above.
            theano.config.dnn.conv.algo_bwd_filter = algo
            inputs = theano.tensor.TensorType(dtype, _broadcastable)()
            filters = theano.tensor.TensorType(dtype, _broadcastable)()
            conv = dnn.dnn_conv(img=inputs, kerns=filters, algo=algo, precision=dtype,
                                subsample=unit_shape, dilation=unit_shape)
            grad_w = theano.tensor.grad(conv.sum(), [filters])
            f = theano.function([inputs, filters], grad_w, mode=mode_with_gpu)
            assert 1 == len([node for node in f.maker.fgraph.apply_nodes if isinstance(node.op, dnn.GpuDnnConvGradW)])
            assert not any(isinstance(node.op, dnn.GpuDnnConv) for node in f.maker.fgraph.apply_nodes)
            assert not any(isinstance(node.op, dnn.GpuDnnConvGradI) for node in f.maker.fgraph.apply_nodes)
            if self.ndim == 3:
                flipped_filters = filters[:, :, ::-1, ::-1, ::-1]
            else:
                flipped_filters = filters[:, :, ::-1, ::-1]
            conv_ref = self.cpu_conv_class(subsample=unit_shape)(ref_cast(inputs), flipped_filters)
            grad_w_ref = theano.tensor.grad(conv_ref.sum(), [filters])
            f_ref = theano.function([inputs, filters], grad_w_ref, mode='FAST_RUN')
            runtime_shapes = self.runtime_shapes
            if algo in ('time_once', 'guess_once'):
                runtime_shapes = [list(runtime_shapes[0])]
                runtime_shapes[0][0] = 5
            for ntimes, (inputs_shape, filters_shape) in runtime_shapes:
                for i in range(ntimes):
                    inputs_val = np.random.random(inputs_shape).astype(dtype)
                    filters_val = np.random.random(filters_shape).astype(dtype)
                    gpu_res = f(inputs_val, filters_val)
                    cpu_res = f_ref(inputs_val, filters_val)
                    utt.assert_allclose(cpu_res, np.asarray(gpu_res))

        for algo in self.runtime_algorithms:
            yield (run_gradweight_runtime_algorithm, algo)
class TestDnnConv3DRuntimeAlgorithms(TestDnnConv2DRuntimeAlgorithms):
    # 3D variant of the runtime-algorithm tests: inherits every test
    # generator from the 2D class and only overrides the dimensionality,
    # the CPU reference Op, and the 5D shape list.
    ndim = 3
    cpu_conv_class = theano.tensor.nnet.corr3d.Corr3dMM
    # (repeat_count, [input_shape, filter_shape]) pairs; repeated entries
    # should be served from the algorithm cache.
    runtime_shapes = [
        (3, [(2, 3, 5, 10, 9), (5, 3, 4, 7, 7)]),
        (1, [(1, 1, 5, 100, 200), (1, 1, 4, 50, 200)]),
        (1, [(4, 2, 20, 20, 20), (2, 2, 20, 19, 18)]),
        (3, [(2, 3, 5, 10, 9), (5, 3, 4, 7, 7)]),  # cache should be used
        (1, [(2, 2, 50, 50, 5), (5, 2, 25, 31, 4)]),
        (1, [(1, 1, 5, 100, 200), (1, 1, 4, 50, 200)]),  # cache should be used
        (1, [(4, 2, 20, 20, 20), (2, 2, 20, 19, 18)]),  # cache should be used
        (1, [(1, 2, 3, 4, 5), (6, 2, 3, 2, 1)])
    ]
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论