提交 b998dc61 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #6302 from borisfom/tensor_op

Tensor op, cache
#section support_code_apply
/**
 * Store the group count on a convolution descriptor.
 *
 * When built against cuDNN >= 7 this calls cudnnSetConvolutionGroupCount();
 * on failure a Python RuntimeError is set and -1 is returned.
 * For older cuDNN versions the function does nothing.
 *
 * Returns 0 on success (or when nothing had to be done), -1 on error.
 */
static int c_set_groups_for_conv(cudnnConvolutionDescriptor_t desc, int groups) {
#if CUDNN_MAJOR >= 7
  const cudnnStatus_t status = cudnnSetConvolutionGroupCount(desc, groups);
  if (status == CUDNN_STATUS_SUCCESS)
    return 0;
  PyErr_Format(PyExc_RuntimeError,
               "error setting groups for convolution : %s",
               cudnnGetErrorString(status));
  return -1;
#else
  return 0;
#endif
}
int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
cudnnConvolutionDescriptor_t *desc,
PARAMS_TYPE* params) {
......@@ -43,5 +56,7 @@ int APPLY_SPECIFIC(conv_desc)(PyArrayObject *filt_shp,
"descriptor: %s", cudnnGetErrorString(err));
return -1;
}
if (c_set_groups_for_conv(*desc, params->num_groups) == -1)
return -1;
return 0;
}
......@@ -11,6 +11,14 @@ static inline int cudnnGetVersion() {
}
#endif
/* Fallback declaration of cudnnMathType_t, compiled only when CUDNN_MAJOR < 7,
   so that code using the type (e.g. AlgoRec::mathType) builds either way.
   NOTE(review): presumably cuDNN headers older than 7 do not declare this
   enum and the values mirror the cuDNN 7 ones -- confirm. */
#if CUDNN_MAJOR < 7
enum cudnnMathType_t { CUDNN_DEFAULT_MATH=0, CUDNN_TENSOR_OP_MATH = 1 };
#endif
/* A common record for all 3 cuDNN algorithm enums: the algorithm is kept as a
   plain int so one struct can be used regardless of which enum it came from.
   This is the value type stored in the AlgoCache map. */
struct AlgoRec {
/* algorithm identifier, stored as int (see above) */
int algo;
/* NOTE(review): presumably the workspace size in bytes needed by `algo` -- confirm */
size_t wsSize;
/* math type associated with the algorithm (default or tensor-op) */
cudnnMathType_t mathType;
};
#endif
......@@ -3,6 +3,43 @@ cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
/* cuDNN descriptors for the output tensor and the filters (kernels).
   They are created in the init_code_struct section and destroyed in the
   cleanup code further down in this file. */
cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);
/**
 * Work out the group count the caller has to handle for this convolution.
 *
 * Built against cuDNN < 7: returns `groups` unchanged.
 * Built against cuDNN >= 7: returns 1. When `groups` > 1 the group count
 * stored in `desc` is read back first and must equal `groups`.
 *
 * Returns -1 with a Python exception set when the group count cannot be read
 * (RuntimeError) or does not match `groups` (MemoryError).
 *
 * NOTE(review): MemoryError looks like an odd class for a mismatch; it is
 * kept as is because callers may depend on it -- confirm before changing.
 */
static int c_get_groups_for_conv(cudnnConvolutionDescriptor_t desc, int groups) {
#if CUDNN_MAJOR < 7
  return groups;
#else
  // Nothing to cross-check for an ungrouped convolution.
  if (groups <= 1)
    return 1;

  int configured = 0;
  const cudnnStatus_t status = cudnnGetConvolutionGroupCount(desc, &configured);
  if (status != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "error getting groups for convolution : %s",
                 cudnnGetErrorString(status));
    return -1;
  }
  if (configured != groups) {
    PyErr_SetString(PyExc_MemoryError,
                    "groups specified different from convolution descriptor");
    return -1;
  }
  return 1;
#endif
}
/**
 * Store the math type on a convolution descriptor.
 *
 * When built against cuDNN >= 7 this calls cudnnSetConvolutionMathType();
 * on failure a Python RuntimeError is set and -1 is returned.
 * For older cuDNN versions the function does nothing.
 *
 * Returns 0 on success (or when nothing had to be done), -1 on error.
 */
static int c_set_math_type_for_conv(cudnnConvolutionDescriptor_t desc, cudnnMathType_t mathtype) {
#if CUDNN_MAJOR >= 7
  // CUDNN7: the math type has to be set explicitly on the descriptor.
  const cudnnStatus_t status = cudnnSetConvolutionMathType(desc, mathtype);
  if (status == CUDNN_STATUS_SUCCESS)
    return 0;
  PyErr_Format(PyExc_RuntimeError,
               "error setting math type for convolution : %s",
               cudnnGetErrorString(status));
  return -1;
#else
  return 0;
#endif
}
#section init_code_struct
cudnnStatus_t APPLY_SPECIFIC(err);
......@@ -20,7 +57,7 @@ if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output)))
FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate filter descriptor: %s",
PyErr_Format(PyExc_MemoryError, "could not allocate filter descriptor: %s",
cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
......@@ -33,3 +70,220 @@ if (APPLY_SPECIFIC(output) != NULL)
cudnnDestroyTensorDescriptor(APPLY_SPECIFIC(output));
if (APPLY_SPECIFIC(kerns) != NULL)
cudnnDestroyFilterDescriptor(APPLY_SPECIFIC(kerns));
#section support_code
#include <sstream>
#include <string>
#if __cplusplus < 201103L
#include <tr1/unordered_map>
typedef std::tr1::unordered_map<std::string, AlgoRec> AlgoCache;
#else
#include <unordered_map>
typedef std::unordered_map<std::string, AlgoRec> AlgoCache;
#endif
#include "pthread.h"
#line 87 "dnn_conv_base.c"
/* algoMutex is locked around every access to algoCache made by
   dnn_conv_update_cache() and dnn_conv_check_cache().
   NOTE(review): no initializer is given here (e.g. PTHREAD_MUTEX_INITIALIZER);
   presumably pthread_mutex_init() is called elsewhere -- confirm. */
pthread_mutex_t algoMutex;
/* Maps a string key to the AlgoRec stored for it. The helpers above are
   given keys named `hash`; dnn_conv_shape() builds a textual description
   that presumably serves as that key -- confirm against callers. */
AlgoCache algoCache;
/**
 * Turn a failing cuDNN status into a Python RuntimeError.
 *
 * `msg` is a short description of the failed operation and is included in
 * the exception text together with cuDNN's own error string.
 * The status is returned unchanged so the call can be used inline.
 */
static cudnnStatus_t checkCudnnStatus(cudnnStatus_t err, const char* msg)
{
  if (err == CUDNN_STATUS_SUCCESS)
    return err;

  PyErr_Format(PyExc_RuntimeError, "CUDNN Error: %s: %s",
               msg, cudnnGetErrorString(err));
  return err;
}
/**
 * Query the GA_CTX_PROP_LARGEST_MEMBLOCK property of the GPU context `c`.
 *
 * If the query fails a Python RuntimeError is set, but a size is still
 * returned. Whenever the reported size is 0 a guess of 4Mb is returned
 * instead.
 *
 * NOTE(review): on failure the exception stays set while a usable value is
 * returned; callers presumably check PyErr_Occurred() -- confirm.
 */
static size_t
c_get_largest_free_block_size(PyGpuContextObject *c)
{
  size_t largest = 0;
  const int status = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &largest);
  if (status != GA_NO_ERROR) {
    PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
                 "memory information on the GPU");
  }
  // Guess 4Mb if the info is not available
  return (largest != 0) ? largest : (size_t)(4 * 1024 * 1024);
}
/** Check if convolution output tensor has expected dimensions
    depending on given inputs and number of groups.

    The expected dimensions are obtained from
    cudnnGetConvolutionNdForwardOutputDim(); the expected size of axis 1 is
    then multiplied by `groups` before being compared with `output`.
    Only 4 and 5 dimensional tensors are compared; for a smaller
    `tensorNdim` the dimensions are queried but not compared.

    convDesc, inputDesc, filterDesc: descriptors of the convolution.
    tensorNdim: number of dimensions of the tensors (at most 5).
    output: array whose dimensions are checked.
      NOTE(review): assumed to have at least `tensorNdim` dimensions.
    groups: factor applied to the expected size of axis 1.

    return 0 if everything is ok, non-0 on error (a Python exception is set:
    RuntimeError if cuDNN fails, ValueError on a dimension problem).
**/
static int dnn_check_convolution_output(cudnnConvolutionDescriptor_t convDesc,
                                        cudnnTensorDescriptor_t inputDesc,
                                        cudnnFilterDescriptor_t filterDesc,
                                        size_t tensorNdim,
                                        PyGpuArrayObject* output,
                                        int groups) {
  int expected_output_dims[5] = {0};

  // cuDNN writes `tensorNdim` entries into the buffer above, so a larger
  // value would overflow it.
  if (tensorNdim > 5) {
    PyErr_Format(PyExc_ValueError,
                 "convolution output check supports at most 5 dimensions, got %lu",
                 static_cast<unsigned long>(tensorNdim));
    return 1;
  }

  cudnnStatus_t err = cudnnGetConvolutionNdForwardOutputDim(convDesc, inputDesc, filterDesc,
                                                            static_cast<int>(tensorNdim),
                                                            expected_output_dims);
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
                 cudnnGetErrorString(err));
    return 1;
  }

  if (tensorNdim != 4 && tensorNdim != 5)
    return 0;

  // Scale the expected size of axis 1 instead of dividing the actual one:
  // an integer division would truncate and let a size that is not a
  // multiple of `groups` pass the check (and would trap on groups == 0).
  expected_output_dims[1] *= groups;

  bool mismatch = false;
  for (size_t i = 0; i < tensorNdim; ++i) {
    if (PyGpuArray_DIMS(output)[i] != static_cast<size_t>(expected_output_dims[i]))
      mismatch = true;
  }
  if (!mismatch)
    return 0;

  // The actual sizes are cast to long so that they match the %ld conversions.
  if (tensorNdim == 4) {
    PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %dx%dx%dx%d"
                                   " but received %ldx%ldx%ldx%ld",
                 expected_output_dims[0], expected_output_dims[1],
                 expected_output_dims[2], expected_output_dims[3],
                 static_cast<long>(PyGpuArray_DIMS(output)[0]),
                 static_cast<long>(PyGpuArray_DIMS(output)[1]),
                 static_cast<long>(PyGpuArray_DIMS(output)[2]),
                 static_cast<long>(PyGpuArray_DIMS(output)[3]));
  } else {
    PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %dx%dx%dx%dx%d"
                                   " but received %ldx%ldx%ldx%ldx%ld",
                 expected_output_dims[0], expected_output_dims[1],
                 expected_output_dims[2], expected_output_dims[3],
                 expected_output_dims[4],
                 static_cast<long>(PyGpuArray_DIMS(output)[0]),
                 static_cast<long>(PyGpuArray_DIMS(output)[1]),
                 static_cast<long>(PyGpuArray_DIMS(output)[2]),
                 static_cast<long>(PyGpuArray_DIMS(output)[3]),
                 static_cast<long>(PyGpuArray_DIMS(output)[4]));
  }
  return 1;
}
/**
 * Format the first `size` integers of `res` as a comma separated list,
 * e.g. {1, 2, 3} gives "1,2,3". A `size` of 0 or less gives an empty string.
 */
static std::string shape(int* res, int size)
{
  std::ostringstream joined;
  for (int idx = 0; idx < size; ++idx) {
    // The separator goes in front of every element but the first.
    if (idx != 0)
      joined << ',';
    joined << res[idx];
  }
  return joined.str();
}
/**
 * Describe a tensor descriptor as text: the comma separated dimensions
 * followed by a comma and the comma separated strides.
 *
 * Returns an empty string when a Python error is pending after the
 * descriptor has been queried (a failing query sets a RuntimeError).
 */
static std::string shape(cudnnTensorDescriptor_t t)
{
  // cuDNN can handle up to CUDNN_DIM_MAX dimensions.
  int dims[CUDNN_DIM_MAX];
  int strides[CUDNN_DIM_MAX];
  int ndim;
  cudnnDataType_t dtype;

  const cudnnStatus_t status =
      cudnnGetTensorNdDescriptor(t, CUDNN_DIM_MAX, &dtype, &ndim, dims, strides);
  checkCudnnStatus(status, "error getting tensor description");
  if (PyErr_Occurred())
    return "";

  std::string text = shape(dims, ndim);
  text += ",";
  text += shape(strides, ndim);
  return text;
}
/**
 * Describe a filter descriptor as text: its comma separated dimensions.
 * The data type of the filter is written to `*type`.
 *
 * Returns an empty string when a Python error is pending after the
 * descriptor has been queried (a failing query sets a RuntimeError).
 */
static std::string shape(cudnnFilterDescriptor_t t, cudnnDataType_t* type)
{
  int dims[CUDNN_DIM_MAX];
  int ndim;
  cudnnTensorFormat_t layout;

  const cudnnStatus_t status =
      cudnnGetFilterNdDescriptor(t, CUDNN_DIM_MAX, type, &layout, &ndim, dims);
  checkCudnnStatus(status, "error getting filter description");
  if (PyErr_Occurred())
    return "";

  return shape(dims, ndim);
}
/**
 * Describe a convolution descriptor as text, in the form
 * "-mode <conv|cross> -pad <p,...> -subsample <s,...> -dilation <d,...>".
 * At most 5 entries are requested for each of the three lists.
 *
 * Returns an empty string when a Python error is pending after the
 * descriptor has been queried (a failing query sets a RuntimeError).
 */
static std::string shape(cudnnConvolutionDescriptor_t convDesc)
{
  int padding[5];
  int subsample[5];
  int dilation[5];
  int ndim;
  cudnnConvolutionMode_t convMode;
  cudnnDataType_t precision;

  const cudnnStatus_t status =
      cudnnGetConvolutionNdDescriptor(convDesc, 5, &ndim,
                                      &padding[0], &subsample[0], &dilation[0],
                                      &convMode, &precision);
  checkCudnnStatus(status, "error getting convolution description");
  if (PyErr_Occurred())
    return "";

  std::string text("-mode ");
  text += (convMode == CUDNN_CONVOLUTION) ? "conv" : "cross";
  text += " -pad ";
  text += shape(padding, ndim);
  text += " -subsample ";
  text += shape(subsample, ndim);
  text += " -dilation ";
  text += shape(dilation, ndim);
  return text;
}
/**
 * Tell whether the three addresses are all aligned for the given data type.
 *
 * The low 7 bits of every address must be zero for CUDNN_DATA_HALF (a
 * multiple of 128), the low 8 bits for any other type (a multiple of 256).
 */
static bool all_aligned(cudnnDataType_t type, void* in, void* out, void* filter)
{
  const size_t lowBits = (type == CUDNN_DATA_HALF) ? 0x7F : 0xFF;
  // OR-ing the addresses keeps any low bit that is set in at least one of
  // them, so a single test covers all three pointers.
  const size_t merged = (size_t)in | (size_t)out | (size_t)filter;
  return (merged & lowBits) == 0;
}
/**
 * Build a textual description of a convolution:
 * "-g <groups> -dim <input dims,strides> -filt <filter dims> <conv desc>",
 * with " [unaligned]" appended when the input, output and filter device
 * pointers are not all aligned (see all_aligned()).
 *
 * The output dimensions are validated first with
 * dnn_check_convolution_output(), using the number of dimensions of
 * `filter`.
 *
 * Returns an empty string if that validation fails or if any of the
 * descriptors cannot be described; in the cases visible here a Python
 * exception has been set by the failing helper.
 */
static std::string dnn_conv_shape(cudnnTensorDescriptor_t inputDesc, PyGpuArrayObject* input,
                                  cudnnFilterDescriptor_t filterDesc, PyGpuArrayObject* filter,
                                  cudnnConvolutionDescriptor_t convDesc,
                                  PyGpuArrayObject* output, int groups)
{
  // Initialised so that it never holds an indeterminate value, even though
  // it is only read after the filter descriptor was queried successfully.
  cudnnDataType_t dType = CUDNN_DATA_FLOAT;
  std::ostringstream s;

  if (dnn_check_convolution_output(convDesc, inputDesc, filterDesc, PyGpuArray_NDIM(filter), output, groups) != 0)
    return "";

  std::string shapeInput = shape(inputDesc);
  std::string shapeFilter = shape(filterDesc, &dType);
  std::string shapeConvDesc = shape(convDesc);
  if (shapeInput.empty() || shapeFilter.empty() || shapeConvDesc.empty())
    return "";

  s << "-g " << groups << " -dim " << shapeInput << " -filt " <<
    shapeFilter << " " << shapeConvDesc;

  // there have to be entries for both aligned and not.
  if (!all_aligned(dType, PyGpuArray_DEV_DATA(input), PyGpuArray_DEV_DATA(output), PyGpuArray_DEV_DATA(filter)))
  {
    s << " [unaligned]";
  }
  return s.str();
}
/**
 * Store `rec` in algoCache under the key `hash`, replacing any record
 * already stored for that key. algoMutex is held during the update.
 */
static void dnn_conv_update_cache(const std::string& hash, const AlgoRec& rec)
{
  // Scoped lock: the map insertion may allocate and therefore throw; the
  // destructor makes sure algoMutex is released in that case too.
  struct ScopedAlgoLock {
    ScopedAlgoLock() { pthread_mutex_lock(&algoMutex); }
    ~ScopedAlgoLock() { pthread_mutex_unlock(&algoMutex); }
  } guard;

  algoCache[hash] = rec;
}
/**
 * Look up `hash` in algoCache while holding algoMutex.
 *
 * Returns a pointer to the record stored in the map, or 0 when there is no
 * entry for that key.
 *
 * NOTE(review): the pointer is handed out after the mutex is released, so a
 * concurrent dnn_conv_update_cache() for the same key could overwrite the
 * record while the caller reads it -- confirm whether callers can race.
 */
static const AlgoRec* dnn_conv_check_cache(const std::string& hash)
{
  const AlgoRec* cached = 0;

  pthread_mutex_lock(&algoMutex);
  AlgoCache::iterator entry = algoCache.find(hash);
  if (entry != algoCache.end()) {
    cached = &entry->second;
  }
  pthread_mutex_unlock(&algoMutex);

  return cached;
}
差异被折叠。
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论