提交 c7e02f24 authored 作者: Boris Fomitchev's avatar Boris Fomitchev 提交者: notoraptor

CUDNN7 grouped convolutions

上级 bacc5f6f
...@@ -3,6 +3,23 @@ cudnnTensorDescriptor_t APPLY_SPECIFIC(input); ...@@ -3,6 +3,23 @@ cudnnTensorDescriptor_t APPLY_SPECIFIC(input);
cudnnTensorDescriptor_t APPLY_SPECIFIC(output); cudnnTensorDescriptor_t APPLY_SPECIFIC(output);
cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns); cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);
// Configure the group count for a grouped convolution.
//
// cuDNN 7+ supports grouped convolution natively: the group count is
// recorded on `desc` via cudnnSetConvolutionGroupCount() and the caller
// must then perform a SINGLE convolution call — hence the return value 1.
// On failure, a Python RuntimeError is set and -1 is returned.
//
// Before cuDNN 7 there is no native group support, so `groups` is returned
// unchanged and the caller is expected to loop over the groups itself,
// offsetting the data pointers per group.
//
// Returns: the number of groups the caller must iterate over (1 for
// cuDNN >= 7, `groups` otherwise), or -1 on error (Python exception set).
static int c_set_groups_for_conv(cudnnConvolutionDescriptor_t desc, int groups) {
#if CUDNN_MAJOR >= 7
// Only call into cuDNN when grouping is actually requested.
if (groups > 1) {
cudnnStatus_t err = cudnnSetConvolutionGroupCount(desc, groups);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error setting groups for convolution : %s",
cudnnGetErrorString(err));
return -1;
}
}
return 1;
#else
return groups;
#endif
}
#section init_code_struct #section init_code_struct
cudnnStatus_t APPLY_SPECIFIC(err); cudnnStatus_t APPLY_SPECIFIC(err);
......
...@@ -13,7 +13,7 @@ typedef std::unordered_map<std::string, AlgoRec> AlgoCache; ...@@ -13,7 +13,7 @@ typedef std::unordered_map<std::string, AlgoRec> AlgoCache;
#endif #endif
#include "pthread.h" #include "pthread.h"
#line 10 "dnn_conv_find.cc" #line 10 "dnn_conv_find.c"
using std::vector; using std::vector;
using std::string; using std::string;
...@@ -96,26 +96,64 @@ static bool all_aligned(cudnnDataType_t type, void* in, void* out, void* filter) ...@@ -96,26 +96,64 @@ static bool all_aligned(cudnnDataType_t type, void* in, void* out, void* filter)
return true; return true;
} }
std::string dnn_conv_shape(cudnnTensorDescriptor_t input, void* in, static std::string dnn_conv_shape(cudnnTensorDescriptor_t inputDesc, PyGpuArrayObject* input,
cudnnFilterDescriptor_t filterDesc, void* filter, cudnnFilterDescriptor_t filterDesc, PyGpuArrayObject* filter,
cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionDescriptor_t convDesc,
void* out) PyGpuArrayObject* output, int groups)
{ {
cudnnDataType_t dType; cudnnDataType_t dType;
std::stringstream s; std::stringstream s;
int expected_output_dims[5] = {0};
cudnnStatus_t err = cudnnGetConvolutionNdForwardOutputDim(convDesc, inputDesc, filterDesc,
PyGpuArray_NDIM(filter), expected_output_dims);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
cudnnGetErrorString(err));
return "";
}
if (PyGpuArray_NDIM(filter) == 4) {
if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
(PyGpuArray_DIMS(output)[1] / groups != expected_output_dims[1]) ||
(PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
(PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
" but received gradient with shape %dx%dx% dx%d",
expected_output_dims[0], expected_output_dims[1] / groups,
expected_output_dims[2], expected_output_dims[3],
PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3]);
return "";
}
} else if (PyGpuArray_NDIM(filter) == 5) {
if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
(PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
(PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
(PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
(PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
" but received gradient with shape %ldx%ldx%ldx%ldx%ld",
expected_output_dims[0], expected_output_dims[1],
expected_output_dims[2], expected_output_dims[3],
expected_output_dims[4],
PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3],
PyGpuArray_DIMS(output)[4]);
return "";
}
}
s << " -dimA" << shape(input) << " -filtA" << shape(filterDesc, &dType) << shape(convDesc); s << "-g" << groups << " -dimA" << shape(inputDesc) << " -filtA" <<
shape(filterDesc, &dType) << shape(convDesc);
// there have to be entries for both aligned and not // there have to be entries for both aligned and not
if (!all_aligned(dType, in, out, filter)) if (!all_aligned(dType, PyGpuArray_DEV_DATA(input), PyGpuArray_DEV_DATA(output), PyGpuArray_DEV_DATA(filter)))
{ {
s << " [unaligned] "; s << " [unaligned] ";
} }
return std::string(s.str().c_str()); return std::string(s.str().c_str());
} }
void dnn_conv_update_cache(const std::string& hash, const AlgoRec& rec) static void dnn_conv_update_cache(const std::string& hash, const AlgoRec& rec)
{ {
pthread_mutex_lock(&algoMutex); pthread_mutex_lock(&algoMutex);
algoCache[hash] = rec; algoCache[hash] = rec;
...@@ -123,7 +161,7 @@ void dnn_conv_update_cache(const std::string& hash, const AlgoRec& rec) ...@@ -123,7 +161,7 @@ void dnn_conv_update_cache(const std::string& hash, const AlgoRec& rec)
} }
const AlgoRec* dnn_conv_check_cache(const std::string& hash) static const AlgoRec* dnn_conv_check_cache(const std::string& hash)
{ {
pthread_mutex_lock(&algoMutex); pthread_mutex_lock(&algoMutex);
bool cacheHit = false; bool cacheHit = false;
......
#section support_code
#include <cuda.h>
#include <mutex>
#include <sstream>
#include <vector>
#include <string>
#include <unordered_map>
#include "dnn_conv_find.h"
#line 10 "dnn_conv_find.cc"
using std::vector;
using std::string;
using std::unique_lock;
using std::mutex;
typedef std::unordered_map<string, AlgoRec> AlgoCache;
mutex algoMutex;
AlgoCache algoCache;
// Render `size` ints from `res` as a comma-separated list ("a,b,c").
// An empty or non-positive size yields the empty string.
static std::string shape(int* res, int size)
{
    std::string joined;
    for (int i = 0; i < size; ++i) {
        if (i > 0)
            joined += ',';
        joined += std::to_string(res[i]);
    }
    return joined;
}
// Serialize a cuDNN tensor descriptor (dims followed by strides) into a
// string fragment used as part of an algorithm-cache hash key.
static std::string shape(cudnnTensorDescriptor_t t)
{
    int ndim = 0;
    cudnnDataType_t dtype;
    // First query with requested size 0: only retrieves the rank.
    checkCudnnStatus(cudnnGetTensorNdDescriptor(t, 0, &dtype, &ndim, nullptr, nullptr));
    std::vector<int> dims(ndim);
    std::vector<int> strides(ndim);
    // Second query: now that the rank is known, fetch dims and strides.
    checkCudnnStatus(cudnnGetTensorNdDescriptor(t, ndim, &dtype, &ndim, dims.data(), strides.data()));
    return shape(dims.data(), ndim) + shape(strides.data(), ndim);
}
// Serialize a cuDNN filter descriptor's dimensions into a string fragment,
// and report its data type through `type` (out-parameter).
static std::string shape(cudnnFilterDescriptor_t t, cudnnDataType_t* type)
{
    const int maxRequested = 8;  // upper bound passed to the query
    cudnnTensorFormat_t layout;
    int ndim = 0;
    std::vector<int> dims(maxRequested);
    checkCudnnStatus(cudnnGetFilterNdDescriptor(t, maxRequested, type, &layout, &ndim, dims.data()));
    return shape(dims.data(), ndim);
}
// Serialize a cuDNN convolution descriptor (mode, padding, strides,
// dilation) into a string fragment used as part of a cache hash key.
static std::string shape(cudnnConvolutionDescriptor_t convDesc)
{
    const int kMaxDim = 5;
    int ndim = 0;
    cudnnConvolutionMode_t mode;
    cudnnDataType_t computeType;
    int pad[kMaxDim];
    int convStride[kMaxDim];
    int dilation[kMaxDim];
    checkCudnnStatus(
        cudnnGetConvolutionNdDescriptor(convDesc, kMaxDim,
                                        &ndim,
                                        pad,
                                        convStride,
                                        dilation,
                                        &mode,
                                        &computeType));
    std::string out("-mode ");
    out += ((int)mode == 0) ? "conv" : "corr";
    out += " -padA" + shape(pad, ndim);
    out += " -convStrideA " + shape(convStride, ndim);
    out += " -dilationA " + shape(dilation, ndim);
    return out;
}
// Check that all three data pointers satisfy the alignment cuDNN algorithms
// are selected under: mask 0x7F (128-byte) for half precision, 0xFF
// (256-byte) otherwise.  Aligned and unaligned cases get distinct cache
// entries, so this feeds into the hash key.
static bool all_aligned(cudnnDataType_t type, void* in, void* out, void* filter)
{
    const size_t mask = (type == CUDNN_DATA_HALF) ? 0x7F : 0xFF;
    const size_t combined = (size_t)in | (size_t)out | (size_t)filter;
    return (combined & mask) == 0;
}
// Build the shape/configuration portion of an algorithm-cache hash key for a
// convolution: input dims/strides, filter dims, convolution parameters, plus
// an "[unaligned]" marker when any data pointer is not suitably aligned
// (aligned and unaligned configurations must be cached separately).
std::string dnn_conv_shape(cudnnTensorDescriptor_t input, void* in,
                           cudnnFilterDescriptor_t filterDesc, void* filter,
                           cudnnConvolutionDescriptor_t convDesc,
                           void* out)
{
    cudnnDataType_t dType;
    std::string key = " -dimA";
    key += shape(input);
    key += " -filtA";
    key += shape(filterDesc, &dType);  // also yields the filter's data type
    key += shape(convDesc);
    if (!all_aligned(dType, in, out, filter))
        key += " [unaligned] ";
    return key;
}
// Insert or overwrite the cached algorithm record for `hash`.
// The global cache is guarded against concurrent access by algoMutex.
void dnn_conv_update_cache(const std::string& hash, const AlgoRec& rec)
{
    std::lock_guard<std::mutex> guard(algoMutex);
    algoCache[hash] = rec;
}
// Look up a previously recorded algorithm selection by its hash key.
//
// Returns a pointer to the cached record, or nullptr when the key is
// absent.  The pointer stays valid across later insertions because
// std::unordered_map never invalidates references to existing elements.
// NOTE(review): the pointer is returned after the lock is released; this is
// safe only as long as entries are never erased from algoCache — confirm
// (no erase is visible in this file).
const AlgoRec* dnn_conv_check_cache(const std::string& hash)
{
  unique_lock<mutex> lock(algoMutex);
  AlgoCache::iterator hit = algoCache.find(hash);
  if (hit != algoCache.end())
    return &hit->second;
  return nullptr;
}
...@@ -7,12 +7,13 @@ ...@@ -7,12 +7,13 @@
enum cudnnMathType_t { CUDNN_DEFAULT_MATH=0, CUDNN_TENSOR_OP_MATH = 1 }; enum cudnnMathType_t { CUDNN_DEFAULT_MATH=0, CUDNN_TENSOR_OP_MATH = 1 };
#endif #endif
inline void checkCudnnStatus(cudnnStatus_t err) inline cudnnStatus_t checkCudnnStatus(cudnnStatus_t err)
{ {
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "CUDNN Error: %s", PyErr_Format(PyExc_RuntimeError, "CUDNN Error: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
} }
return err;
} }
...@@ -24,13 +25,5 @@ struct AlgoRec { ...@@ -24,13 +25,5 @@ struct AlgoRec {
cudnnMathType_t mathType; cudnnMathType_t mathType;
}; };
const AlgoRec* dnn_conv_check_cache(const std::string&);
std::string dnn_conv_shape(cudnnTensorDescriptor_t input, void* in,
cudnnFilterDescriptor_t filterDesc, void* filter,
cudnnConvolutionDescriptor_t convDesc,
void* out);
void dnn_conv_update_cache(const std::string& hash, const AlgoRec& rec);
...@@ -77,27 +77,31 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -77,27 +77,31 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 0; return 0;
} }
if (c_set_tensor_for_conv(input, APPLY_SPECIFIC(input), params->num_groups) == -1) int groups = c_set_groups_for_conv(desc, params->num_groups);
if (groups == -1)
return 1; return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns), params->num_groups) == -1) if (c_set_tensor_for_conv(input, APPLY_SPECIFIC(input), groups) == -1)
return 1; return 1;
if (c_set_tensor_for_conv(*output, APPLY_SPECIFIC(output), params->num_groups) == -1) if (c_set_filter(kerns, APPLY_SPECIFIC(kerns), groups) == -1)
return 1; return 1;
size_t input_offset = PyGpuArray_STRIDE(input, 0) / params->num_groups; if (c_set_tensor_for_conv(*output, APPLY_SPECIFIC(output), groups) == -1)
size_t kern_offset = PyGpuArray_STRIDE(kerns, 0) * PyGpuArray_DIM(kerns, 0) / params->num_groups; return 1;
size_t output_offset = PyGpuArray_STRIDE(*output, 0) / params->num_groups; size_t input_offset = PyGpuArray_STRIDE(input, 0) / groups;
size_t kern_offset = PyGpuArray_STRIDE(kerns, 0) * PyGpuArray_DIM(kerns, 0) / groups;
size_t output_offset = PyGpuArray_STRIDE(*output, 0) / groups;
cudnnConvolutionFwdAlgo_t algo = params->conv_algo; cudnnConvolutionFwdAlgo_t algo = params->conv_algo;
size_t worksize = 0; size_t worksize = 0;
cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH; cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH;
std::string hashkey = "F| GPU#"; std::string hashkey;
#ifdef DEBUG #ifdef DEBUG
char algorithm_name[128]; char algorithm_name[128];
#endif #endif
cuda_enter(c->ctx); cuda_enter(c->ctx);
if (params->choose_algo) { if (params->choose_algo) {
if (!params->choose_once) { if (!params->choose_once) {
reuse_algo = 1; reuse_algo = 1;
...@@ -109,15 +113,14 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -109,15 +113,14 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
} }
} }
char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
hashkey += pci_id;
hashkey += dnn_conv_shape(APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, PyGpuArray_DEV_DATA(*output));
if (!reuse_algo) { if (!reuse_algo) {
char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), input, APPLY_SPECIFIC(kerns), kerns, desc, *output, groups);
if (hashkey.empty())
return 1;
hashkey = std::string("F| GPU#") + pci_id + hashkey;
// check out cache // check out cache
const AlgoRec* cached = dnn_conv_check_cache(hashkey); const AlgoRec* cached = dnn_conv_check_cache(hashkey);
if (cached) { if (cached) {
...@@ -395,7 +398,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -395,7 +398,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
for ( int g = 0; g < params->num_groups; g++) { for ( int g = 0; g < groups; g++) {
err = cudnnConvolutionForward( err = cudnnConvolutionForward(
params->handle, params->handle,
alpha_p, alpha_p,
......
...@@ -75,15 +75,19 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -75,15 +75,19 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 0; return 0;
} }
if (c_set_tensor_for_conv(output, APPLY_SPECIFIC(output), params->num_groups) == -1)
int groups = c_set_groups_for_conv(desc, params->num_groups);
if (groups == -1)
return 1;
if (c_set_tensor_for_conv(output, APPLY_SPECIFIC(output), groups) == -1)
return 1; return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns), params->num_groups) == -1) if (c_set_filter(kerns, APPLY_SPECIFIC(kerns), groups) == -1)
return 1; return 1;
if (c_set_tensor_for_conv(*input, APPLY_SPECIFIC(input), params->num_groups) == -1) if (c_set_tensor_for_conv(*input, APPLY_SPECIFIC(input), groups) == -1)
return 1; return 1;
size_t input_offset = PyGpuArray_STRIDE(*input, 0) / params->num_groups; size_t input_offset = PyGpuArray_STRIDE(*input, 0) / groups;
size_t kern_offset = PyGpuArray_STRIDE(kerns, 0) * PyGpuArray_DIM(kerns, 0) / params->num_groups; size_t kern_offset = PyGpuArray_STRIDE(kerns, 0) * PyGpuArray_DIM(kerns, 0) / groups;
size_t output_offset = PyGpuArray_STRIDE(output, 0) / params->num_groups; size_t output_offset = PyGpuArray_STRIDE(output, 0) / groups;
cudnnConvolutionBwdDataAlgo_t algo = params->conv_algo; cudnnConvolutionBwdDataAlgo_t algo = params->conv_algo;
#ifdef DEBUG #ifdef DEBUG
...@@ -92,52 +96,6 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -92,52 +96,6 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
size_t worksize = 0; size_t worksize = 0;
cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH; cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH;
cuda_enter(c->ctx);
int expected_output_dims[5] = {0};
err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
PyGpuArray_NDIM(im), expected_output_dims);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
if (PyGpuArray_NDIM(im) == 4) {
if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
(PyGpuArray_DIMS(output)[1] / params->num_groups != expected_output_dims[1]) ||
(PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
(PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
" but received gradient with shape %dx%dx% dx%d",
expected_output_dims[0], expected_output_dims[1],
expected_output_dims[2], expected_output_dims[3],
PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3]);
cuda_exit(c->ctx);
return 1;
}
} else if (PyGpuArray_NDIM(im) == 5) {
if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
(PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
(PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
(PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
(PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld"
" but received gradient with shape %ldx%ldx%ldx%ldx%ld",
expected_output_dims[0], expected_output_dims[1],
expected_output_dims[2], expected_output_dims[3],
expected_output_dims[4],
PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3],
PyGpuArray_DIMS(output)[4]);
cuda_exit(c->ctx);
return 1;
}
}
char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
std::string hashkey; std::string hashkey;
if (params->choose_algo) { if (params->choose_algo) {
...@@ -151,12 +109,16 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -151,12 +109,16 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
} }
} }
if (!reuse_algo) { if (!reuse_algo) {
char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
// check out cache // check out cache
hashkey = std::string("GI | GPU#") + pci_id + hashkey+=dnn_conv_shape(APPLY_SPECIFIC(input), *input, APPLY_SPECIFIC(kerns), kerns, desc, output, groups);
dnn_conv_shape(APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input), if (hashkey.empty())
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), return 1;
desc, PyGpuArray_DEV_DATA(output) hashkey = std::string("GI| GPU#") + pci_id + hashkey;
);
const AlgoRec* cached = dnn_conv_check_cache(hashkey); const AlgoRec* cached = dnn_conv_check_cache(hashkey);
if (cached) { if (cached) {
prev_algo = *cached; prev_algo = *cached;
...@@ -164,6 +126,8 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -164,6 +126,8 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
} }
} }
cuda_enter(c->ctx);
if (!(reuse_algo || use_cached)) { if (!(reuse_algo || use_cached)) {
size_t free; size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free); int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
...@@ -358,7 +322,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -358,7 +322,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
for ( int g = 0; g < params->num_groups; g++) for ( int g = 0; g < groups; g++)
{ {
err = cudnnConvolutionBackwardData( err = cudnnConvolutionBackwardData(
params->handle, params->handle,
......
...@@ -7,7 +7,7 @@ memset(prev_img_dims, 0, sizeof(prev_img_dims)); ...@@ -7,7 +7,7 @@ memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims)); memset(prev_top_dims, 0, sizeof(prev_top_dims));
#section support_code_struct #section support_code_struct
#line 12 "dnn_gw.c" #line 11 "dnn_gw.c"
#include "dnn_conv_find.h" #include "dnn_conv_find.h"
int reuse_algo; int reuse_algo;
bool use_cached; bool use_cached;
...@@ -75,16 +75,19 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -75,16 +75,19 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return 0; return 0;
} }
if (c_set_tensor_for_conv(input, APPLY_SPECIFIC(input), params->num_groups) == -1) int groups = c_set_groups_for_conv(desc, params->num_groups);
if (groups == -1)
return 1; return 1;
if (c_set_tensor_for_conv(output, APPLY_SPECIFIC(output), params->num_groups) == -1) if (c_set_tensor_for_conv(input, APPLY_SPECIFIC(input), groups) == -1)
return 1; return 1;
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns), params->num_groups) == -1) if (c_set_tensor_for_conv(output, APPLY_SPECIFIC(output), groups) == -1)
return 1;
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns), groups) == -1)
return 1; return 1;
size_t input_offset = PyGpuArray_STRIDE(input, 0) / params->num_groups; size_t input_offset = PyGpuArray_STRIDE(input, 0) / groups;
size_t kern_offset = PyGpuArray_STRIDE(*kerns, 0) * PyGpuArray_DIM(*kerns, 0) / params->num_groups; size_t kern_offset = PyGpuArray_STRIDE(*kerns, 0) * PyGpuArray_DIM(*kerns, 0) / groups;
size_t output_offset = PyGpuArray_STRIDE(output, 0) / params->num_groups; size_t output_offset = PyGpuArray_STRIDE(output, 0) / groups;
cudnnConvolutionBwdFilterAlgo_t algo = params->conv_algo; cudnnConvolutionBwdFilterAlgo_t algo = params->conv_algo;
#ifdef DEBUG #ifdef DEBUG
...@@ -93,52 +96,9 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -93,52 +96,9 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
size_t worksize = 0; size_t worksize = 0;
cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH; cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH;
std::string hashkey = "GW | GPU#"; std::string hashkey ;
cuda_enter(c->ctx); cuda_enter(c->ctx);
int expected_output_dims[5] = {0};
err = cudnnGetConvolutionNdForwardOutputDim(desc, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
PyGpuArray_NDIM(input), expected_output_dims);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error computing convolution output dim: %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
if (PyGpuArray_NDIM(input) == 4) {
if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
(PyGpuArray_DIMS(output)[1] / params->num_groups != expected_output_dims[1]) ||
(PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
(PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %dx%dx%dx%d"
" but received gradient with shape %ldx%ldx%ldx%ld",
expected_output_dims[0], expected_output_dims[1],
expected_output_dims[2], expected_output_dims[3],
PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3]);
cuda_exit(c->ctx);
return 1;
}
} else if (PyGpuArray_NDIM(input) == 5) {
if ((PyGpuArray_DIMS(output)[0] != expected_output_dims[0]) ||
(PyGpuArray_DIMS(output)[1] != expected_output_dims[1]) ||
(PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
(PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
(PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %dx%dx%dx%dx%d"
" but received gradient with shape %ldx%ldx%ldx%ldx%ld",
expected_output_dims[0], expected_output_dims[1],
expected_output_dims[2], expected_output_dims[3],
expected_output_dims[4],
PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
PyGpuArray_DIMS(output)[2], PyGpuArray_DIMS(output)[3],
PyGpuArray_DIMS(output)[4]);
cuda_exit(c->ctx);
return 1;
}
}
if (params->choose_algo) { if (params->choose_algo) {
if (!params->choose_once) { if (!params->choose_once) {
reuse_algo = 1; reuse_algo = 1;
...@@ -150,16 +110,13 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -150,16 +110,13 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
} }
} }
if (!reuse_algo) {
char pci_id[16]; char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id); gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
hashkey = pci_id; hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), input, APPLY_SPECIFIC(kerns), *kerns, desc, output, groups);
if (hashkey.empty())
hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), return 1;
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns), hashkey = std::string("GW| GPU#") + pci_id + hashkey;
desc, PyGpuArray_DEV_DATA(output)
);
if (!reuse_algo) {
// check out cache // check out cache
const AlgoRec* cached = dnn_conv_check_cache(hashkey); const AlgoRec* cached = dnn_conv_check_cache(hashkey);
if (cached) { if (cached) {
...@@ -333,7 +290,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -333,7 +290,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
gpudata *workspace = 0; gpudata *workspace = 0;
#if CUDNN_MAJOR >= 7 #if CUDNN_MAJOR >= 7
// CUDNN7: need to set math type // CUDNN7: need to set math type
err = cudnnSetConvolutionMathType(desc, prev_algo.mathType); err = cudnnSetConvolutionMathType(desc, mathtype);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error setting math type for convolution : %s", "error setting math type for convolution : %s",
...@@ -355,7 +312,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -355,7 +312,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
for ( int g = 0; g < params->num_groups; g++) for ( int g = 0; g < groups; g++)
{ {
err = cudnnConvolutionBackwardFilter( err = cudnnConvolutionBackwardFilter(
......
...@@ -399,7 +399,7 @@ class DnnBase(COp): ...@@ -399,7 +399,7 @@ class DnnBase(COp):
return [] return []
def c_code_cache_version(self): def c_code_cache_version(self):
return (super(DnnBase, self).c_code_cache_version(), version(), 2) return (super(DnnBase, self).c_code_cache_version(), version(), 3)
class GpuDnnConvDesc(COp): class GpuDnnConvDesc(COp):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论