提交 fa5590e6 authored 作者: notoraptor's avatar notoraptor

Make code safer and simpler.

上级 d3cb3ad4
...@@ -59,7 +59,6 @@ if (APPLY_SPECIFIC(kerns) != NULL) ...@@ -59,7 +59,6 @@ if (APPLY_SPECIFIC(kerns) != NULL)
#section support_code #section support_code
#include <sstream> #include <sstream>
#include <vector>
#include <string> #include <string>
#if __cplusplus < 201103L #if __cplusplus < 201103L
#include <tr1/unordered_map> #include <tr1/unordered_map>
...@@ -70,20 +69,17 @@ typedef std::unordered_map<std::string, AlgoRec> AlgoCache; ...@@ -70,20 +69,17 @@ typedef std::unordered_map<std::string, AlgoRec> AlgoCache;
#endif #endif
#include "pthread.h" #include "pthread.h"
#line 69 "dnn_conv_base.c" #line 73 "dnn_conv_base.c"
using std::vector;
using std::string;
pthread_mutex_t algoMutex; pthread_mutex_t algoMutex;
AlgoCache algoCache; AlgoCache algoCache;
// Report a cuDNN failure as a Python RuntimeError, prefixing the message
// with caller-supplied context. The status is returned unchanged so the
// call can be chained around any cuDNN API invocation.
static cudnnStatus_t checkCudnnStatus(cudnnStatus_t err, const char* msg)
{
  if (CUDNN_STATUS_SUCCESS != err)
    PyErr_Format(PyExc_RuntimeError, "CUDNN Error: %s: %s",
                 msg, cudnnGetErrorString(err));
  return err;
}
...@@ -105,64 +101,69 @@ c_get_largest_free_block_size(PyGpuContextObject *c) ...@@ -105,64 +101,69 @@ c_get_largest_free_block_size(PyGpuContextObject *c)
// Render the first `size` entries of `res` as a comma-separated list,
// e.g. {2,3,4} -> "2,3,4". An empty range yields the empty string.
static std::string shape(int* res, int size)
{
  std::ostringstream out;
  for (int i = 0; i < size; ++i) {
    if (i)
      out << ',';
    out << res[i];
  }
  return out.str();
}
static std::string shape(cudnnTensorDescriptor_t t) static std::string shape(cudnnTensorDescriptor_t t)
{ {
std::vector<int> res; // cuDNN can handle up to CUDNN_DIM_MAX dimensions.
std::vector<int> stride; int res[CUDNN_DIM_MAX];
int stride[CUDNN_DIM_MAX];
int nbDims; int nbDims;
cudnnDataType_t type; cudnnDataType_t type;
checkCudnnStatus(cudnnGetTensorNdDescriptor(t, 0, &type, &nbDims,0,0)); checkCudnnStatus(cudnnGetTensorNdDescriptor(t, CUDNN_DIM_MAX, &type, &nbDims, res, stride),
res.resize(nbDims); "error getting tensor description");
stride.resize(nbDims); if (PyErr_Occurred()) return "";
checkCudnnStatus(cudnnGetTensorNdDescriptor(t, nbDims, &type, &nbDims, res.data(), stride.data())); return shape(res, nbDims) + "," + shape(stride, nbDims);
return shape(&res[0], nbDims) + shape(&stride[0], nbDims);
}; };
static std::string shape(cudnnFilterDescriptor_t t, cudnnDataType_t* type) static std::string shape(cudnnFilterDescriptor_t t, cudnnDataType_t* type)
{ {
cudnnTensorFormat_t format; cudnnTensorFormat_t format;
int sizes = 8; int res[CUDNN_DIM_MAX];
std::vector<int> res(sizes);
int outDims; int outDims;
checkCudnnStatus(cudnnGetFilterNdDescriptor(t, sizes, type, &format, &outDims, res.data())); checkCudnnStatus(cudnnGetFilterNdDescriptor(t, CUDNN_DIM_MAX, type, &format, &outDims, res),
return shape(&res[0], outDims); "error getting filter description");
if (PyErr_Occurred()) return "";
return shape(res, outDims);
}; };
static std::string shape(cudnnConvolutionDescriptor_t convDesc) static std::string shape(cudnnConvolutionDescriptor_t convDesc)
{ {
const int maxDim = 5; int nDim;
int nDim=0;
cudnnConvolutionMode_t mode; cudnnConvolutionMode_t mode;
cudnnDataType_t computeType; cudnnDataType_t computeType;
int padA[maxDim]; int padA[5];
int strideA[maxDim]; int strideA[5];
int dilationA[maxDim]; int dilationA[5];
checkCudnnStatus( checkCudnnStatus(
cudnnGetConvolutionNdDescriptor( convDesc, maxDim, cudnnGetConvolutionNdDescriptor( convDesc, 5,
&nDim, &nDim,
&padA[0], &padA[0],
&strideA[0], &strideA[0],
&dilationA[0], &dilationA[0],
&mode, &mode,
&computeType )); &computeType ),
"error getting convolution description");
if (PyErr_Occurred()) return "";
return std::string("-mode ") + (((int)mode==0) ? "conv" : "corr") + " -padA" + shape(padA,nDim) + " -convStrideA " + shape(strideA, nDim) + " -dilationA " + shape(dilationA, nDim); return (std::string("-mode ") +
((mode == CUDNN_CONVOLUTION) ? "conv" : "cross") +
" -pad " +
shape(padA, nDim) +
" -subsample " +
shape(strideA, nDim) +
" -dilation " +
shape(dilationA, nDim));
} }
static bool all_aligned(cudnnDataType_t type, void* in, void* out, void* filter) static bool all_aligned(cudnnDataType_t type, void* in, void* out, void* filter)
...@@ -182,7 +183,7 @@ static std::string dnn_conv_shape(cudnnTensorDescriptor_t inputDesc, PyGpuArrayO ...@@ -182,7 +183,7 @@ static std::string dnn_conv_shape(cudnnTensorDescriptor_t inputDesc, PyGpuArrayO
PyGpuArrayObject* output, int groups) PyGpuArrayObject* output, int groups)
{ {
cudnnDataType_t dType; cudnnDataType_t dType;
std::stringstream s; std::ostringstream s;
int expected_output_dims[5] = {0}; int expected_output_dims[5] = {0};
cudnnStatus_t err = cudnnGetConvolutionNdForwardOutputDim(convDesc, inputDesc, filterDesc, cudnnStatus_t err = cudnnGetConvolutionNdForwardOutputDim(convDesc, inputDesc, filterDesc,
PyGpuArray_NDIM(filter), expected_output_dims); PyGpuArray_NDIM(filter), expected_output_dims);
...@@ -221,16 +222,20 @@ static std::string dnn_conv_shape(cudnnTensorDescriptor_t inputDesc, PyGpuArrayO ...@@ -221,16 +222,20 @@ static std::string dnn_conv_shape(cudnnTensorDescriptor_t inputDesc, PyGpuArrayO
return ""; return "";
} }
} }
std::string shapeInput = shape(inputDesc);
s << "-g" << groups << " -dimA" << shape(inputDesc) << " -filtA" << std::string shapeFilter = shape(filterDesc, &dType);
shape(filterDesc, &dType) << shape(convDesc); std::string shapeConvDesc = shape(convDesc);
if (shapeInput.empty() || shapeFilter.empty() || shapeConvDesc.empty())
return "";
s << "-g " << groups << " -dim " << shapeInput << " -filt " <<
shapeFilter << " " << shapeConvDesc;
// there have to be entries for both aligned and not // there have to be entries for both aligned and not
if (!all_aligned(dType, PyGpuArray_DEV_DATA(input), PyGpuArray_DEV_DATA(output), PyGpuArray_DEV_DATA(filter))) if (!all_aligned(dType, PyGpuArray_DEV_DATA(input), PyGpuArray_DEV_DATA(output), PyGpuArray_DEV_DATA(filter)))
{ {
s << " [unaligned] "; s << " [unaligned]";
} }
return std::string(s.str().c_str()); return s.str();
} }
static void dnn_conv_update_cache(const std::string& hash, const AlgoRec& rec) static void dnn_conv_update_cache(const std::string& hash, const AlgoRec& rec)
...@@ -240,15 +245,11 @@ static void dnn_conv_update_cache(const std::string& hash, const AlgoRec& rec) ...@@ -240,15 +245,11 @@ static void dnn_conv_update_cache(const std::string& hash, const AlgoRec& rec)
pthread_mutex_unlock(&algoMutex); pthread_mutex_unlock(&algoMutex);
} }
static const AlgoRec* dnn_conv_check_cache(const std::string& hash) static const AlgoRec* dnn_conv_check_cache(const std::string& hash)
{ {
pthread_mutex_lock(&algoMutex); pthread_mutex_lock(&algoMutex);
bool cacheHit = false;
const AlgoRec* ret = 0; const AlgoRec* ret = 0;
// cout << "dnn_conv_check_cache: "<< hash << endl;
AlgoCache::iterator hit = algoCache.find(hash); AlgoCache::iterator hit = algoCache.find(hash);
if (hit != algoCache.end()) if (hit != algoCache.end())
...@@ -257,4 +258,3 @@ static const AlgoRec* dnn_conv_check_cache(const std::string& hash) ...@@ -257,4 +258,3 @@ static const AlgoRec* dnn_conv_check_cache(const std::string& hash)
pthread_mutex_unlock(&algoMutex); pthread_mutex_unlock(&algoMutex);
return ret; return ret;
} }
#section init_code_struct #section init_code_struct
reuse_algo = 0; reuse_algo = 0;
use_cached = 0;
prev_algo.algo = PARAMS->conv_algo; prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH; prev_algo.mathType = CUDNN_DEFAULT_MATH;
prev_algo.dataType = CUDNN_DATA_FLOAT; prev_algo.dataType = CUDNN_DATA_FLOAT;
hash_prefix = std::string("FW| GPU#"); hash_prefix = std::string("FWD|GPU#");
#section support_code_struct #section support_code_struct
#line 12 "dnn_fwd.c" #line 11 "dnn_fwd.c"
int reuse_algo; int reuse_algo;
bool use_cached; bool use_cached;
AlgoRec prev_algo; AlgoRec prev_algo;
...@@ -72,7 +73,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -72,7 +73,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
} }
return 0; return 0;
} }
int groups = c_check_groups_for_conv(desc, params->num_groups); int groups = c_check_groups_for_conv(desc, params->num_groups);
if (groups == -1) if (groups == -1)
return 1; return 1;
...@@ -87,28 +88,29 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -87,28 +88,29 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
size_t output_offset = PyGpuArray_STRIDE(*output, 0) / groups; size_t output_offset = PyGpuArray_STRIDE(*output, 0) / groups;
cudnnConvolutionFwdAlgo_t algo = params->conv_algo; cudnnConvolutionFwdAlgo_t algo = params->conv_algo;
size_t worksize = 0; size_t worksize = 0;
cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH; cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH;
std::string hashkey; std::string hashkey;
#ifdef DEBUG #ifdef DEBUG
char algorithm_name[128]; char algorithm_name[128];
#endif #endif
size_t free = c_get_largest_free_block_size(c); size_t free = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) return 1;
cuda_enter(c->ctx); cuda_enter(c->ctx);
if (params->choose_algo) { if (params->choose_algo) {
if (!reuse_algo) { if (!reuse_algo) {
char pci_id[16]; char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id); gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), input, APPLY_SPECIFIC(kerns), kerns, desc, *output, groups); hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), input, APPLY_SPECIFIC(kerns), kerns, desc, *output, groups);
if (hashkey.empty()) if (hashkey.empty())
return 1; return 1;
hashkey = hash_prefix + pci_id + hashkey; hashkey = hash_prefix + pci_id + " " + hashkey;
// check out cache // check out cache
const AlgoRec* cached = dnn_conv_check_cache(hashkey); const AlgoRec* cached = dnn_conv_check_cache(hashkey);
if (cached) { if (cached) {
...@@ -116,17 +118,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -116,17 +118,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
use_cached = 1; use_cached = 1;
} }
} }
if (reuse_algo || use_cached) { if (reuse_algo || use_cached) {
algo = (cudnnConvolutionFwdAlgo_t)prev_algo.algo; algo = (cudnnConvolutionFwdAlgo_t)prev_algo.algo;
worksize = prev_algo.wsSize; worksize = prev_algo.wsSize;
mathtype = prev_algo.mathType; mathtype = prev_algo.mathType;
} else { } else {
if (params->choose_time) { if (params->choose_time) {
int count; int count;
cudnnConvolutionFwdAlgoPerf_t choice; cudnnConvolutionFwdAlgoPerf_t choice;
gpudata *tmpmem; gpudata *tmpmem;
tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL); tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL);
if (tmpmem == NULL) { if (tmpmem == NULL) {
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory"); PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
...@@ -142,9 +144,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -142,9 +144,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
free); free);
gpudata_release(tmpmem); gpudata_release(tmpmem);
// fprintf(stderr, "(cudnnFindConvolutionForwardAlgorithmEx: (err:%d), algo: %d, mem: %ld, free: %ld\n",
// err, choice.algo, choice.memory, free);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s", "error selecting convolution algo: %s",
...@@ -152,14 +151,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -152,14 +151,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
algo = choice.algo;
prev_algo.algo = (int)algo;
prev_algo.wsSize = worksize = choice.memory;
#if CUDNN_MAJOR >= 7
prev_algo.mathType = mathtype = choice.mathType;
#endif
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
#ifdef DEBUG #ifdef DEBUG
if (count == 0) { if (count == 0) {
...@@ -173,27 +164,37 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -173,27 +164,37 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
} // Else, count is necessarly 1 for current implementation. } // Else, count is necessarly 1 for current implementation.
#endif #endif
algo = choice.algo;
prev_algo.algo = (int)algo;
prev_algo.wsSize = worksize = choice.memory;
#if CUDNN_MAJOR >= 7
prev_algo.mathType = mathtype = choice.mathType;
#endif
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
// NB: It is added again later to the cache,
// so maybe this line could be removed.
} else { } else {
err = cudnnGetConvolutionForwardAlgorithm( err = cudnnGetConvolutionForwardAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo); CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s", "error selecting convolution algo: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
prev_algo.algo = algo; prev_algo.algo = algo;
// no tensor_op returned from Get() // no tensor_op returned from Get()
prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH; prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH;
// fprintf(stderr, "(cudnnGetConvolutionForwardAlgorithm: (err:%d), algo: %d\n", err, algo);
} }
} }
} }
// if FindEx was used (choose_time), workspace size is set. // if FindEx was used (choose_time), workspace size is set.
if (!(reuse_algo || use_cached || params->choose_time)) if (!(reuse_algo || use_cached || params->choose_time))
{ {
err = cudnnGetConvolutionForwardWorkspaceSize(params->handle, err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
...@@ -203,7 +204,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -203,7 +204,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
APPLY_SPECIFIC(output), APPLY_SPECIFIC(output),
algo, algo,
&worksize); &worksize);
if (err == CUDNN_STATUS_NOT_SUPPORTED) { if (err == CUDNN_STATUS_NOT_SUPPORTED) {
// Fallback to none algo if not supported // Fallback to none algo if not supported
#ifdef DEBUG #ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
...@@ -222,7 +223,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -222,7 +223,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
algo, algo,
&worksize); &worksize);
} }
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error getting worksize: %s", "error getting worksize: %s",
...@@ -232,33 +233,35 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -232,33 +233,35 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
} }
// save worksize for next time/cache // save worksize for next time/cache
prev_algo.wsSize = worksize; prev_algo.wsSize = worksize;
// Add to the cache // Add to the cache, even if this node use *_once algo
// (in case the user specify the algo per layer and not globally).
if (params->choose_algo) if (params->choose_algo)
dnn_conv_update_cache(hashkey, prev_algo); dnn_conv_update_cache(hashkey, prev_algo);
#ifdef DEBUG #ifdef DEBUG
if (params->choose_algo) { if (params->choose_algo) {
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
return 1; return 1;
fprintf(stderr, "%s%s algo: %d %s%s ws: %ld, tensor: %d hash:%s\n", fprintf(stderr, "(using %s %s%s%s%s, ws:%ld, hash:%s)\n",
params->choose_algo ? "[A]": "" , algorithm_name,
params->choose_time ? "[T]": "" , params->choose_time ? "(timed)": "" ,
algo, // algorithm_name,
reuse_algo ? "(reused)" : "", reuse_algo ? "(reused)" : "",
use_cached ? "(cache)": "", use_cached ? "(cache)": "",
worksize, mathtype, hashkey.c_str() mathtype == CUDNN_TENSOR_OP_MATH ? "(tensor op)" : "",
worksize,
hashkey.c_str()
); );
} }
#endif #endif
if (params->choose_once) { if (params->choose_once) {
reuse_algo = 1; reuse_algo = 1;
} }
{ {
gpudata *workspace = 0; gpudata *workspace = 0;
#if CUDNN_MAJOR >= 7 #if CUDNN_MAJOR >= 7
// CUDNN7: need to set math type // CUDNN7: need to set math type
err = cudnnSetConvolutionMathType(desc, prev_algo.mathType); err = cudnnSetConvolutionMathType(desc, prev_algo.mathType);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -269,7 +272,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -269,7 +272,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1; return 1;
} }
#endif #endif
/* /*
* This is less than ideal since we need to free it after (which * This is less than ideal since we need to free it after (which
* introduces a synchronization point. But we don't have a module * introduces a synchronization point. But we don't have a module
...@@ -284,7 +287,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -284,7 +287,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1; return 1;
} }
} }
cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
...@@ -308,7 +311,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -308,7 +311,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
} }
cuda_exit(c->ctx); cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
......
...@@ -3,15 +3,16 @@ prev_algo.algo = PARAMS->conv_algo; ...@@ -3,15 +3,16 @@ prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH; prev_algo.mathType = CUDNN_DEFAULT_MATH;
prev_algo.dataType = CUDNN_DATA_FLOAT; prev_algo.dataType = CUDNN_DATA_FLOAT;
reuse_algo = 0; reuse_algo = 0;
hash_prefix = std::string("GI| GPU#"); use_cached = 0;
#section support_code_struct hash_prefix = std::string("GI|GPU#");
#line 12 "dnn_gi.c" #section support_code_struct
#line 11 "dnn_gi.c"
int reuse_algo; int reuse_algo;
bool use_cached; bool use_cached;
AlgoRec prev_algo; AlgoRec prev_algo;
std::string hash_prefix; std::string hash_prefix;
int int
APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
PyGpuArrayObject *im, PyGpuArrayObject *im,
...@@ -72,7 +73,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -72,7 +73,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 0; return 0;
} }
int groups = c_check_groups_for_conv(desc, params->num_groups); int groups = c_check_groups_for_conv(desc, params->num_groups);
if (groups == -1) if (groups == -1)
return 1; return 1;
...@@ -90,19 +91,19 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -90,19 +91,19 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
#ifdef DEBUG #ifdef DEBUG
char algorithm_name[128]; char algorithm_name[128];
#endif #endif
size_t worksize = 0; size_t worksize = 0;
cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH; cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH;
std::string hashkey; std::string hashkey;
if (params->choose_algo && !reuse_algo) { if (params->choose_algo && !reuse_algo) {
char pci_id[16]; char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id); gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
// check out cache // check out cache
hashkey=dnn_conv_shape(APPLY_SPECIFIC(input), *input, APPLY_SPECIFIC(kerns), kerns, desc, output, groups); hashkey=dnn_conv_shape(APPLY_SPECIFIC(input), *input, APPLY_SPECIFIC(kerns), kerns, desc, output, groups);
if (hashkey.empty()) if (hashkey.empty())
return 1; return 1;
hashkey = hash_prefix + pci_id + hashkey; hashkey = hash_prefix + pci_id + " " + hashkey;
const AlgoRec* cached = dnn_conv_check_cache(hashkey); const AlgoRec* cached = dnn_conv_check_cache(hashkey);
if (cached) { if (cached) {
prev_algo = *cached; prev_algo = *cached;
...@@ -111,9 +112,10 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -111,9 +112,10 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
} }
size_t free = c_get_largest_free_block_size(c); size_t free = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) return 1;
cuda_enter(c->ctx); cuda_enter(c->ctx);
if (params->choose_algo && !(reuse_algo || use_cached)) { if (params->choose_algo && !(reuse_algo || use_cached)) {
if (params->choose_time) { if (params->choose_time) {
int count; int count;
...@@ -140,15 +142,6 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -140,15 +142,6 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 1; return 1;
} }
algo = choice.algo;
prev_algo.algo = (int)algo;
prev_algo.wsSize = worksize = choice.memory;
#if CUDNN_MAJOR >= 7
prev_algo.mathType = mathtype = choice.mathType;
#endif
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
#ifdef DEBUG #ifdef DEBUG
if (count == 0) { if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradinput algorithm found"); PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradinput algorithm found");
...@@ -161,6 +154,15 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -161,6 +154,15 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
} // Else, count is necessarly 1 for current implementation. } // Else, count is necessarly 1 for current implementation.
#endif #endif
algo = choice.algo;
prev_algo.algo = (int)algo;
prev_algo.wsSize = worksize = choice.memory;
#if CUDNN_MAJOR >= 7
prev_algo.mathType = mathtype = choice.mathType;
#endif
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
} else { } else {
err = cudnnGetConvolutionBackwardDataAlgorithm( err = cudnnGetConvolutionBackwardDataAlgorithm(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
...@@ -177,11 +179,11 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -177,11 +179,11 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH; prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH;
} }
} }
// if FindEx was used (choose_time), workspace size is set. // if FindEx was used (choose_time), workspace size is set.
if (!(reuse_algo || use_cached || params->choose_time)) if (!(reuse_algo || use_cached || params->choose_time))
{ {
err = cudnnGetConvolutionBackwardDataWorkspaceSize( err = cudnnGetConvolutionBackwardDataWorkspaceSize(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc, params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), algo, &worksize); APPLY_SPECIFIC(input), algo, &worksize);
...@@ -200,7 +202,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -200,7 +202,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
// defined only for 2d filters // defined only for 2d filters
if ((algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING || if ((algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) && PyGpuArray_NDIM(kerns) == 4) { algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) && PyGpuArray_NDIM(kerns) == 4) {
// Extract the properties of the convolution descriptor // Extract the properties of the convolution descriptor
int nd; int nd;
int pad[2]; int pad[2];
...@@ -217,7 +219,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -217,7 +219,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)
{ {
if (stride[0] != 1 || stride[1] != 1 || if (stride[0] != 1 || stride[1] != 1 ||
...@@ -240,31 +242,32 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -240,31 +242,32 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc, params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(input), algo, &worksize); APPLY_SPECIFIC(input), algo, &worksize);
} }
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
// save worksize for next time/cache // save worksize for next time/cache
prev_algo.wsSize = worksize; prev_algo.wsSize = worksize;
// Add to the cache // Add to the cache
if (params->choose_algo) if (params->choose_algo)
dnn_conv_update_cache(hashkey, prev_algo); dnn_conv_update_cache(hashkey, prev_algo);
} // !(reuse_algo || use_cached || params->choose_time) } // !(reuse_algo || use_cached || params->choose_time)
#ifdef DEBUG #ifdef DEBUG
if (params->choose_algo) { if (params->choose_algo) {
if (0 != theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name)) if (0 != theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name))
return 1; return 1;
// NB: This is printed only when algorithm is chosen at runtime. // NB: This is printed only when algorithm is chosen at runtime.
fprintf(stderr, "%s%s algo: %d %s%s ws: %ld, tensor: %d hash:%s\n", fprintf(stderr, "(using %s %s%s%s%s, ws:%ld, hash:%s)\n",
params->choose_algo ? "[A]": "" , algorithm_name,
params->choose_time ? "[T]": "" , params->choose_time ? "(timed)": "" ,
algo, // algorithm_name,
reuse_algo ? "(reused)" : "", reuse_algo ? "(reused)" : "",
use_cached ? "(cache)": "", use_cached ? "(cache)": "",
worksize, mathtype, hashkey.c_str() mathtype == CUDNN_TENSOR_OP_MATH ? "(tensor op)" : "",
worksize,
hashkey.c_str()
); );
} }
#endif #endif
...@@ -272,9 +275,9 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -272,9 +275,9 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
if (params->choose_once) { if (params->choose_once) {
reuse_algo = 1; reuse_algo = 1;
} }
gpudata *workspace = 0; gpudata *workspace = 0;
#if CUDNN_MAJOR >= 7 #if CUDNN_MAJOR >= 7
// CUDNN7: need to set math type // CUDNN7: need to set math type
err = cudnnSetConvolutionMathType(desc, prev_algo.mathType); err = cudnnSetConvolutionMathType(desc, prev_algo.mathType);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -285,7 +288,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -285,7 +288,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 1; return 1;
} }
#endif #endif
if (worksize != 0) { if (worksize != 0) {
workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL); workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) { if (workspace == NULL) {
......
...@@ -3,11 +3,11 @@ prev_algo.algo = PARAMS->conv_algo; ...@@ -3,11 +3,11 @@ prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH; prev_algo.mathType = CUDNN_DEFAULT_MATH;
prev_algo.dataType = CUDNN_DATA_FLOAT; prev_algo.dataType = CUDNN_DATA_FLOAT;
reuse_algo = 0; reuse_algo = 0;
hash_prefix = std::string("GW| GPU#"); use_cached = 0;
hash_prefix = std::string("GW|GPU#");
#section support_code_struct #section support_code_struct
#line 11 "dnn_gw.c" #line 11 "dnn_gw.c"
int reuse_algo; int reuse_algo;
bool use_cached; bool use_cached;
AlgoRec prev_algo; AlgoRec prev_algo;
...@@ -91,16 +91,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -91,16 +91,17 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
#ifdef DEBUG #ifdef DEBUG
char algorithm_name[128]; char algorithm_name[128];
#endif #endif
size_t worksize = 0; size_t worksize = 0;
cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH; cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH;
std::string hashkey ; std::string hashkey ;
size_t free = c_get_largest_free_block_size(c); size_t free = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) return 1;
cuda_enter(c->ctx);
cuda_enter(c->ctx);
if (params->choose_algo) { if (params->choose_algo) {
if (!reuse_algo) { if (!reuse_algo) {
char pci_id[16]; char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id); gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
...@@ -115,12 +116,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -115,12 +116,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
use_cached = 1; use_cached = 1;
} }
} }
if (reuse_algo || use_cached) { if (reuse_algo || use_cached) {
algo = (cudnnConvolutionBwdFilterAlgo_t)prev_algo.algo; algo = (cudnnConvolutionBwdFilterAlgo_t)prev_algo.algo;
worksize = prev_algo.wsSize; worksize = prev_algo.wsSize;
mathtype = prev_algo.mathType; mathtype = prev_algo.mathType;
} else { } else {
if (params->choose_time) { if (params->choose_time) {
int count; int count;
...@@ -148,15 +149,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -148,15 +149,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return 1; return 1;
} }
algo = choice.algo;
prev_algo.algo = (int)algo;
prev_algo.wsSize = worksize = choice.memory;
#if CUDNN_MAJOR >= 7
prev_algo.mathType = mathtype = choice.mathType;
#endif
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
#ifdef DEBUG #ifdef DEBUG
if (count == 0) { if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradweight algorithm found"); PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradweight algorithm found");
...@@ -169,6 +161,15 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -169,6 +161,15 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
} // Else, count is necessarly 1 for current implementation. } // Else, count is necessarly 1 for current implementation.
#endif #endif
algo = choice.algo;
prev_algo.algo = (int)algo;
prev_algo.wsSize = worksize = choice.memory;
#if CUDNN_MAJOR >= 7
prev_algo.mathType = mathtype = choice.mathType;
#endif
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
} else { } else {
err = cudnnGetConvolutionBackwardFilterAlgorithm( err = cudnnGetConvolutionBackwardFilterAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
...@@ -181,73 +182,74 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -181,73 +182,74 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
prev_algo.algo = algo; prev_algo.algo = algo;
// no tensor_op returned from Get() // no tensor_op returned from Get()
prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH; prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH;
} }
} }
} /* choose_algo */ } /* choose_algo */
// if FindEx was used (choose_time), workspace size is set. // if FindEx was used (choose_time), workspace size is set.
if (!(reuse_algo || use_cached || params->choose_time)) if (!(reuse_algo || use_cached || params->choose_time))
{ {
err = cudnnGetConvolutionBackwardFilterWorkspaceSize( err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), algo, &worksize); APPLY_SPECIFIC(kerns), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
#ifdef DEBUG #ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name)) if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name))
return 1; return 1;
fprintf(stderr, "(%s error getting worksize:%s, falling back to CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0", fprintf(stderr, "(%s error getting worksize:%s, falling back to CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0",
algorithm_name, cudnnGetErrorString(err)); algorithm_name, cudnnGetErrorString(err));
#endif #endif
algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0; algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
err = cudnnGetConvolutionBackwardFilterWorkspaceSize( err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
APPLY_SPECIFIC(kerns), algo, &worksize); APPLY_SPECIFIC(kerns), algo, &worksize);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s", PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
} }
// save worksize for next time/cache // save worksize for next time/cache
prev_algo.wsSize = worksize; prev_algo.wsSize = worksize;
// Add to the cache // Add to the cache
if (params->choose_algo) if (params->choose_algo)
dnn_conv_update_cache(hashkey, prev_algo); dnn_conv_update_cache(hashkey, prev_algo);
} }
#ifdef DEBUG #ifdef DEBUG
if (params->choose_algo) { if (params->choose_algo) {
if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name)) if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name))
return 1; return 1;
// NB: This is printed only when algorithm is chosen at runtime. // NB: This is printed only when algorithm is chosen at runtime.
fprintf(stderr, "%s%s algo: %d %s%s ws: %ld, tensor: %d hash:%s\n", fprintf(stderr, "(using %s %s%s%s%s, ws:%ld, hash:%s)\n",
params->choose_algo ? "[A]": "" , algorithm_name,
params->choose_time ? "[T]": "" , params->choose_time ? "(timed)": "" ,
algo, // algorithm_name,
reuse_algo ? "(reused)" : "", reuse_algo ? "(reused)" : "",
use_cached ? "(cache)": "", use_cached ? "(cache)": "",
worksize, mathtype, hashkey.c_str() mathtype == CUDNN_TENSOR_OP_MATH ? "(tensor op)" : "",
); worksize,
hashkey.c_str()
);
} }
#endif #endif
if (params->choose_once) { if (params->choose_once) {
reuse_algo = 1; reuse_algo = 1;
} }
gpudata *workspace = 0; gpudata *workspace = 0;
#if CUDNN_MAJOR >= 7 #if CUDNN_MAJOR >= 7
// CUDNN7: need to set math type // CUDNN7: need to set math type
err = cudnnSetConvolutionMathType(desc, mathtype); err = cudnnSetConvolutionMathType(desc, mathtype);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
......
...@@ -399,7 +399,7 @@ class DnnBase(COp): ...@@ -399,7 +399,7 @@ class DnnBase(COp):
return [] return []
def c_code_cache_version(self): def c_code_cache_version(self):
return (super(DnnBase, self).c_code_cache_version(), version(), 3) return (super(DnnBase, self).c_code_cache_version(), version(), 1)
class GpuDnnConvDesc(COp): class GpuDnnConvDesc(COp):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论