提交 fa5590e6 authored 作者: notoraptor's avatar notoraptor

Make code safer and simpler.

上级 d3cb3ad4
......@@ -59,7 +59,6 @@ if (APPLY_SPECIFIC(kerns) != NULL)
#section support_code
#include <sstream>
#include <vector>
#include <string>
#if __cplusplus < 201103L
#include <tr1/unordered_map>
......@@ -70,19 +69,16 @@ typedef std::unordered_map<std::string, AlgoRec> AlgoCache;
#endif
#include "pthread.h"
#line 69 "dnn_conv_base.c"
using std::vector;
using std::string;
#line 73 "dnn_conv_base.c"
pthread_mutex_t algoMutex;
AlgoCache algoCache;
static cudnnStatus_t checkCudnnStatus(cudnnStatus_t err)
static cudnnStatus_t checkCudnnStatus(cudnnStatus_t err, const char* msg)
{
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "CUDNN Error: %s",
cudnnGetErrorString(err));
PyErr_Format(PyExc_RuntimeError, "CUDNN Error: %s: %s",
msg, cudnnGetErrorString(err));
}
return err;
}
......@@ -105,64 +101,69 @@ c_get_largest_free_block_size(PyGpuContextObject *c)
static std::string shape(int* res, int size)
{
std::stringstream s;
if (size>0) {
std::ostringstream s;
if (size > 0) {
s<<res[0];
for (int i=1; i< size; ++i)
s << res[0];
for (int i = 1; i < size; ++i)
s <<',' << res[i];
}
return std::string(s.str().c_str());
return s.str();
}
static std::string shape(cudnnTensorDescriptor_t t)
{
std::vector<int> res;
std::vector<int> stride;
// cuDNN can handle up to CUDNN_DIM_MAX dimensions.
int res[CUDNN_DIM_MAX];
int stride[CUDNN_DIM_MAX];
int nbDims;
cudnnDataType_t type;
checkCudnnStatus(cudnnGetTensorNdDescriptor(t, 0, &type, &nbDims,0,0));
res.resize(nbDims);
stride.resize(nbDims);
checkCudnnStatus(cudnnGetTensorNdDescriptor(t, nbDims, &type, &nbDims, res.data(), stride.data()));
return shape(&res[0], nbDims) + shape(&stride[0], nbDims);
checkCudnnStatus(cudnnGetTensorNdDescriptor(t, CUDNN_DIM_MAX, &type, &nbDims, res, stride),
"error getting tensor description");
if (PyErr_Occurred()) return "";
return shape(res, nbDims) + "," + shape(stride, nbDims);
};
static std::string shape(cudnnFilterDescriptor_t t, cudnnDataType_t* type)
{
cudnnTensorFormat_t format;
int sizes = 8;
std::vector<int> res(sizes);
int res[CUDNN_DIM_MAX];
int outDims;
checkCudnnStatus(cudnnGetFilterNdDescriptor(t, sizes, type, &format, &outDims, res.data()));
return shape(&res[0], outDims);
checkCudnnStatus(cudnnGetFilterNdDescriptor(t, CUDNN_DIM_MAX, type, &format, &outDims, res),
"error getting filter description");
if (PyErr_Occurred()) return "";
return shape(res, outDims);
};
static std::string shape(cudnnConvolutionDescriptor_t convDesc)
{
const int maxDim = 5;
int nDim=0;
int nDim;
cudnnConvolutionMode_t mode;
cudnnDataType_t computeType;
int padA[maxDim];
int strideA[maxDim];
int dilationA[maxDim];
int padA[5];
int strideA[5];
int dilationA[5];
checkCudnnStatus(
cudnnGetConvolutionNdDescriptor( convDesc, maxDim,
cudnnGetConvolutionNdDescriptor( convDesc, 5,
&nDim,
&padA[0],
&strideA[0],
&dilationA[0],
&mode,
&computeType ));
return std::string("-mode ") + (((int)mode==0) ? "conv" : "corr") + " -padA" + shape(padA,nDim) + " -convStrideA " + shape(strideA, nDim) + " -dilationA " + shape(dilationA, nDim);
&computeType ),
"error getting convolution description");
if (PyErr_Occurred()) return "";
return (std::string("-mode ") +
((mode == CUDNN_CONVOLUTION) ? "conv" : "cross") +
" -pad " +
shape(padA, nDim) +
" -subsample " +
shape(strideA, nDim) +
" -dilation " +
shape(dilationA, nDim));
}
static bool all_aligned(cudnnDataType_t type, void* in, void* out, void* filter)
......@@ -182,7 +183,7 @@ static std::string dnn_conv_shape(cudnnTensorDescriptor_t inputDesc, PyGpuArrayO
PyGpuArrayObject* output, int groups)
{
cudnnDataType_t dType;
std::stringstream s;
std::ostringstream s;
int expected_output_dims[5] = {0};
cudnnStatus_t err = cudnnGetConvolutionNdForwardOutputDim(convDesc, inputDesc, filterDesc,
PyGpuArray_NDIM(filter), expected_output_dims);
......@@ -221,16 +222,20 @@ static std::string dnn_conv_shape(cudnnTensorDescriptor_t inputDesc, PyGpuArrayO
return "";
}
}
s << "-g" << groups << " -dimA" << shape(inputDesc) << " -filtA" <<
shape(filterDesc, &dType) << shape(convDesc);
std::string shapeInput = shape(inputDesc);
std::string shapeFilter = shape(filterDesc, &dType);
std::string shapeConvDesc = shape(convDesc);
if (shapeInput.empty() || shapeFilter.empty() || shapeConvDesc.empty())
return "";
s << "-g " << groups << " -dim " << shapeInput << " -filt " <<
shapeFilter << " " << shapeConvDesc;
// there have to be entries for both aligned and not
if (!all_aligned(dType, PyGpuArray_DEV_DATA(input), PyGpuArray_DEV_DATA(output), PyGpuArray_DEV_DATA(filter)))
{
s << " [unaligned] ";
s << " [unaligned]";
}
return std::string(s.str().c_str());
return s.str();
}
static void dnn_conv_update_cache(const std::string& hash, const AlgoRec& rec)
......@@ -240,15 +245,11 @@ static void dnn_conv_update_cache(const std::string& hash, const AlgoRec& rec)
pthread_mutex_unlock(&algoMutex);
}
static const AlgoRec* dnn_conv_check_cache(const std::string& hash)
{
pthread_mutex_lock(&algoMutex);
bool cacheHit = false;
const AlgoRec* ret = 0;
// cout << "dnn_conv_check_cache: "<< hash << endl;
AlgoCache::iterator hit = algoCache.find(hash);
if (hit != algoCache.end())
......@@ -257,4 +258,3 @@ static const AlgoRec* dnn_conv_check_cache(const std::string& hash)
pthread_mutex_unlock(&algoMutex);
return ret;
}
#section init_code_struct
reuse_algo = 0;
use_cached = 0;
prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH;
prev_algo.dataType = CUDNN_DATA_FLOAT;
hash_prefix = std::string("FW| GPU#");
hash_prefix = std::string("FWD|GPU#");
#section support_code_struct
#line 12 "dnn_fwd.c"
#line 11 "dnn_fwd.c"
int reuse_algo;
bool use_cached;
AlgoRec prev_algo;
......@@ -97,6 +98,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
#endif
size_t free = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) return 1;
cuda_enter(c->ctx);
......@@ -108,7 +110,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), input, APPLY_SPECIFIC(kerns), kerns, desc, *output, groups);
if (hashkey.empty())
return 1;
hashkey = hash_prefix + pci_id + hashkey;
hashkey = hash_prefix + pci_id + " " + hashkey;
// check out cache
const AlgoRec* cached = dnn_conv_check_cache(hashkey);
if (cached) {
......@@ -142,9 +144,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
free);
gpudata_release(tmpmem);
// fprintf(stderr, "(cudnnFindConvolutionForwardAlgorithmEx: (err:%d), algo: %d, mem: %ld, free: %ld\n",
// err, choice.algo, choice.memory, free);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
......@@ -152,14 +151,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_exit(c->ctx);
return 1;
}
algo = choice.algo;
prev_algo.algo = (int)algo;
prev_algo.wsSize = worksize = choice.memory;
#if CUDNN_MAJOR >= 7
prev_algo.mathType = mathtype = choice.mathType;
#endif
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
#ifdef DEBUG
if (count == 0) {
......@@ -173,6 +164,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
} // Else, count is necessarily 1 for current implementation.
#endif
algo = choice.algo;
prev_algo.algo = (int)algo;
prev_algo.wsSize = worksize = choice.memory;
#if CUDNN_MAJOR >= 7
prev_algo.mathType = mathtype = choice.mathType;
#endif
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
// NB: It is added again later to the cache,
// so maybe this line could be removed.
} else {
err = cudnnGetConvolutionForwardAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
......@@ -188,7 +190,6 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
prev_algo.algo = algo;
// no tensor_op returned from Get()
prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH;
// fprintf(stderr, "(cudnnGetConvolutionForwardAlgorithm: (err:%d), algo: %d\n", err, algo);
}
}
}
......@@ -233,7 +234,8 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
// save worksize for next time/cache
prev_algo.wsSize = worksize;
// Add to the cache
// Add to the cache, even if this node uses a *_once algo
// (in case the user specifies the algo per layer and not globally).
if (params->choose_algo)
dnn_conv_update_cache(hashkey, prev_algo);
......@@ -241,13 +243,14 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
if (params->choose_algo) {
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
return 1;
fprintf(stderr, "%s%s algo: %d %s%s ws: %ld, tensor: %d hash:%s\n",
params->choose_algo ? "[A]": "" ,
params->choose_time ? "[T]": "" ,
algo, // algorithm_name,
fprintf(stderr, "(using %s %s%s%s%s, ws:%ld, hash:%s)\n",
algorithm_name,
params->choose_time ? "(timed)": "" ,
reuse_algo ? "(reused)" : "",
use_cached ? "(cache)": "",
worksize, mathtype, hashkey.c_str()
mathtype == CUDNN_TENSOR_OP_MATH ? "(tensor op)" : "",
worksize,
hashkey.c_str()
);
}
#endif
......
......@@ -3,10 +3,11 @@ prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH;
prev_algo.dataType = CUDNN_DATA_FLOAT;
reuse_algo = 0;
hash_prefix = std::string("GI| GPU#");
#section support_code_struct
use_cached = 0;
hash_prefix = std::string("GI|GPU#");
#line 12 "dnn_gi.c"
#section support_code_struct
#line 11 "dnn_gi.c"
int reuse_algo;
bool use_cached;
AlgoRec prev_algo;
......@@ -102,7 +103,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
hashkey=dnn_conv_shape(APPLY_SPECIFIC(input), *input, APPLY_SPECIFIC(kerns), kerns, desc, output, groups);
if (hashkey.empty())
return 1;
hashkey = hash_prefix + pci_id + hashkey;
hashkey = hash_prefix + pci_id + " " + hashkey;
const AlgoRec* cached = dnn_conv_check_cache(hashkey);
if (cached) {
prev_algo = *cached;
......@@ -111,6 +112,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
}
size_t free = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) return 1;
cuda_enter(c->ctx);
......@@ -140,15 +142,6 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
return 1;
}
algo = choice.algo;
prev_algo.algo = (int)algo;
prev_algo.wsSize = worksize = choice.memory;
#if CUDNN_MAJOR >= 7
prev_algo.mathType = mathtype = choice.mathType;
#endif
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
#ifdef DEBUG
if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradinput algorithm found");
......@@ -161,6 +154,15 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
} // Else, count is necessarily 1 for current implementation.
#endif
algo = choice.algo;
prev_algo.algo = (int)algo;
prev_algo.wsSize = worksize = choice.memory;
#if CUDNN_MAJOR >= 7
prev_algo.mathType = mathtype = choice.mathType;
#endif
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
} else {
err = cudnnGetConvolutionBackwardDataAlgorithm(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output),
......@@ -258,13 +260,14 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
if (0 != theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name))
return 1;
// NB: This is printed only when algorithm is chosen at runtime.
fprintf(stderr, "%s%s algo: %d %s%s ws: %ld, tensor: %d hash:%s\n",
params->choose_algo ? "[A]": "" ,
params->choose_time ? "[T]": "" ,
algo, // algorithm_name,
fprintf(stderr, "(using %s %s%s%s%s, ws:%ld, hash:%s)\n",
algorithm_name,
params->choose_time ? "(timed)": "" ,
reuse_algo ? "(reused)" : "",
use_cached ? "(cache)": "",
worksize, mathtype, hashkey.c_str()
mathtype == CUDNN_TENSOR_OP_MATH ? "(tensor op)" : "",
worksize,
hashkey.c_str()
);
}
#endif
......
......@@ -3,11 +3,11 @@ prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH;
prev_algo.dataType = CUDNN_DATA_FLOAT;
reuse_algo = 0;
hash_prefix = std::string("GW| GPU#");
use_cached = 0;
hash_prefix = std::string("GW|GPU#");
#section support_code_struct
#line 11 "dnn_gw.c"
int reuse_algo;
bool use_cached;
AlgoRec prev_algo;
......@@ -96,6 +96,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
std::string hashkey ;
size_t free = c_get_largest_free_block_size(c);
if (PyErr_Occurred()) return 1;
cuda_enter(c->ctx);
......@@ -148,15 +149,6 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
return 1;
}
algo = choice.algo;
prev_algo.algo = (int)algo;
prev_algo.wsSize = worksize = choice.memory;
#if CUDNN_MAJOR >= 7
prev_algo.mathType = mathtype = choice.mathType;
#endif
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
#ifdef DEBUG
if (count == 0) {
PyErr_SetString(PyExc_RuntimeError, "No best-timed conv gradweight algorithm found");
......@@ -169,6 +161,15 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
} // Else, count is necessarily 1 for current implementation.
#endif
algo = choice.algo;
prev_algo.algo = (int)algo;
prev_algo.wsSize = worksize = choice.memory;
#if CUDNN_MAJOR >= 7
prev_algo.mathType = mathtype = choice.mathType;
#endif
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
} else {
err = cudnnGetConvolutionBackwardFilterAlgorithm(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output),
......@@ -231,13 +232,14 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name))
return 1;
// NB: This is printed only when algorithm is chosen at runtime.
fprintf(stderr, "%s%s algo: %d %s%s ws: %ld, tensor: %d hash:%s\n",
params->choose_algo ? "[A]": "" ,
params->choose_time ? "[T]": "" ,
algo, // algorithm_name,
fprintf(stderr, "(using %s %s%s%s%s, ws:%ld, hash:%s)\n",
algorithm_name,
params->choose_time ? "(timed)": "" ,
reuse_algo ? "(reused)" : "",
use_cached ? "(cache)": "",
worksize, mathtype, hashkey.c_str()
mathtype == CUDNN_TENSOR_OP_MATH ? "(tensor op)" : "",
worksize,
hashkey.c_str()
);
}
#endif
......
......@@ -399,7 +399,7 @@ class DnnBase(COp):
return []
def c_code_cache_version(self):
return (super(DnnBase, self).c_code_cache_version(), version(), 3)
return (super(DnnBase, self).c_code_cache_version(), version(), 1)
class GpuDnnConvDesc(COp):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论