提交 bacc5f6f authored 作者: Boris Fomitchev's avatar Boris Fomitchev 提交者: notoraptor

Tensor op, cache

上级 2d3ab3b0
#section support_code
#include <cuda.h>
#include <sstream>
#include <vector>
#include <string>
#include "dnn_conv_find.h"
#if __cplusplus < 201103L
#include <tr1/unordered_map>
typedef std::tr1::unordered_map<std::string, AlgoRec> AlgoCache;
#else
#include <unordered_map>
typedef std::unordered_map<std::string, AlgoRec> AlgoCache;
#endif
#include "pthread.h"
#line 10 "dnn_conv_find.cc"
using std::vector;
using std::string;
// Protects all access to algoCache.
// PTHREAD_MUTEX_INITIALIZER is the portable way to statically initialize a
// pthread mutex; relying on zero-initialization of a pthread_mutex_t with
// static storage is not guaranteed by POSIX.
pthread_mutex_t algoMutex = PTHREAD_MUTEX_INITIALIZER;
// Process-wide cache mapping a convolution "shape key" string to the
// algorithm record previously selected for that shape.
AlgoCache algoCache;
// Render the first `size` entries of `res` as a comma-separated list,
// e.g. {4,7,9} -> "4,7,9". A non-positive `size` yields "".
static std::string shape(int* res, int size)
{
    std::stringstream out;
    for (int i = 0; i < size; ++i) {
        if (i)
            out << ',';
        out << res[i];
    }
    return out.str();
}
// Build the textual key fragment for a cuDNN tensor descriptor:
// its dimensions followed by its strides, each comma-separated.
static std::string shape(cudnnTensorDescriptor_t t)
{
std::vector<int> res;
std::vector<int> stride;
int nbDims;
cudnnDataType_t type;
// First query with nbDimsRequested=0 only to learn the rank; the dim and
// stride output arrays are not filled, so null pointers are passed.
checkCudnnStatus(cudnnGetTensorNdDescriptor(t, 0, &type, &nbDims,0,0));
res.resize(nbDims);
stride.resize(nbDims);
// Second query retrieves the actual dimensions and strides.
checkCudnnStatus(cudnnGetTensorNdDescriptor(t, nbDims, &type, &nbDims, res.data(), stride.data()));
return shape(&res[0], nbDims) + shape(&stride[0], nbDims);
};
// Build the textual key fragment for a cuDNN filter descriptor, and report
// the filter's data type through `type` (the caller uses it to decide the
// alignment requirement). Queries up to 8 dimensions.
static std::string shape(cudnnFilterDescriptor_t t, cudnnDataType_t* type)
{
cudnnTensorFormat_t format;
int sizes = 8;
std::vector<int> res(sizes);
int outDims;
checkCudnnStatus(cudnnGetFilterNdDescriptor(t, sizes, type, &format, &outDims, res.data()));
return shape(&res[0], outDims);
};
// Build the textual key fragment for a cuDNN convolution descriptor:
// mode, padding, strides and dilation (up to 5 dimensions).
static std::string shape(cudnnConvolutionDescriptor_t convDesc)
{
const int maxDim = 5;
int nDim=0;
cudnnConvolutionMode_t mode;
cudnnDataType_t computeType;
int padA[maxDim];
int strideA[maxDim];
int dilationA[maxDim];
checkCudnnStatus(
cudnnGetConvolutionNdDescriptor( convDesc, maxDim,
&nDim,
&padA[0],
&strideA[0],
&dilationA[0],
&mode,
&computeType ));
// mode value 0 is CUDNN_CONVOLUTION ("conv"); anything else is rendered as
// cross-correlation ("corr").
return std::string("-mode ") + (((int)mode==0) ? "conv" : "corr") + " -padA" + shape(padA,nDim) + " -convStrideA " + shape(strideA, nDim) + " -dilationA " + shape(dilationA, nDim);
}
// True when the input, output and filter pointers are all aligned to the
// boundary used for this data type (128 bytes for half, 256 otherwise).
static bool all_aligned(cudnnDataType_t type, void* in, void* out, void* filter)
{
    // there have to be entries for both aligned and not
    const size_t alignMask = (type == CUDNN_DATA_HALF) ? 0x7F : 0xFF;
    const size_t combined = (size_t)in | (size_t)out | (size_t)filter;
    return (combined & alignMask) == 0;
}
// Compose the cache-key string describing one convolution: input
// dims/strides, filter dims, convolution parameters, and whether all data
// pointers meet the alignment requirement. Alignment is part of the key
// because an algorithm chosen for aligned buffers may not apply to
// unaligned ones (see the comment below).
std::string dnn_conv_shape(cudnnTensorDescriptor_t input, void* in,
cudnnFilterDescriptor_t filterDesc, void* filter,
cudnnConvolutionDescriptor_t convDesc,
void* out)
{
cudnnDataType_t dType;
std::stringstream s;
s << " -dimA" << shape(input) << " -filtA" << shape(filterDesc, &dType) << shape(convDesc);
// there have to be entries for both aligned and not
if (!all_aligned(dType, in, out, filter))
{
s << " [unaligned] ";
}
return std::string(s.str().c_str());
}
// Insert or overwrite the cached algorithm record for `hash`.
// Serialized with dnn_conv_check_cache via algoMutex.
void dnn_conv_update_cache(const std::string& hash, const AlgoRec& rec)
{
pthread_mutex_lock(&algoMutex);
algoCache[hash] = rec;
pthread_mutex_unlock(&algoMutex);
}
// Look up `hash` in the global algorithm cache.
// Returns a pointer to the cached record, or 0 on a miss. Returning the
// pointer after unlocking is safe here: std::unordered_map never
// invalidates references to elements on insert/rehash, and this cache
// never erases entries.
const AlgoRec* dnn_conv_check_cache(const std::string& hash)
{
pthread_mutex_lock(&algoMutex);
const AlgoRec* ret = 0;
AlgoCache::iterator hit = algoCache.find(hash);
if (hit != algoCache.end())
ret = &hit->second;
pthread_mutex_unlock(&algoMutex);
return ret;
}
#section support_code
#include <cuda.h>
#include <mutex>
#include <sstream>
#include <vector>
#include <string>
#include <unordered_map>
#include "dnn_conv_find.h"
#line 10 "dnn_conv_find.cc"
using std::vector;
using std::string;
using std::unique_lock;
using std::mutex;
// Process-wide cache mapping a convolution "shape key" string to the
// algorithm record previously selected for that shape.
typedef std::unordered_map<string, AlgoRec> AlgoCache;
// Guards all access to algoCache.
mutex algoMutex;
AlgoCache algoCache;
// Format the first `size` ints of `res` as "a,b,c"; empty when size <= 0.
static std::string shape(int* res, int size)
{
    std::stringstream buf;
    const char* sep = "";
    for (int i = 0; i < size; ++i) {
        buf << sep << res[i];
        sep = ",";
    }
    return buf.str();
}
// Build the textual key fragment for a cuDNN tensor descriptor:
// its dimensions followed by its strides, each comma-separated.
static std::string shape(cudnnTensorDescriptor_t t)
{
std::vector<int> res;
std::vector<int> stride;
int nbDims;
cudnnDataType_t type;
// First query with nbDimsRequested=0 only to learn the rank; the dim and
// stride output arrays are not filled, so null pointers are passed.
checkCudnnStatus(cudnnGetTensorNdDescriptor(t, 0, &type, &nbDims, nullptr, nullptr));
res.resize(nbDims);
stride.resize(nbDims);
// Second query retrieves the actual dimensions and strides.
checkCudnnStatus(cudnnGetTensorNdDescriptor(t, nbDims, &type, &nbDims, res.data(), stride.data()));
return shape(&res[0], nbDims) + shape(&stride[0], nbDims);
};
// Build the textual key fragment for a cuDNN filter descriptor, and report
// the filter's data type through `type` (the caller uses it to decide the
// alignment requirement). Queries up to 8 dimensions.
static std::string shape(cudnnFilterDescriptor_t t, cudnnDataType_t* type)
{
cudnnTensorFormat_t format;
int sizes = 8;
std::vector<int> res(sizes);
int outDims;
checkCudnnStatus(cudnnGetFilterNdDescriptor(t, sizes, type, &format, &outDims, res.data()));
return shape(&res[0], outDims);
};
// Build the textual key fragment for a cuDNN convolution descriptor:
// mode, padding, strides and dilation (up to 5 dimensions).
static std::string shape(cudnnConvolutionDescriptor_t convDesc)
{
const int maxDim = 5;
int nDim=0;
cudnnConvolutionMode_t mode;
cudnnDataType_t computeType;
int padA[maxDim];
int strideA[maxDim];
int dilationA[maxDim];
checkCudnnStatus(
cudnnGetConvolutionNdDescriptor( convDesc, maxDim,
&nDim,
&padA[0],
&strideA[0],
&dilationA[0],
&mode,
&computeType ));
// mode value 0 is CUDNN_CONVOLUTION ("conv"); anything else is rendered as
// cross-correlation ("corr").
return std::string("-mode ") + (((int)mode==0) ? "conv" : "corr") + " -padA" + shape(padA,nDim) + " -convStrideA " + shape(strideA, nDim) + " -dilationA " + shape(dilationA, nDim);
}
// True when the input, output and filter pointers are all aligned to the
// boundary used for this data type (128 bytes for half, 256 otherwise).
static bool all_aligned(cudnnDataType_t type, void* in, void* out, void* filter)
{
    // there have to be entries for both aligned and not
    const size_t alignMask = (type == CUDNN_DATA_HALF) ? 0x7F : 0xFF;
    const size_t combined = (size_t)in | (size_t)out | (size_t)filter;
    return (combined & alignMask) == 0;
}
// Compose the cache-key string describing one convolution: input
// dims/strides, filter dims, convolution parameters, and whether all data
// pointers meet the alignment requirement. Alignment is part of the key
// because an algorithm chosen for aligned buffers may not apply to
// unaligned ones (see the comment below).
std::string dnn_conv_shape(cudnnTensorDescriptor_t input, void* in,
cudnnFilterDescriptor_t filterDesc, void* filter,
cudnnConvolutionDescriptor_t convDesc,
void* out)
{
cudnnDataType_t dType;
std::stringstream s;
s << " -dimA" << shape(input) << " -filtA" << shape(filterDesc, &dType) << shape(convDesc);
// there have to be entries for both aligned and not
if (!all_aligned(dType, in, out, filter))
{
s << " [unaligned] ";
}
return std::string(s.str().c_str());
}
// Insert or overwrite the cached algorithm record for `hash`.
// Serialized with dnn_conv_check_cache via algoMutex.
void dnn_conv_update_cache(const std::string& hash, const AlgoRec& rec)
{
    std::lock_guard<std::mutex> guard(algoMutex);
    algoCache[hash] = rec;
}
// Look up `hash` in the global algorithm cache.
// Returns a pointer to the cached record, or nullptr on a miss. Returning
// the pointer while still holding (or after releasing) the lock is safe:
// std::unordered_map never invalidates references to elements on
// insert/rehash, and this cache never erases entries.
const AlgoRec* dnn_conv_check_cache(const std::string& hash)
{
unique_lock<mutex> lock(algoMutex);
AlgoCache::iterator hit = algoCache.find(hash);
if (hit != algoCache.end())
return &hit->second;
return nullptr;
}
#pragma once
#include <string>
#include <cuda.h>
#include <cudnn.h>
// cuDNN releases before 7 have no math-type selection; define the enum
// here so the rest of the code compiles and always records default math.
#if CUDNN_MAJOR < 7
enum cudnnMathType_t { CUDNN_DEFAULT_MATH=0, CUDNN_TENSOR_OP_MATH = 1 };
#endif
// Convert a failing cuDNN status into a Python RuntimeError.
// NOTE(review): this only sets the Python error indicator; it neither
// returns a failure code nor aborts, so execution continues after an
// error and callers must check PyErr_Occurred() themselves — confirm
// that this best-effort behavior is intended.
inline void checkCudnnStatus(cudnnStatus_t err)
{
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "CUDNN Error: %s",
cudnnGetErrorString(err));
}
}
/* a common struct for all 3 CUDNN enums */
// Cached result of algorithm selection for one convolution shape.
struct AlgoRec {
int algo;  // stored as int so fwd/bwd-data/bwd-filter algo enums all fit
cudnnDataType_t dataType;
size_t wsSize;  // workspace size in bytes required by `algo`
cudnnMathType_t mathType;  // tensor-op vs. default math selection
};
const AlgoRec* dnn_conv_check_cache(const std::string&);
std::string dnn_conv_shape(cudnnTensorDescriptor_t input, void* in,
cudnnFilterDescriptor_t filterDesc, void* filter,
cudnnConvolutionDescriptor_t convDesc,
void* out);
void dnn_conv_update_cache(const std::string& hash, const AlgoRec& rec);
#section init_code_struct #section init_code_struct
reuse_algo = 0; reuse_algo = 0;
prev_algo = PARAMS->conv_algo; prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH;
prev_algo.dataType = CUDNN_DATA_FLOAT;
memset(prev_img_dims, 0, sizeof(prev_img_dims)); memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_kern_dims, 0, sizeof(prev_kern_dims)); memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
#section support_code_struct #section support_code_struct
#line 12 "dnn_fwd.c"
int reuse_algo; #include "dnn_conv_find.h"
cudnnConvolutionFwdAlgo_t prev_algo; int reuse_algo;
size_t prev_img_dims[5]; bool use_cached;
size_t prev_kern_dims[5]; AlgoRec prev_algo;
size_t prev_img_dims[5];
size_t prev_kern_dims[5];
int int
APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
...@@ -84,12 +88,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -84,12 +88,16 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
size_t output_offset = PyGpuArray_STRIDE(*output, 0) / params->num_groups; size_t output_offset = PyGpuArray_STRIDE(*output, 0) / params->num_groups;
cudnnConvolutionFwdAlgo_t algo = params->conv_algo; cudnnConvolutionFwdAlgo_t algo = params->conv_algo;
size_t worksize = 0;
cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH;
std::string hashkey = "F| GPU#";
#ifdef DEBUG #ifdef DEBUG
char algorithm_name[128]; char algorithm_name[128];
#endif #endif
cuda_enter(c->ctx); cuda_enter(c->ctx);
if (params->choose_algo) { if (params->choose_algo) {
if (!params->choose_once) { if (!params->choose_once) {
reuse_algo = 1; reuse_algo = 1;
...@@ -100,10 +108,26 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -100,10 +108,26 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]); PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
} }
} }
char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
hashkey += pci_id;
hashkey += dnn_conv_shape(APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, PyGpuArray_DEV_DATA(*output));
if (!reuse_algo) { if (!reuse_algo) {
// check out cache
const AlgoRec* cached = dnn_conv_check_cache(hashkey);
if (cached) {
prev_algo = *cached;
use_cached = 1;
}
}
if (!(reuse_algo || use_cached)) {
size_t free; size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free); int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
if (err2 != GA_NO_ERROR) { if (err2 != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the " PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
...@@ -125,6 +149,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -125,6 +149,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory"); PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory");
return -1; return -1;
} }
// We don't sync the buffer as we don't care about the values. // We don't sync the buffer as we don't care about the values.
err = cudnnFindConvolutionForwardAlgorithmEx( err = cudnnFindConvolutionForwardAlgorithmEx(
params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
...@@ -134,6 +159,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -134,6 +159,9 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
free); free);
gpudata_release(tmpmem); gpudata_release(tmpmem);
// fprintf(stderr, "(cudnnFindConvolutionForwardAlgorithmEx: (err:%d), algo: %d, mem: %ld, free: %ld\n",
// err, choice.algo, choice.memory, free);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s", "error selecting convolution algo: %s",
...@@ -142,6 +170,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -142,6 +170,12 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1; return 1;
} }
algo = choice.algo; algo = choice.algo;
prev_algo.algo = (int)algo;
prev_algo.wsSize = worksize = choice.memory;
prev_algo.mathType = mathtype = choice.mathType;
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
#ifdef DEBUG #ifdef DEBUG
if (count == 0) { if (count == 0) {
...@@ -167,31 +201,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -167,31 +201,17 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
prev_algo.algo = algo;
// no tensor_op returned from Get()
prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH;
// fprintf(stderr, "(cudnnGetConvolutionForwardAlgorithm: (err:%d), algo: %d\n", err, algo);
} }
prev_algo = algo; } else {
} else { algo = (cudnnConvolutionFwdAlgo_t)prev_algo.algo;
algo = prev_algo; worksize = prev_algo.wsSize;
mathtype = prev_algo.mathType;
} }
} else { /* choose_algo */
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
return 1;
// NB: This is printed only when algorithm is chosen at runtime.
if (reuse_algo)
fprintf(stderr, "(reused %s)\n", algorithm_name);
else
fprintf(stderr, "(using %s)\n", algorithm_name);
#endif
if (params->choose_once) {
reuse_algo = 1;
} else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
}
}
}
/* Only these algos are supported for 3d conv with cuDNN >= V5.1. */ /* Only these algos are supported for 3d conv with cuDNN >= V5.1. */
if (PyGpuArray_NDIM(input) == 5 && if (PyGpuArray_NDIM(input) == 5 &&
...@@ -269,10 +289,11 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -269,10 +289,11 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
} }
} }
} }
}/* choose_algo */
// if FindEx was used (choose_time), workspace size is set.
if (!(reuse_algo || use_cached || params->choose_time))
{ {
size_t worksize;
gpudata *workspace;
err = cudnnGetConvolutionForwardWorkspaceSize(params->handle, err = cudnnGetConvolutionForwardWorkspaceSize(params->handle,
APPLY_SPECIFIC(input), APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(kerns),
...@@ -309,7 +330,52 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -309,7 +330,52 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
// save worksize for next time/cache
prev_algo.wsSize = worksize;
// Add to the cache
if (params->choose_algo)
dnn_conv_update_cache(hashkey, prev_algo);
}
#ifdef DEBUG
if (params->choose_algo) {
if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name))
return 1;
fprintf(stderr, "%s%s algo: %d %s%s ws: %ld, tensor: %d hash:%s\n",
params->choose_algo ? "[A]": "" ,
params->choose_time ? "[T]": "" ,
algo, // algorithm_name,
reuse_algo ? "(reused)" : "",
use_cached ? "(cache)": "",
worksize, mathtype, hashkey.c_str()
);
}
#endif
if (params->choose_once) {
reuse_algo = 1;
} else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
}
}
{
gpudata *workspace = 0;
#if CUDNN_MAJOR >= 7
// CUDNN7: need to set math type
err = cudnnSetConvolutionMathType(desc, prev_algo.mathType);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error setting math type for convolution : %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
#endif
/* /*
* This is less than ideal since we need to free it after (which * This is less than ideal since we need to free it after (which
* introduces a synchronization point. But we don't have a module * introduces a synchronization point. But we don't have a module
...@@ -324,7 +390,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -324,7 +390,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1; return 1;
} }
} }
cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
...@@ -348,6 +414,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -348,6 +414,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ);
cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE);
} }
cuda_exit(c->ctx); cuda_exit(c->ctx);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
...@@ -357,3 +424,5 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -357,3 +424,5 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
} }
return 0; return 0;
} }
#section init_code_struct #section init_code_struct
prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH;
prev_algo.dataType = CUDNN_DATA_FLOAT;
reuse_algo = 0; reuse_algo = 0;
prev_algo = PARAMS->conv_algo;
memset(prev_kern_dims, 0, sizeof(prev_kern_dims)); memset(prev_kern_dims, 0, sizeof(prev_kern_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims)); memset(prev_top_dims, 0, sizeof(prev_top_dims));
#section support_code_struct #section support_code_struct
#include "dnn_conv_find.h"
int reuse_algo; #line 12 "dnn_gi.c"
cudnnConvolutionBwdDataAlgo_t prev_algo; int reuse_algo;
bool use_cached;
AlgoRec prev_algo;
size_t prev_kern_dims[5]; size_t prev_kern_dims[5];
size_t prev_top_dims[5]; size_t prev_top_dims[5];
...@@ -86,6 +89,8 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -86,6 +89,8 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
#ifdef DEBUG #ifdef DEBUG
char algorithm_name[128]; char algorithm_name[128];
#endif #endif
size_t worksize = 0;
cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH;
cuda_enter(c->ctx); cuda_enter(c->ctx);
...@@ -104,7 +109,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -104,7 +109,7 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
(PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) || (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
(PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) { (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld" PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ld"
" but received gradient with shape %ldx%ldx%ldx%ld", " but received gradient with shape %dx%dx% dx%d",
expected_output_dims[0], expected_output_dims[1], expected_output_dims[0], expected_output_dims[1],
expected_output_dims[2], expected_output_dims[3], expected_output_dims[2], expected_output_dims[3],
PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1], PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
...@@ -131,6 +136,10 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -131,6 +136,10 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
} }
} }
char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
std::string hashkey;
if (params->choose_algo) { if (params->choose_algo) {
if (!params->choose_once) { if (!params->choose_once) {
reuse_algo = 1; reuse_algo = 1;
...@@ -140,9 +149,22 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -140,9 +149,22 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
reuse_algo = (reuse_algo && reuse_algo = (reuse_algo &&
PyGpuArray_DIM(output, i) == prev_top_dims[i]); PyGpuArray_DIM(output, i) == prev_top_dims[i]);
} }
}
if (!reuse_algo) {
// check out cache
hashkey = std::string("GI | GPU#") + pci_id +
dnn_conv_shape(APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, PyGpuArray_DEV_DATA(output)
);
const AlgoRec* cached = dnn_conv_check_cache(hashkey);
if (cached) {
prev_algo = *cached;
use_cached = 1;
}
} }
if (!reuse_algo) { if (!(reuse_algo || use_cached)) {
size_t free; size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free); int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
...@@ -182,6 +204,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -182,6 +204,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
} }
algo = choice.algo; algo = choice.algo;
prev_algo.algo = (int)algo;
prev_algo.wsSize = worksize = choice.memory;
prev_algo.mathType = mathtype = choice.mathType;
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
#ifdef DEBUG #ifdef DEBUG
if (count == 0) { if (count == 0) {
...@@ -206,32 +234,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -206,32 +234,12 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
prev_algo.algo = algo;
// no tensor_op returned from Get()
prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH;
} }
prev_algo = algo;
} else {
algo = prev_algo;
}
#ifdef DEBUG
char algorithm_name[128];
if (0 != theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name))
return 1;
// NB: This is printed only when algorithm is chosen at runtime.
if (reuse_algo)
fprintf(stderr, "(reused %s)\n", algorithm_name);
else
fprintf(stderr, "(using %s)\n", algorithm_name);
#endif
if (params->choose_once) {
reuse_algo = 1;
} else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
} }
} } else { /*choose_algo */
// The FFT implementation does not support strides, 1x1 filters or inputs // The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. The tiled-FFT implementation // with a spatial dimension larger than 1024. The tiled-FFT implementation
...@@ -279,9 +287,11 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -279,9 +287,11 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
} }
} }
} }
} /* choose_algo */
size_t worksize;
gpudata *workspace; // if FindEx was used (choose_time), workspace size is set.
if (!(reuse_algo || use_cached || params->choose_time))
{
err = cudnnGetConvolutionBackwardDataWorkspaceSize( err = cudnnGetConvolutionBackwardDataWorkspaceSize(
params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc, params->handle, APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc,
...@@ -293,7 +303,47 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, ...@@ -293,7 +303,47 @@ APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output,
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
// save worksize for next time/cache
prev_algo.wsSize = worksize;
// Add to the cache
if (params->choose_algo)
dnn_conv_update_cache(hashkey, prev_algo);
}
#ifdef DEBUG
char algorithm_name[128];
if (0 != theano_enum_to_string_cudnnConvolutionBwdDataAlgo_t(algo, algorithm_name))
return 1;
// NB: This is printed only when algorithm is chosen at runtime.
if (reuse_algo)
fprintf(stderr, "(reused %s)\n", algorithm_name);
else
fprintf(stderr, "(using %s)\n", algorithm_name);
#endif
if (params->choose_once) {
reuse_algo = 1;
} else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) {
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
}
gpudata *workspace = 0;
#if CUDNN_MAJOR >= 7
// CUDNN7: need to set math type
err = cudnnSetConvolutionMathType(desc, prev_algo.mathType);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error setting math type for convolution : %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
#endif
if (worksize != 0) { if (worksize != 0) {
workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL); workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) { if (workspace == NULL) {
......
#section init_code_struct #section init_code_struct
prev_algo.algo = PARAMS->conv_algo;
prev_algo.mathType = CUDNN_DEFAULT_MATH;
prev_algo.dataType = CUDNN_DATA_FLOAT;
reuse_algo = 0; reuse_algo = 0;
prev_algo = PARAMS->conv_algo;
memset(prev_img_dims, 0, sizeof(prev_img_dims)); memset(prev_img_dims, 0, sizeof(prev_img_dims));
memset(prev_top_dims, 0, sizeof(prev_top_dims)); memset(prev_top_dims, 0, sizeof(prev_top_dims));
#section support_code_struct #section support_code_struct
#line 12 "dnn_gw.c"
int reuse_algo; #include "dnn_conv_find.h"
cudnnConvolutionBwdFilterAlgo_t prev_algo; int reuse_algo;
bool use_cached;
AlgoRec prev_algo;
size_t prev_img_dims[5]; size_t prev_img_dims[5];
size_t prev_top_dims[5]; size_t prev_top_dims[5];
...@@ -87,6 +90,10 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -87,6 +90,10 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
#ifdef DEBUG #ifdef DEBUG
char algorithm_name[128]; char algorithm_name[128];
#endif #endif
size_t worksize = 0;
cudnnMathType_t mathtype = CUDNN_DEFAULT_MATH;
std::string hashkey = "GW | GPU#";
cuda_enter(c->ctx); cuda_enter(c->ctx);
...@@ -104,8 +111,8 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -104,8 +111,8 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
(PyGpuArray_DIMS(output)[1] / params->num_groups != expected_output_dims[1]) || (PyGpuArray_DIMS(output)[1] / params->num_groups != expected_output_dims[1]) ||
(PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) || (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
(PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) { (PyGpuArray_DIMS(output)[3] != expected_output_dims[3])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%dx%ld" PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %dx%dx%dx%d"
" but received gradient with shape %ldx%ldx%dx%ld", " but received gradient with shape %ldx%ldx%ldx%ld",
expected_output_dims[0], expected_output_dims[1], expected_output_dims[0], expected_output_dims[1],
expected_output_dims[2], expected_output_dims[3], expected_output_dims[2], expected_output_dims[3],
PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1], PyGpuArray_DIMS(output)[0], PyGpuArray_DIMS(output)[1],
...@@ -119,7 +126,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -119,7 +126,7 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
(PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) || (PyGpuArray_DIMS(output)[2] != expected_output_dims[2]) ||
(PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) || (PyGpuArray_DIMS(output)[3] != expected_output_dims[3]) ||
(PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) { (PyGpuArray_DIMS(output)[4] != expected_output_dims[4])) {
PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %ldx%ldx%ldx%ldx%ld" PyErr_Format(PyExc_ValueError, "impossible convolution output dim: expected %dx%dx%dx%dx%d"
" but received gradient with shape %ldx%ldx%ldx%ldx%ld", " but received gradient with shape %ldx%ldx%ldx%ldx%ld",
expected_output_dims[0], expected_output_dims[1], expected_output_dims[0], expected_output_dims[1],
expected_output_dims[2], expected_output_dims[3], expected_output_dims[2], expected_output_dims[3],
...@@ -143,7 +150,26 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -143,7 +150,26 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
} }
} }
if (!reuse_algo) { char pci_id[16];
gpucontext_property(c->ctx, GA_CTX_PROP_PCIBUSID, pci_id);
hashkey = pci_id;
hashkey = dnn_conv_shape(APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns),
desc, PyGpuArray_DEV_DATA(output)
);
if (!reuse_algo) {
// check out cache
const AlgoRec* cached = dnn_conv_check_cache(hashkey);
if (cached) {
prev_algo = *cached;
use_cached = 1;
}
}
if (!(reuse_algo || use_cached)) {
size_t free; size_t free;
int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free); int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free);
...@@ -184,6 +210,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -184,6 +210,12 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
} }
algo = choice.algo; algo = choice.algo;
prev_algo.algo = (int)algo;
prev_algo.wsSize = worksize = choice.memory;
prev_algo.mathType = mathtype = choice.mathType;
// Add to the cache
dnn_conv_update_cache(hashkey, prev_algo);
#ifdef DEBUG #ifdef DEBUG
if (count == 0) { if (count == 0) {
...@@ -209,32 +241,16 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -209,32 +241,16 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
prev_algo.algo = algo;
// no tensor_op returned from Get()
prev_algo.mathType = mathtype = CUDNN_DEFAULT_MATH;
} }
prev_algo = algo; } else {
} else { algo = (cudnnConvolutionBwdFilterAlgo_t)prev_algo.algo;
algo = prev_algo; worksize = prev_algo.wsSize;
} mathtype = prev_algo.mathType;
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name))
return 1;
// NB: This is printed only when algorithm is chosen at runtime.
if (reuse_algo)
fprintf(stderr, "(reused %s)\n", algorithm_name);
else
fprintf(stderr, "(using %s)\n", algorithm_name);
#endif
if (params->choose_once) {
reuse_algo = 1;
} else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
} }
} } else {
// The FFT implementation does not support strides, 1x1 filters or inputs // The FFT implementation does not support strides, 1x1 filters or inputs
// with a spatial dimension larger than 1024. // with a spatial dimension larger than 1024.
// If the chosen implementation is FFT, validate that it can // If the chosen implementation is FFT, validate that it can
...@@ -267,9 +283,11 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -267,9 +283,11 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0; algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
} }
} }
}/* choose_algo */
size_t worksize; // if FindEx was used (choose_time), workspace size is set.
gpudata *workspace; if (!(reuse_algo || use_cached || params->choose_time))
{
err = cudnnGetConvolutionBackwardFilterWorkspaceSize( err = cudnnGetConvolutionBackwardFilterWorkspaceSize(
params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc,
...@@ -281,7 +299,49 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, ...@@ -281,7 +299,49 @@ APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output,
cuda_exit(c->ctx); cuda_exit(c->ctx);
return 1; return 1;
} }
// save worksize for next time/cache
prev_algo.wsSize = worksize;
// Add to the cache
if (params->choose_algo)
dnn_conv_update_cache(hashkey, prev_algo);
}
#ifdef DEBUG
if (0 != theano_enum_to_string_cudnnConvolutionBwdFilterAlgo_t(algo, algorithm_name))
return 1;
// NB: This is printed only when algorithm is chosen at runtime.
fprintf(stderr, "%s%s algo: %d %s%s ws: %ld, tensor: %d hash:%s\n",
params->choose_algo ? "[A]": "" ,
params->choose_time ? "[T]": "" ,
algo, // algorithm_name,
reuse_algo ? "(reused)" : "",
use_cached ? "(cache)": "",
worksize, mathtype, hashkey.c_str()
);
#endif
if (params->choose_once) {
reuse_algo = 1;
} else {
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_top_dims[i] = PyGpuArray_DIM(output, i);
}
}
gpudata *workspace = 0;
#if CUDNN_MAJOR >= 7
// CUDNN7: need to set math type
err = cudnnSetConvolutionMathType(desc, prev_algo.mathType);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error setting math type for convolution : %s",
cudnnGetErrorString(err));
cuda_exit(c->ctx);
return 1;
}
#endif
if (worksize != 0) { if (worksize != 0) {
workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL); workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL);
if (workspace == NULL) { if (workspace == NULL) {
......
...@@ -399,7 +399,7 @@ class DnnBase(COp): ...@@ -399,7 +399,7 @@ class DnnBase(COp):
return [] return []
def c_code_cache_version(self): def c_code_cache_version(self):
return (super(DnnBase, self).c_code_cache_version(), version(), 1) return (super(DnnBase, self).c_code_cache_version(), version(), 2)
class GpuDnnConvDesc(COp): class GpuDnnConvDesc(COp):
...@@ -567,7 +567,7 @@ class GpuDnnConv(DnnBase): ...@@ -567,7 +567,7 @@ class GpuDnnConv(DnnBase):
num_groups=int_t) num_groups=int_t)
def __init__(self, algo=None, inplace=False, num_groups=1): def __init__(self, algo=None, inplace=False, num_groups=1):
DnnBase.__init__(self, ["c_code/dnn_conv_base.c", "c_code/dnn_fwd.c"], DnnBase.__init__(self, ["c_code/dnn_conv_base.c", "c_code/dnn_conv_find.c", "c_code/dnn_fwd.c"],
"APPLY_SPECIFIC(conv_fwd)") "APPLY_SPECIFIC(conv_fwd)")
if algo is None: if algo is None:
...@@ -710,7 +710,7 @@ class GpuDnnConvGradW(DnnBase): ...@@ -710,7 +710,7 @@ class GpuDnnConvGradW(DnnBase):
num_groups=int_t) num_groups=int_t)
def __init__(self, inplace=False, algo=None, num_groups=1): def __init__(self, inplace=False, algo=None, num_groups=1):
DnnBase.__init__(self, ["c_code/dnn_conv_base.c", "c_code/dnn_gw.c"], DnnBase.__init__(self, ["c_code/dnn_conv_base.c", "c_code/dnn_conv_find.c", "c_code/dnn_gw.c"],
"APPLY_SPECIFIC(conv_gw)") "APPLY_SPECIFIC(conv_gw)")
self.inplace = bool(inplace) self.inplace = bool(inplace)
if self.inplace: if self.inplace:
...@@ -846,7 +846,7 @@ class GpuDnnConvGradI(DnnBase): ...@@ -846,7 +846,7 @@ class GpuDnnConvGradI(DnnBase):
num_groups=int_t) num_groups=int_t)
def __init__(self, inplace=False, algo=None, num_groups=1): def __init__(self, inplace=False, algo=None, num_groups=1):
DnnBase.__init__(self, ["c_code/dnn_conv_base.c", "c_code/dnn_gi.c"], DnnBase.__init__(self, ["c_code/dnn_conv_base.c", "c_code/dnn_conv_find.c", "c_code/dnn_gi.c"],
"APPLY_SPECIFIC(conv_gi)") "APPLY_SPECIFIC(conv_gi)")
self.inplace = bool(inplace) self.inplace = bool(inplace)
if self.inplace: if self.inplace:
......
...@@ -1180,15 +1180,19 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) { ...@@ -1180,15 +1180,19 @@ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) {
old = *base; old = *base;
do { do {
assumed = old; assumed = old;
sum = __float2half_rn( ga_half old_perm;
__HALF_TO_US(old_perm) = __byte_perm(old, 0,
((ga_size)addr & 2) ? 0x4432 : 0x4410);
sum = __float2half_as_us(
__half2float(val) + __half2float(val) +
__half2float((ga_half)__byte_perm(old, 0, __half2float(old_perm));
((ga_size)addr & 2) ? 0x4432 : 0x4410)));
new_ = __byte_perm(old, sum, ((ga_size)addr & 2) ? 0x5410 : 0x3254); new_ = __byte_perm(old, sum, ((ga_size)addr & 2) ? 0x5410 : 0x3254);
old = atomicCAS(base, assumed, new_); old = atomicCAS(base, assumed, new_);
} while (assumed != old); } while (assumed != old);
return (ga_half)__byte_perm(old, 0, ga_half ret;
((ga_size)addr & 2) ? 0x4432 : 0x4410); __HALF_TO_US(ret) = __byte_perm(old, 0,
((ga_size)addr & 2) ? 0x4432 : 0x4410);
return ret;
} }
__device__ ga_half atomicExch(ga_half *addr, ga_half val) { __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
...@@ -1197,13 +1201,14 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) { ...@@ -1197,13 +1201,14 @@ __device__ ga_half atomicExch(ga_half *addr, ga_half val) {
old = *base; old = *base;
do { do {
assumed = old; assumed = old;
new_ = __byte_perm(old, val, ((ga_size)addr & 2) ? 0x5410 : 0x3254); new_ = __byte_perm(old, __HALF_TO_US(val), ((ga_size)addr & 2) ? 0x5410 : 0x3254);
old = atomicCAS(base, assumed, new_); old = atomicCAS(base, assumed, new_);
} while (assumed != old); } while (assumed != old);
return (ga_half)__byte_perm(old, 0, ga_half ret;
((ga_size)addr & 2) ? 0x4432 : 0x4410); __HALF_TO_US(ret) =__byte_perm(old, 0,
((ga_size)addr & 2) ? 0x4432 : 0x4410);
return ret;
} }
KERNEL void k_vector_add_fast(const ga_size numRowsX, KERNEL void k_vector_add_fast(const ga_size numRowsX,
const ga_size numColsX, const ga_size numColsX,
const ga_ssize stridesX0, const ga_ssize stridesX0,
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论