提交 59dcaf9c authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Convert GpuDnnConv to v3 (and use the new facilities).

上级 88ed910a
...@@ -348,55 +348,97 @@ class GpuDnnConv(DnnBase, COp): ...@@ -348,55 +348,97 @@ class GpuDnnConv(DnnBase, COp):
kernel kernel
descr descr
The convolution descriptor. The convolution descriptor.
workmem algo : {'small', 'none', 'large', 'fft', 'guess_once', 'guess_on_shape_change', 'time_once', 'time_on_shape_change'}
Either 'none', 'small' or 'large'. Default is the value of Default is the value of :attr:`config.dnn.conv.algo_fwd`.
:attr:`config.dnn.conv.workmem`.
""" """
__props__ = ('workmem', 'inplace') __props__ = ('algo', 'inplace')
def __init__(self, workmem=None, inplace=False): def __init__(self, algo=None, inplace=False):
COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_fwd.c"], COp.__init__(self, ["dnn_base.c", "dnn_conv_base.c", "dnn_fwd.c"],
"APPLY_SPECIFIC(conv_fwd)") "APPLY_SPECIFIC(conv_fwd)")
if workmem is None:
workmem = config.dnn.conv.workmem if algo is None:
self.workmem = workmem algo = config.dnn.conv.algo_fwd
self.algo = algo
self.inplace = inplace self.inplace = inplace
if self.inplace: if self.inplace:
self.destroy_map = {0: [2]} self.destroy_map = {0: [2]}
assert self.workmem in ['none', 'small', 'large']
if version() < 3000:
if self.algo == 'fft':
raise RuntimeError("CuDNN FFT convolution requires CuDNN v3")
elif self.algo in ['guess_once', 'guess_on_shape_change']:
raise RuntimeError("CuDNN selection of convolution "
"implementation based on heuristics "
"requires CuDNN v3")
elif self.algo in ['time_once', 'time_on_shape_change']:
raise RuntimeError("CuDNN convolution timing requires CuDNN v3")
assert self.algo in ['none', 'small', 'large', 'fft', 'guess_once',
'guess_on_shape_change', 'time_once',
'time_on_shape_change']
def __setstate__(self, d):
self.__dict__.update(d)
if not hasattr(self, 'algo'):
if hasattr(self, 'workmem'):
self.algo = self.workmem
else:
self.algo = config.dnn.conv.algo_fwd
if not hasattr(self, 'inplace'):
self.inplace = False
def get_op_params(self): def get_op_params(self):
defs = []
if self.inplace: if self.inplace:
inpl_def = [('CONV_INPLACE', '1')] defs.append(('CONV_INPLACE', '1'))
else:
inpl_def = [] alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
if version() == -1: if self.algo == 'none':
alg_def = ('CONV_ALGO', "0") alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM'
else: elif self.algo == 'small':
if self.workmem == 'none': alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM'
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM' elif self.algo == 'large':
elif self.workmem == 'small': alg = 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM'
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM' elif self.algo == 'fft':
elif self.workmem == 'large': alg = 'CUDNN_CONVOLUTION_FWD_ALGO_FFT'
alg = 'CUDNN_CONVOLUTION_FWD_ALGO_GEMM' defs.append(('CONV_ALGO', alg))
alg_def = ('CONV_ALGO', alg)
return [alg_def] + inpl_def if self.algo in ['guess_once', 'guess_on_shape_change',
'time_once', 'time_on_shape_change']:
defs.append(('CHOOSE_ALGO', ''))
if self.algo in ['guess_once', 'time_once']:
defs.append(('CHOOSE_ONCE', ''))
if self.algo in ['time_once', 'time_on_shape_change']:
defs.append(('CHOOSE_TIME', ''))
return defs
def make_node(self, img, kern, output, desc, alpha=None, beta=None): def make_node(self, img, kern, output, desc, alpha=None, beta=None):
img = as_gpuarray_variable(img) img = as_gpuarray_variable(img)
kern = as_gpuarray_variable(kern) kern = as_gpuarray_variable(kern)
output = as_gpuarray_variable(output) output = as_gpuarray_variable(output)
if img.type.ndim != 4: if img.type.ndim not in (4, 5):
raise TypeError('img must be 4D tensor') raise TypeError('img must be 4D or 5D tensor')
if kern.type.ndim != 4: if kern.type.ndim not in (4, 5):
raise TypeError('kern must be 4D tensor') raise TypeError('kern must be 4D or 5D tensor')
if output.type.ndim != 4: if output.type.ndim not in (4, 5):
raise TypeError('output must be a 4D tensor') raise TypeError('output must be a 4D or 5D tensor')
if not isinstance(desc.type, CDataType) \ if (img.type.ndim != kern.type.ndim or
or desc.type.ctype != 'cudnnConvolutionDescriptor_t': img.type.ndim != output.type.ndim):
raise TypeError("The number of dimensions of "
"img, kern and output must match")
if img.type.ndim == 5 and self.algo == 'fft':
raise ValueError("convolution algo fft can't be used for "
"3d convolutions")
if (not isinstance(desc.type, CDataType) or
desc.type.ctype != 'cudnnConvolutionDescriptor_t'):
raise TypeError('desc must be cudnnConvolutionDescriptor_t') raise TypeError('desc must be cudnnConvolutionDescriptor_t')
alpha = ensure_dt(alpha, _one, 'alpha', img.dtype) alpha = ensure_dt(alpha, _one, 'alpha', img.dtype)
...@@ -438,22 +480,41 @@ class GpuDnnConv(DnnBase, COp): ...@@ -438,22 +480,41 @@ class GpuDnnConv(DnnBase, COp):
kh = kshape[2] # Height of each filter kh = kshape[2] # Height of each filter
kw = kshape[3] # Width of each filter kw = kshape[3] # Width of each filter
sh, sw = subsample nd = len(subsample)
if nd > 2:
d = ishape[4]
kd = ishape[4]
sh = subsample[0]
sw = subsample[1]
if nd > 2:
sd = subsample[2]
if border_mode == 'full': if border_mode == 'full':
padh = kh - 1 padh = kh - 1
padw = kw - 1 padw = kw - 1
if nd > 4:
padd = kd - 1
elif isinstance(border_mode, tuple): elif isinstance(border_mode, tuple):
padh, padw = border_mode padh = border_mode[0]
padw = border_mode[1]
if nd > 2:
padd = border_mode[2]
else: else:
assert border_mode == 'valid' assert border_mode == 'valid'
padh = 0 padh = 0
padw = 0 padw = 0
padd = 0
res = [b, nb,
(h + 2 * padh - kh) // sh + 1,
(w + 2 * padw - kw) // sw + 1]
if nd > 2:
res.append(d + 2 * padd - kd // sd + 1)
return ( return res
b, nb,
(h + 2 * padh - kh) // sh + 1,
(w + 2 * padw - kw) // sw + 1
)
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
return [shape[2]] return [shape[2]]
...@@ -607,7 +668,8 @@ class GpuDnnConvGradI(DnnBase): ...@@ -607,7 +668,8 @@ class GpuDnnConvGradI(DnnBase):
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode='conv', direction_hint=None, workmem=None): conv_mode='conv', direction_hint=None, workmem=None,
algo=None):
""" """
GPU convolution using cuDNN from NVIDIA. GPU convolution using cuDNN from NVIDIA.
...@@ -631,19 +693,19 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -631,19 +693,19 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
direction_hint direction_hint
Used by graph optimizers to change algorithm choice. Used by graph optimizers to change algorithm choice.
By default, GpuDnnConv will be used to carry out the convolution. By default, GpuDnnConv will be used to carry out the convolution.
If border_mode is 'valid', subsample is (1,1) and direction_hint is If border_mode is 'valid', subsample is (1, 1) and direction_hint is
'bprop weights', it will use GpuDnnConvGradW. 'bprop weights', it will use GpuDnnConvGradW.
If border_mode is 'full', subsample is (1,1) and direction_hint is If border_mode is 'full', subsample is (1, 1) and direction_hint is
*not* 'forward!', it will use GpuDnnConvGradI. *not* 'forward!', it will use GpuDnnConvGradI.
This parameter is used internally by graph optimizers and may be This parameter is used internally by graph optimizers and may be
removed at any time without a deprecation period. You have been warned. removed at any time without a deprecation period. You have been warned.
workmem algo : {'none', 'small', 'large', 'fft', 'guess_once', 'guess_on_shape_change', 'time_once', 'time_on_shape_change'}
Specify the amount of working memory allowed. More memory is usually Convolution implementation to use. Some of its values may
faster. One of 'none', 'small' or 'large' (default is None which takes require certain versions of CuDNN to be installed. Default is
its value from :attr:`config.dnn.conv.workmem`). the value of :attr:`config.dnn.conv.algo_fwd`.
.. warning:: The cuDNN library only works with GPU that have a compute .. warning:: The cuDNN library only works with GPUs that have a compute
capability of 3.0 or higer. This means that older GPU will not capability of 3.0 or higer. This means that older GPUs will not
work with this Op. work with this Op.
""" """
...@@ -696,7 +758,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -696,7 +758,7 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
desc_op.border_mode, desc_op.border_mode,
desc_op.subsample) desc_op.subsample)
out = GpuAllocEmpty(img.dtype)(*out_shp) out = GpuAllocEmpty(img.dtype)(*out_shp)
return GpuDnnConv(workmem=workmem)(img, kerns, out, desc) return GpuDnnConv(algo=algo)(img, kerns, out, desc)
class GpuDnnPoolDesc(Op): class GpuDnnPoolDesc(Op):
...@@ -1563,7 +1625,7 @@ def local_dnn_conv_inplace(node): ...@@ -1563,7 +1625,7 @@ def local_dnn_conv_inplace(node):
isinstance(dest.owner.op, GpuAllocEmpty) and isinstance(dest.owner.op, GpuAllocEmpty) and
len(dest.clients) > 1): len(dest.clients) > 1):
inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs) inputs[2] = GpuAllocEmpty(dest.owner.op.dtype)(*dest.owner.inputs)
return [GpuDnnConv(workmem=node.op.workmem, inplace=True)(*inputs)] return [GpuDnnConv(algo=node.op.algo, inplace=True)(*inputs)]
@local_optimizer([GpuDnnConvGradW], inplace=True) @local_optimizer([GpuDnnConvGradW], inplace=True)
...@@ -1604,7 +1666,7 @@ optdb.register('local_dnna_conv_inplace', ...@@ -1604,7 +1666,7 @@ optdb.register('local_dnna_conv_inplace',
def local_dnn_conv_alpha_merge(node, *inputs): def local_dnn_conv_alpha_merge(node, *inputs):
if not dnn_available() or version() == -1: if not dnn_available() or version() == -1:
return None return None
return [GpuDnnConv(workmem=node.op.workmem)(*inputs)] return [GpuDnnConv(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
static cudnnHandle_t _handle = NULL; static cudnnHandle_t _handle = NULL;
static int static int
c_set_tensor4d(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) { c_set_tensorNd(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
cudnnDataType_t dt; cudnnDataType_t dt;
size_t ds; size_t ds;
switch (var->ga.typecode) { switch (var->ga.typecode) {
...@@ -12,26 +12,37 @@ c_set_tensor4d(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) { ...@@ -12,26 +12,37 @@ c_set_tensor4d(PyGpuArrayObject *var, cudnnTensorDescriptor_t desc) {
case GA_DOUBLE: case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE; dt = CUDNN_DATA_DOUBLE;
break; break;
#ifdef CUDNN_VERSION > 3000
case GA_HALF:
dt = CUDNN_DATA_HALF;
break;
#endif
default: default:
PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensor4d"); PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_tensorNd");
return -1; return -1;
} }
ds = gpuarray_get_elsize(var->ga.typecode); ds = gpuarray_get_elsize(var->ga.typecode);
int str0, str1, str2, str3; int strs[5], dims[5], default_stride = 1;
// cudnn do not like 0s in strides unsigned int nd = PyGpuArray_NDIM(var);
str3 = PyGpuArray_STRIDES(var)[3]?PyGpuArray_STRIDES(var)[3]/ds:1;
str2 = PyGpuArray_STRIDES(var)[2]?PyGpuArray_STRIDES(var)[2]/ds:PyGpuArray_DIMS(var)[3]; if (nd > 5) {
str1 = PyGpuArray_STRIDES(var)[1]?PyGpuArray_STRIDES(var)[1]/ds:PyGpuArray_DIMS(var)[2]*PyGpuArray_DIMS(var)[3]; PyErr_SetString(PyExc_TypeError, "Tensor of more than 5d");
str0 = PyGpuArray_STRIDES(var)[0]?PyGpuArray_STRIDES(var)[0]/ds:PyGpuArray_DIMS(var)[2]*PyGpuArray_DIMS(var)[3]*PyGpuArray_DIMS(var)[1]; return -1;
cudnnStatus_t err = cudnnSetTensor4dDescriptorEx( }
desc, dt,
PyGpuArray_DIM(var, 0), PyGpuArray_DIM(var, 1), for (unsigned int _i = nd; _i > 0; _i--) {
PyGpuArray_DIM(var, 2), PyGpuArray_DIM(var, 3), unsigned int i = _i - 1;
str0, str1, str2, str3); strs[i] = PyGpuArray_STRIDE(var, i) ?
PyGpuArray_STRIDE(var, i)/ds : default_stride;
default_stride *= PyGpuArray_DIM(var, i);
dims[i] = PyGpuArray_DIM(var, i);
}
cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, dt, nd, dims, strs);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"Could not set tensor4d descriptor: %s", "Could not set tensorNd descriptor: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return -1; return -1;
} }
...@@ -53,14 +64,30 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) { ...@@ -53,14 +64,30 @@ c_set_filter(PyGpuArrayObject *var, cudnnFilterDescriptor_t desc) {
case GA_DOUBLE: case GA_DOUBLE:
dt = CUDNN_DATA_DOUBLE; dt = CUDNN_DATA_DOUBLE;
break; break;
#ifdef CUDNN_VERSION > 3000
case GA_HALF:
dt = CUDNN_DATA_HALF;
break;
#endif
default: default:
PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_filter"); PyErr_SetString(PyExc_TypeError, "Non-float datatype in c_set_filter");
return -1; return -1;
} }
cudnnStatus_t err = cudnnSetFilter4dDescriptor(
desc, dt, int dims[5];
PyGpuArray_DIMS(var)[0], PyGpuArray_DIMS(var)[1], unsigned int nd = PyGpuArray_NDIM(var);
PyGpuArray_DIMS(var)[2], PyGpuArray_DIMS(var)[3]);
if (nd > 5) {
PyErr_SetString(PyExc_TypeError, "Tensor of more than 5d");
return -1;
}
for (unsigned int _i = nd; _i > 0; _i--) {
unsigned int i = _i - 1;
dims[i] = PyGpuArray_DIM(var, i);
}
cudnnStatus_t err = cudnnSetFilterNdDescriptor(desc, dt, nd, dims);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"Could not set filter descriptor: %s.", "Could not set filter descriptor: %s.",
......
...@@ -10,12 +10,12 @@ APPLY_SPECIFIC(input) = NULL; ...@@ -10,12 +10,12 @@ APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL; APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(kerns) = NULL; APPLY_SPECIFIC(kerns) = NULL;
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) { if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor " PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err))); "(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL; FAIL;
} }
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) { if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor " PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err))); "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL; FAIL;
} }
......
...@@ -13,11 +13,11 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -13,11 +13,11 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) { if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"GpuDnnConv images and kernel must have the same stack size"); "images and kernel must have the same stack size");
return 1; return 1;
} }
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1)
return 1; return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
return 1; return 1;
...@@ -28,6 +28,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -28,6 +28,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
beta_p = (void *)&beta; beta_p = (void *)&beta;
break; break;
case GA_FLOAT: case GA_FLOAT:
case GA_HALF:
alpha_p = (void *)&af; alpha_p = (void *)&af;
beta_p = (void *)&bf; beta_p = (void *)&bf;
break; break;
...@@ -49,29 +50,147 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -49,29 +50,147 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
return 1; return 1;
#endif #endif
if (c_set_tensor4d(*output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensorNd(*output, APPLY_SPECIFIC(output)) == -1)
return 1; return 1;
cudnnConvolutionFwdAlgo_t algo = CONV_ALGO;
#ifdef CHOOSE_ALGO
/* Static variables are only initialized once so this will not
* reset the previous algo every time */
static int reuse_algo = 0;
static cudnnConvolutionFwdAlgo_t prev_algo = CONV_ALGO;
#ifndef CHOOSE_ONCE
static size_t prev_img_dims[5] = {0};
static size_t prev_kern_dims[5] = {0};
reuse_algo = 1;
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(input, i) == prev_img_dims[i]);
reuse_algo = (reuse_algo &&
PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]);
}
#endif
if (!reuse_algo) {
#ifdef CHOOSE_TIME
int count;
cudnnConvolutionFwdAlgoPerf_t choice;
err = cudnnFindConvolutionForwardAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output), 1, &count, &choice);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
return 1;
}
algo = choice.algo;
#else
size_t free = 0, total = 0;
cudaError_t err2 = cudaMemGetInfo(&free, &total);
if (err2 != cudaSuccess) {
PyErr_Format(PyExc_RuntimeError, "Error when trying to find the "
"memory information on the GPU: %s\n",
cudaGetErrorString(err2));
return 1;
}
err = cudnnGetConvolutionForwardAlgorithm(
_handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns),
desc, APPLY_SPECIFIC(output),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error selecting convolution algo: %s",
cudnnGetErrorString(err));
return 1;
}
#endif
prev_algo = algo;
} else {
algo = prev_algo;
}
#ifdef CHOOSE_ONCE
reuse_algo = 1;
#else
for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) {
prev_img_dims[i] = PyGpuArray_DIM(input, i);
prev_kern_dims[i] = PyGpuArray_DIM(kerns, i);
}
#endif
#endif
/* These two algos are not supported for 3d conv */
if (PyGpuArray_NDIM(input) == 5 &&
(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
algo == CUDNN_CONVOLUTION_FWD_ALGO_GEMM))
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
#if CUDNN_VERSION > 3000
if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) {
int nd;
int pad[2];
int stride[2];
int upscale[2];
cudnnConvolutionMode_t mode;
err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride,
upscale, &mode);
if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError,
"error getting convolution properties: %s",
cudnnGetErrorString(err));
return 1;
}
if (stride[0] != 1 || stride[1] != 1 ||
PyGpuArray_DIM(input, 0) > 1024 || PyGpuArray_DIM(input, 1) > 1024 ||
(PyGpuArray_DIM(kerns, 0) == 1 && PyGpuArray_DIM(kerns, 1) == 1)) {
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
}
}
#endif
#if CUDNN_VERSION < 3000
/* cuDNN before v3 does not support kernels larger than input even
* if appropriate padding is selected. */
for (unsigned int i = 2; i < PyGpuArray_NDIM(input); i++) {
if (PyGpuArray_DIM(kerns, i) > PyGpuArray_DIM(input, i)) {
PyErr_SetString(PyExc_RuntimeError, "the current version "
"of CuDNN does not support kernels larger than the "
"inputs in any spatial dimension, even if the inputs "
"are padded such that the padded inputs are larger "
"than the kernels. Update your installation of CuDNN "
"to V3 or more recent to solve the issue.");
return 1;
}
}
#endif
{ {
size_t worksize; size_t worksize;
gpudata *workspace; gpudata *workspace;
PyGpuContextObject *c; PyGpuContextObject *c;
err = cudnnGetConvolutionForwardWorkspaceSize(_handle, err = cudnnGetConvolutionForwardWorkspaceSize(_handle,
APPLY_SPECIFIC(input), APPLY_SPECIFIC(input),
APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(kerns),
desc, desc,
APPLY_SPECIFIC(output), APPLY_SPECIFIC(output),
CONV_ALGO, algo,
&worksize); &worksize);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv: error getting worksize: %s", "error getting worksize: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
} }
/* /*
* This is less than ideal since we need to free it after (which * This is less than ideal since we need to free it after (which
* introduces a synchronization point. But we don't have a module * introduces a synchronization point. But we don't have a module
* to place a nice get_work_mem() function in. * to place a nice get_work_mem() function in.
...@@ -91,7 +210,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -91,7 +210,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
alpha_p, alpha_p,
APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input),
APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns),
desc, CONV_ALGO, desc, algo,
worksize == 0 ? NULL : *(void **)workspace, worksize, worksize == 0 ? NULL : *(void **)workspace, worksize,
beta_p, beta_p,
APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output)); APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output));
...@@ -101,7 +220,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, ...@@ -101,7 +220,7 @@ APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns,
} }
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_RuntimeError, "GpuDnnConv: error doing operation: %s", PyErr_Format(PyExc_RuntimeError, "error doing operation: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
} }
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论