Commit 6117f98b, authored by Nicolas Ballas, committed by --global

add cudnnv3 conv3d

Parent commit: 34223240
......@@ -127,6 +127,30 @@ if (%(err)s != CUDNN_STATUS_SUCCESS) {
""" % dict(var=var, err=err, desc=desc, fail=fail)
def c_set_tensorNd(var, nb_dim, desc, err, fail):
    """Return C code that fills the cudnnTensorDescriptor_t `desc` from the
    CudaNdarray `var` with `nb_dim` dimensions.

    All five arguments are C expressions/snippets substituted into the
    template: `var` the CudaNdarray*, `nb_dim` the dimension count,
    `desc` the descriptor, `err` the cudnnStatus_t lvalue, `fail` the
    failure snippet.
    """
    # NOTE(review): the original template contained garbled text
    # ("CudaNdarra"), a stray ");", a PyErr_Format call missing the comma
    # before cudnnGetErrorString, and an unescaped %d that made the
    # %-dict substitution below raise TypeError at call time.
    return """
{
    %(err)s = cudnnSetTensorNdDescriptor(
        %(desc)s, CUDNN_DATA_FLOAT, %(nb_dim)s,
        CudaNdarray_HOST_DIMS(%(var)s),
        CudaNdarray_HOST_STRIDES(%(var)s));
    if (%(err)s != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "could not set tensorNd descriptor: %%s, nb_dim=%(nb_dim)s",
                     cudnnGetErrorString(%(err)s));
        %(fail)s
    }
}
""" % dict(var=var, nb_dim=nb_dim, err=err, desc=desc, fail=fail)
class DnnBase(GpuOp, COp):
"""
......@@ -359,10 +383,146 @@ AddConfigVar('dnn.conv.workmem_bwd',
EnumStr('deterministic', 'none', 'fft', 'guess'),
in_c_key=False)
class GpuDnnConv3dDesc(GpuOp):
    """This Op builds a 3d convolution descriptor for use in the other
    3d convolution operations.

    See the doc of :func:`dnn_conv` for a description of the parameters.
    """

    __props__ = ('border_mode', 'subsample', 'conv_mode')

    def c_headers(self):
        return ['cudnn.h', 'cudnn_helper.h']

    def c_header_dirs(self):
        return [os.path.dirname(__file__)]

    def c_libraries(self):
        return ['cudnn']

    def c_compiler(self):
        return NVCC_compiler

    def __init__(self, border_mode, subsample=(1, 1, 1), conv_mode='conv'):
        """
        :param border_mode: 'valid', 'full', an int (same zero padding on
            all three spatial axes) or a tuple of three non-negative ints
            (pad_d, pad_h, pad_w).
        :param subsample: three strides, one per spatial axis.
            BUG FIX: the previous default of (1, 1) could never pass the
            length-3 assertion below.
        :param conv_mode: 'conv' (kernels flipped) or 'cross'.
        """
        if isinstance(border_mode, int):
            border_mode = (border_mode, border_mode, border_mode)
        if isinstance(border_mode, tuple):
            pad_d, pad_h, pad_w = map(int, border_mode)
            border_mode = (pad_d, pad_h, pad_w)
        if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or
                border_mode in ('valid', 'full')):
            raise ValueError(
                'invalid border_mode {}, which must be either '
                '"valid", "full", an integer or a tuple of'
                ' three integers'.format(border_mode))
        self.border_mode = border_mode
        assert len(subsample) == 3
        self.subsample = subsample
        assert conv_mode in ('conv', 'cross')
        self.conv_mode = conv_mode

    def make_node(self, img_shape, kern_shape):
        if img_shape.type.ndim != 1 or img_shape.type.dtype != 'int64':
            raise TypeError('img must be 1D shape tensor')
        if kern_shape.type.ndim != 1 or kern_shape.type.dtype != 'int64':
            raise TypeError('kern must be 1D shape tensor')

        return Apply(self, [img_shape, kern_shape],
                     [CDataType("cudnnConvolutionDescriptor_t")()])

    def c_code(self, node, name, inputs, outputs, sub):
        img_shape, kern_shape = inputs
        desc, = outputs

        if isinstance(self.border_mode, tuple):
            pad_d_spec, pad_h_spec, pad_w_spec = map(int, self.border_mode)
            # BUG FIX: `assert pad_d_spec, pad_h_spec >= 0 and ...` was an
            # assert-with-message that only checked pad_d_spec's truthiness.
            assert pad_d_spec >= 0 and pad_h_spec >= 0 and pad_w_spec >= 0
            bmode = 2
        else:
            pad_d_spec = pad_h_spec = pad_w_spec = 0
            if self.border_mode == "valid":
                bmode = 1
            else:
                assert self.border_mode == "full"
                bmode = 0

        if self.conv_mode == 'conv':
            conv_flag = 'CUDNN_CONVOLUTION'
        else:
            conv_flag = 'CUDNN_CROSS_CORRELATION'

        return """
{
  cudnnStatus_t err;
  int pad[3];

  if ((err = cudnnCreateConvolutionDescriptor(&%(desc)s)) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate convolution "
                 "descriptor: %%s", cudnnGetErrorString(err));
    %(fail)s
  }

  if (%(bmode)d == 2) {
    pad[0] = %(pad_d_spec)d;
    pad[1] = %(pad_h_spec)d;
    pad[2] = %(pad_w_spec)d;
  } else if (%(bmode)d == 1) {
    pad[0] = 0;
    pad[1] = 0;
    pad[2] = 0;
  } else if (%(bmode)d == 0) {
    /* 'full' mode: pad by (kernel spatial size - 1) on each side. */
    pad[0] = *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 2) - 1;
    pad[1] = *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 3) - 1;
    pad[2] = *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 4) - 1;
  } else {
    PyErr_SetString(PyExc_ValueError, "bad border mode");
    %(fail)s
  }

#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 30
  int subsample[3];
  subsample[0] = %(subsd)d;
  subsample[1] = %(subsx)d;
  subsample[2] = %(subsy)d;
  int upscale[3] = {1, 1, 1};
  err = cudnnSetConvolutionNdDescriptor(
  %(desc)s,
  3,
  pad, subsample, upscale,
  %(conv_flag)s
  );
#else
  /* BUG FIX: without %(fail)s the pre-v3 branch set a Python error but
     fell through as if it had succeeded (err was still SUCCESS). */
  PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: CUDNN_VERSION must be >= 30");
  %(fail)s
#endif
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
                 cudnnGetErrorString(err));
    %(fail)s
  }
}
""" % dict(name=name, img_shape=img_shape, kern_shape=kern_shape, desc=desc,
           bmode=bmode, conv_flag=conv_flag, fail=sub['fail'],
           subsd=self.subsample[0], subsx=self.subsample[1],
           subsy=self.subsample[2],
           pad_d_spec=pad_d_spec, pad_h_spec=pad_h_spec,
           pad_w_spec=pad_w_spec)

    def c_code_cache_version(self):
        # Caching deliberately disabled while this Op is under development.
        return None
        # return (2, version())
# scalar constants
# Scalar constants reused as the default alpha/beta/nb_dim inputs of the
# cuDNN conv Ops below (see ensure_float / ensure_int call sites).
_zero = constant(numpy.asarray(0.0, dtype='float32'))  # default beta
_one = constant(numpy.asarray(1.0, dtype='float32'))   # default alpha
_ifour = constant(numpy.asarray(4, dtype='int32'))     # nb_dim for 2d conv (4D tensors)
_ifive = constant(numpy.asarray(5, dtype='int32'))     # nb_dim for 3d conv (5D tensors)
def ensure_float(val, default, name):
if val is None:
......@@ -377,6 +537,19 @@ def ensure_float(val, default, name):
raise TypeError("%s: type is not float32" % (name,))
return val
def ensure_int(val, default, name):
    """Coerce `val` into an int32 scalar variable.

    Returns a clone of `default` when `val` is None; otherwise wraps
    non-Variables as constants, converts 0-d tensors to scalars, and
    validates that the result is an int32 scalar (raising TypeError,
    tagged with `name`, when it is not).
    """
    if val is None:
        return default.clone()
    if not isinstance(val, Variable):
        val = constant(val)
    zero_dim = hasattr(val, 'ndim') and val.ndim == 0
    if zero_dim:
        val = as_scalar(val)
    if not isinstance(val.type, theano.scalar.Scalar):
        raise TypeError("%s: expected a scalar value" % (name,))
    if val.type.dtype != 'int32':
        raise TypeError("%s: type is not int32" % (name,))
    return val
class GpuDnnConv(DnnBase, COp):
"""
......@@ -449,7 +622,7 @@ class GpuDnnConv(DnnBase, COp):
return [alg_def, alg_choose_def, alg_choose_time_def] + inpl_def
def make_node(self, img, kern, output, desc, alpha=None, beta=None):
def make_node(self, img, kern, output, desc, alpha=None, beta=None, nb_dim=None):
img = as_cuda_ndarray_variable(img)
kern = as_cuda_ndarray_variable(kern)
output = as_cuda_ndarray_variable(output)
......@@ -466,12 +639,13 @@ class GpuDnnConv(DnnBase, COp):
alpha = ensure_float(alpha, _one, 'alpha')
beta = ensure_float(beta, _zero, 'beta')
nb_dim = ensure_int(nb_dim, _ifour, 'nb_dim')
return Apply(self, [img, kern, output, desc, alpha, beta],
return Apply(self, [img, kern, output, desc, alpha, beta, nb_dim],
[output.type()])
def grad(self, inp, grads):
img, kerns, output, desc, alpha, beta = inp
img, kerns, output, desc, alpha, beta, nb_dim = inp
top, = grads
top = gpu_contiguous(top)
......@@ -480,13 +654,14 @@ class GpuDnnConv(DnnBase, COp):
d_kerns = GpuDnnConvGradW()(img, top, gpu_alloc_empty(*kerns.shape), desc)
d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
d_nb_dim = grad_not_implemented(self, 6, nb_dim)
return [d_img * alpha, d_kerns * alpha, top * beta,
DisconnectedType()(), d_alpha, d_beta]
DisconnectedType()(), d_alpha, d_beta, d_nb_dim]
def connection_pattern(self, node):
# not connected to desc
return [[1], [1], [1], [0], [1], [1]]
return [[1], [1], [1], [0], [1], [1], [1]]
@staticmethod
def get_out_shape(ishape, kshape, border_mode, subsample):
......@@ -523,6 +698,101 @@ class GpuDnnConv(DnnBase, COp):
return [shape[2]]
class GpuDnnConv3d(GpuDnnConv):
    """
    The forward 3d convolution.

    :param image:
    :param kernel:
    :param descr: the convolution descriptor
    """
    __props__ = ('workmem', 'inplace')
    # BUG FIX: make_node builds 7 inputs; 'nb_dim' was missing here.
    __input_name__ = ('image', 'kernel', 'output',
                      'descriptor', 'alpha', 'beta', 'nb_dim')

    def __init__(self, workmem=None, inplace=False):
        """
        :param workmem: either 'none', 'small', 'large', 'fft', 'time' or
        'guess'. Default is the value of :attr:`config.dnn.conv.workmem`.
        """
        # Only workmem='none' works with cudnn conv3d, so any requested
        # value is deliberately overridden.
        super(GpuDnnConv3d, self).__init__(workmem='none', inplace=inplace)

    def make_node(self, img, kern, output, desc, alpha=None, beta=None,
                  nb_dim=None):
        img = as_cuda_ndarray_variable(img)
        kern = as_cuda_ndarray_variable(kern)
        output = as_cuda_ndarray_variable(output)
        if img.type.ndim != 5:
            raise TypeError('img must be 5D tensor')
        if kern.type.ndim != 5:
            raise TypeError('kern must be 5D tensor')
        if output.type.ndim != 5:
            raise TypeError('output must be a 5D tensor')

        if not isinstance(desc.type, CDataType) \
                or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
            raise TypeError('desc must be cudnnConvolutionDescriptor_t')

        alpha = ensure_float(alpha, _one, 'alpha')
        beta = ensure_float(beta, _zero, 'beta')
        nb_dim = ensure_int(nb_dim, _ifive, 'nb_dim')

        return Apply(self, [img, kern, output, desc, alpha, beta, nb_dim],
                     [output.type()])

    def grad(self, inp, grads):
        img, kerns, output, desc, alpha, beta, nb_dim = inp
        top, = grads

        top = gpu_contiguous(top)

        # BUG FIX: the weight-gradient Op is named GpuDnnConv3dGradW; the
        # original referenced a non-existent GpuDnnConvGrad3dW.
        d_img = GpuDnnConvGrad3dI()(kerns, top, gpu_alloc_empty(*img.shape),
                                    desc)
        d_kerns = GpuDnnConv3dGradW()(img, top,
                                      gpu_alloc_empty(*kerns.shape), desc)
        d_alpha = grad_not_implemented(self, 4, alpha)
        d_beta = grad_not_implemented(self, 5, beta)
        d_nb_dim = grad_not_implemented(self, 6, nb_dim)

        return [d_img * alpha, d_kerns * alpha, top * beta,
                DisconnectedType()(), d_alpha, d_beta, d_nb_dim]

    @staticmethod
    def get_out_shape(ishape, kshape, border_mode, subsample):
        """
        This function computes the output shape for a 3d convolution with
        the specified parameters. `ishape` and `kshape` can be symbolic
        or scalar.
        """
        b = ishape[0]   # Number of inputs
        d = ishape[2]   # Depth of input feature maps
        h = ishape[3]   # Height of input feature maps
        w = ishape[4]   # Width of input feature maps
        nb = kshape[0]  # Number of output feature maps
        kd = kshape[2]  # Depth of each filter
        kh = kshape[3]  # Height of each filter
        kw = kshape[4]  # Width of each filter

        sd, sh, sw = subsample

        if border_mode == 'full':
            padd = kd - 1
            padh = kh - 1
            padw = kw - 1
        elif isinstance(border_mode, tuple):
            padd, padh, padw = border_mode
        else:
            assert border_mode == 'valid'
            padd = 0
            padh = 0
            padw = 0

        return (
            b, nb,
            (d + 2*padd - kd)//sd + 1,
            (h + 2*padh - kh)//sh + 1,
            (w + 2*padw - kw)//sw + 1
        )
class GpuDnnConvGradW(DnnBase, COp):
"""
The convolution gradient with respect to the weights.
......@@ -554,7 +824,7 @@ class GpuDnnConvGradW(DnnBase, COp):
self.inplace = False
def grad(self, inp, grads):
img, top, output, desc, alpha, beta = inp
img, top, output, desc, alpha, beta, nb_dim = inp
kerns, = grads
kerns = gpu_contiguous(kerns)
......@@ -563,13 +833,14 @@ class GpuDnnConvGradW(DnnBase, COp):
d_top = GpuDnnConv()(img, kerns, gpu_alloc_empty(*top.shape), desc)
d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
d_nb_dim = grad_not_implemented(self, 6, nb_dim)
return (d_img * alpha, d_top * alpha, kerns * beta,
DisconnectedType()(), d_alpha, d_beta)
DisconnectedType()(), d_alpha, d_beta, d_nb_dim)
def connection_pattern(self, node):
# not connected to desc
return [[1], [1], [1], [0], [1], [1]]
return [[1], [1], [1], [0], [1], [1], [1]]
def get_op_params(self):
if self.inplace:
......@@ -597,7 +868,7 @@ class GpuDnnConvGradW(DnnBase, COp):
return inplace_def + [alg_def, alg_choose_def]
def make_node(self, img, topgrad, output, desc, alpha=None, beta=None):
def make_node(self, img, topgrad, output, desc, alpha=None, beta=None, nb_dim=None):
img = as_cuda_ndarray_variable(img)
topgrad = as_cuda_ndarray_variable(topgrad)
output = as_cuda_ndarray_variable(output)
......@@ -614,13 +885,70 @@ class GpuDnnConvGradW(DnnBase, COp):
alpha = ensure_float(alpha, _one, 'alpha')
beta = ensure_float(beta, _zero, 'beta')
nb_dim = ensure_int(nb_dim, _ifour, 'nb_dim')
return Apply(self, [img, topgrad, output, desc, alpha, beta],
return Apply(self, [img, topgrad, output, desc, alpha, beta, nb_dim],
[output.type()])
def infer_shape(self, node, shape):
return [shape[2]]
class GpuDnnConv3dGradW(GpuDnnConvGradW):
    """
    The 3d convolution gradient with respect to the weights.

    :param image:
    :param kernel:
    :param descr: the convolution descriptor
    """
    __props__ = ('workmem', 'inplace',)
    # BUG FIX: make_node builds 7 inputs; 'nb_dim' was missing here.
    __input_name__ = ('image', 'grad', 'output', 'descriptor', 'alpha',
                      'beta', 'nb_dim')

    def __init__(self, inplace=False, workmem=None):
        # Only workmem='none' works with cudnn conv3d; the passed value is
        # deliberately ignored.
        # BUG FIX: the parent call was spelled `__init` (missing trailing
        # underscores), which raised AttributeError at construction time.
        super(GpuDnnConv3dGradW, self).__init__(inplace=inplace,
                                                workmem='none')

    def grad(self, inp, grads):
        img, top, output, desc, alpha, beta, nb_dim = inp
        kerns, = grads

        kerns = gpu_contiguous(kerns)

        d_img = GpuDnnConvGrad3dI()(kerns, top, gpu_alloc_empty(*img.shape),
                                    desc)
        d_top = GpuDnnConv3d()(img, kerns, gpu_alloc_empty(*top.shape), desc)
        d_alpha = grad_not_implemented(self, 4, alpha)
        d_beta = grad_not_implemented(self, 5, beta)
        d_nb_dim = grad_not_implemented(self, 6, nb_dim)

        return (d_img * alpha, d_top * alpha, kerns * beta,
                DisconnectedType()(), d_alpha, d_beta, d_nb_dim)

    def make_node(self, img, topgrad, output, desc, alpha=None, beta=None,
                  nb_dim=None):
        img = as_cuda_ndarray_variable(img)
        topgrad = as_cuda_ndarray_variable(topgrad)
        output = as_cuda_ndarray_variable(output)
        if img.type.ndim != 5:
            raise TypeError('img must be 5D tensor')
        if topgrad.type.ndim != 5:
            raise TypeError('topgrad must be 5D tensor')
        if output.type.ndim != 5:
            raise TypeError('output must be 5D tensor')

        if not isinstance(desc.type, CDataType) \
                or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
            raise TypeError('desc must be cudnnConvolutionDescriptor_t')

        alpha = ensure_float(alpha, _one, 'alpha')
        beta = ensure_float(beta, _zero, 'beta')
        nb_dim = ensure_int(nb_dim, _ifive, 'nb_dim')

        return Apply(self, [img, topgrad, output, desc, alpha, beta, nb_dim],
                     [output.type()])
class GpuDnnConvGradI(DnnBase, COp):
"""
......@@ -652,7 +980,7 @@ class GpuDnnConvGradI(DnnBase, COp):
self.workmem = 'none'
def grad(self, inp, grads):
kerns, top, output, desc, alpha, beta = inp
kerns, top, output, desc, alpha, beta, nb_dim = inp
img, = grads
img = gpu_contiguous(img)
......@@ -661,13 +989,14 @@ class GpuDnnConvGradI(DnnBase, COp):
d_top = GpuDnnConv()(img, kerns, gpu_alloc_empty(*top.shape), desc)
d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta)
d_nb_dim = grad_not_implemented(self, 6, nb_dim)
return (d_kerns * alpha, d_top * alpha, img * beta,
DisconnectedType()(), d_alpha, d_beta)
DisconnectedType()(), d_alpha, d_beta, d_nb_dim)
def connection_pattern(self, node):
# not connected to desc
return [[1], [1], [1], [0], [1], [1]]
return [[1], [1], [1], [0], [1], [1], [1]]
def get_op_params(self):
if self.inplace:
......@@ -695,7 +1024,7 @@ class GpuDnnConvGradI(DnnBase, COp):
return inplace_def + [alg_def, alg_choose_def]
def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None):
def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None, nb_dim=None):
kern = as_cuda_ndarray_variable(kern)
topgrad = as_cuda_ndarray_variable(topgrad)
output = as_cuda_ndarray_variable(output)
......@@ -712,14 +1041,72 @@ class GpuDnnConvGradI(DnnBase, COp):
alpha = ensure_float(alpha, _one, 'alpha')
beta = ensure_float(beta, _zero, 'beta')
nb_dim = ensure_int(nb_dim, _ifour, 'nb_dim')
return Apply(self, [kern, topgrad, output, desc, alpha, beta],
return Apply(self, [kern, topgrad, output, desc, alpha, beta, nb_dim],
[output.type()])
def infer_shape(self, node, shape):
return [shape[2]]
class GpuDnnConvGrad3dI(GpuDnnConvGradI):
    """
    The 3d convolution gradient with respect to the inputs.

    :param image:
    :param kernel:
    :param descr: the convolution descriptor
    """
    __props__ = ('inplace',)
    # BUG FIX: make_node builds 7 inputs; 'nb_dim' was missing here.
    __input_name__ = ('kernel', 'grad', 'output',
                      'descriptor', 'alpha', 'beta', 'nb_dim')

    def __init__(self, inplace=False):
        # BUG FIX: super() must name this class, not its parent —
        # super(GpuDnnConvGradI, self) skipped GpuDnnConvGradI.__init__ in
        # the MRO, so the parent's setup never ran.
        super(GpuDnnConvGrad3dI, self).__init__(inplace)

    def grad(self, inp, grads):
        kerns, top, output, desc, alpha, beta, nb_dim = inp
        img, = grads

        img = gpu_contiguous(img)

        # BUG FIX: the weight-gradient Op is named GpuDnnConv3dGradW; the
        # original referenced a non-existent GpuDnnConvGrad3dW.
        d_kerns = GpuDnnConv3dGradW()(img, top,
                                      gpu_alloc_empty(*kerns.shape), desc)
        d_top = GpuDnnConv3d()(img, kerns, gpu_alloc_empty(*top.shape), desc)
        d_alpha = grad_not_implemented(self, 4, alpha)
        d_beta = grad_not_implemented(self, 5, beta)
        d_nb_dim = grad_not_implemented(self, 6, nb_dim)

        return (d_kerns * alpha, d_top * alpha, img * beta,
                DisconnectedType()(), d_alpha, d_beta, d_nb_dim)

    def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None,
                  nb_dim=None):
        kern = as_cuda_ndarray_variable(kern)
        topgrad = as_cuda_ndarray_variable(topgrad)
        output = as_cuda_ndarray_variable(output)
        if kern.type.ndim != 5:
            raise TypeError('kern must be 5D tensor')
        if topgrad.type.ndim != 5:
            raise TypeError('topgrad must be 5D tensor')
        # BUG FIX: the check compared against 4 while requiring (and
        # reporting) a 5D tensor.
        if output.type.ndim != 5:
            raise TypeError('output must be 5D tensor')

        if not isinstance(desc.type, CDataType) \
                or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
            raise TypeError('desc must be cudnnConvolutionDescriptor_t')

        alpha = ensure_float(alpha, _one, 'alpha')
        beta = ensure_float(beta, _zero, 'beta')
        nb_dim = ensure_int(nb_dim, _ifive, 'nb_dim')

        return Apply(self, [kern, topgrad, output, desc, alpha, beta, nb_dim],
                     [output.type()])
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode='conv', direction_hint=None, workmem=None):
"""
......@@ -804,6 +1191,179 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
return GpuDnnConv(workmem=workmem)(img, kerns, out, desc)
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
             conv_mode='conv', direction_hint=None, workmem=None):
    """
    GPU convolution using cuDNN from NVIDIA.

    The memory layout to use is 'bc01', that is 'batch', 'channel',
    'first dim', 'second dim' in that order.

    :param img: images to do the convolution over
    :param kerns: convolution filters
    :param border_mode: one of 'valid', 'full'; additionally, the padding size
        could be directly specified by an integer or a pair of integers
    :param subsample: perform subsampling of the output (default: (1, 1))
    :param conv_mode: perform convolution (kernels flipped) or cross-correlation.
        One of 'conv', 'cross'. (default: 'conv')
    :param direction_hint: Used by graph optimizers to change algorithm choice.
        By default, GpuDnnConv will be used to carry out the convolution.
        If border_mode is 'valid', subsample is (1,1) and direction_hint is
        'bprop weights', it will use GpuDnnConvGradW.
        If border_mode is 'full', subsample is (1,1) and direction_hint is
        *not* 'forward!', it will use GpuDnnConvGradI.
        This parameter is used internally by graph optimizers and may be
        removed at any time without a deprecation period. You have been warned.
    :param workmem: Specify the amount of working memory allowed.
        More memory is usually faster. One of 'none', 'small' or
        'large'. (default is None which takes its value from
        :attr:`config.dnn.conv.workmem`)

    :warning: The cuDNN library only works with GPU that have a compute
        capability of 3.0 or higher. This means that older GPU will not
        work with this Op.
    """
    fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None)
    if (border_mode == 'valid' and subsample == (1, 1) and
            direction_hint == 'bprop weights'):
        # Special case: We are asked to use GpuDnnConvGradW. We need to set
        # up a suitable 'fake' convolution to compute the gradient for.
        # Batch and channel axes are swapped so the "weight gradient" of the
        # fake convolution is the desired forward result.
        img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3))
        if conv_mode == 'conv':
            # We need to flip manually. These 'kerns' are not the kernels
            # that would be flipped by conv_mode='conv' in GpuDnnConvGradW.
            kerns = kerns[:, :, ::-1, ::-1]
        kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
        # Valid-convolution output spatial sizes: img - kern + 1.
        shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
        shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
        out = gpu_alloc_empty(shape_i(kerns, 1, fgraph),
                              shape_i(img, 1, fgraph), shape2, shape3)
        desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
                              conv_mode='cross')(img.shape, out.shape)
        conv = GpuDnnConvGradW()(img, kerns, out, desc)
        return as_cuda_ndarray_variable(conv.dimshuffle(1, 0, 2, 3))

    elif (border_mode == 'full' and subsample == (1, 1) and
          direction_hint != 'forward!'):
        # Special case: We can be faster by using GpuDnnConvGradI to compute
        # the full convolution as the backward pass of a valid convolution.
        # We just need to set up a suitable 'fake' valid convolution.
        img = gpu_contiguous(img)  # cudnn v1 and v2 rc3 need contiguous data
        kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
        conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
        # Full-convolution output spatial sizes: img + kern - 1.
        shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
        shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
        out = gpu_alloc_empty(shape_i(img, 0, fgraph),
                              shape_i(kerns, 1, fgraph), shape2, shape3)
        desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
                              conv_mode=conv_mode)(out.shape, kerns.shape)
        return GpuDnnConvGradI()(kerns, img, out, desc)

    # Standard case: We use GpuDnnConv with suitable padding.
    # contig_version will return a gpu_contiguous copy
    # if the img contains negative strides
    img = gpu_contiguous(img)
    kerns = gpu_contiguous(kerns)
    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
                          conv_mode=conv_mode)(img.shape, kerns.shape)
    desc_op = desc.owner.op
    out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape,
                                       desc_op.border_mode,
                                       desc_op.subsample)
    out = gpu_alloc_empty(*out_shp)
    return GpuDnnConv(workmem=workmem)(img, kerns, out, desc)
def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
               conv_mode='conv', direction_hint=None, workmem=None):
    """
    GPU 3d convolution using cuDNN from NVIDIA.

    The memory layout to use is 'bct01', that is 'batch', 'channel',
    'first dim', 'second dim', 'third dim' in that order.

    :param img: images to do the convolution over
    :param kerns: convolution filters
    :param border_mode: one of 'valid', 'full'; additionally, the padding
        size could be directly specified by an integer or a triple of
        integers
    :param subsample: perform subsampling of the output (default: (1, 1, 1))
    :param conv_mode: perform convolution (kernels flipped) or
        cross-correlation. One of 'conv', 'cross'. (default: 'conv')
    :param direction_hint: Used by graph optimizers to change algorithm choice.
        By default, GpuDnnConv3d will be used to carry out the convolution.
        If border_mode is 'valid', subsample is (1,1,1) and direction_hint is
        'bprop weights', it will use GpuDnnConv3dGradW.
        If border_mode is 'full', subsample is (1,1,1) and direction_hint is
        *not* 'forward!', it will use GpuDnnConvGrad3dI.
        This parameter is used internally by graph optimizers and may be
        removed at any time without a deprecation period. You have been warned.
    :param workmem: Specify the amount of working memory allowed.
        More memory is usually faster. One of 'none', 'small' or
        'large'. (default is None which takes its value from
        :attr:`config.dnn.conv.workmem`)

    :warning: The cuDNN library only works with GPU that have a compute
        capability of 3.0 or higher. This means that older GPU will not
        work with this Op.
    :warning: dnn_conv3d only works with cuDNN library 3.0
    """
    fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None)
    if (border_mode == 'valid' and subsample == (1, 1, 1) and
            direction_hint == 'bprop weights'):
        # Special case: We are asked to use GpuDnnConv3dGradW. We need to set
        # up a suitable 'fake' convolution to compute the gradient for.
        img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4))
        if conv_mode == 'conv':
            # We need to flip manually. These 'kerns' are not the kernels
            # that would be flipped by conv_mode='conv' in GpuDnnConv3dGradW.
            kerns = kerns[:, :, ::-1, ::-1, ::-1]
        kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3, 4))
        shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
        shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
        shape4 = shape_i(img, 4, fgraph) - shape_i(kerns, 4, fgraph) + 1
        # BUG FIX: the last output dimension used shape3 twice instead of
        # shape4, producing a wrongly sized buffer.
        out = gpu_alloc_empty(shape_i(kerns, 1, fgraph),
                              shape_i(img, 1, fgraph),
                              shape2, shape3, shape4)
        desc = GpuDnnConv3dDesc(border_mode='valid', subsample=(1, 1, 1),
                                conv_mode='cross')(img.shape, out.shape)
        conv = GpuDnnConv3dGradW()(img, kerns, out, desc)
        return as_cuda_ndarray_variable(conv.dimshuffle(1, 0, 2, 3, 4))

    elif (border_mode == 'full' and subsample == (1, 1, 1) and
          direction_hint != 'forward!'):
        # Special case: We can be faster by using GpuDnnConvGrad3dI to
        # compute the full convolution as the backward pass of a valid
        # convolution. We just need to set up a suitable 'fake' valid
        # convolution.
        img = gpu_contiguous(img)  # cudnn v1 and v2 rc3 need contiguous data
        kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3, 4))
        conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
        shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
        shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
        shape4 = shape_i(img, 4, fgraph) + shape_i(kerns, 4, fgraph) - 1
        out = gpu_alloc_empty(shape_i(img, 0, fgraph),
                              shape_i(kerns, 1, fgraph),
                              shape2, shape3, shape4)
        desc = GpuDnnConv3dDesc(border_mode='valid', subsample=(1, 1, 1),
                                conv_mode=conv_mode)(out.shape, kerns.shape)
        # BUG FIX: the input-gradient Op is named GpuDnnConvGrad3dI; the
        # original referenced a non-existent GpuDnnConv3dGradI.
        return GpuDnnConvGrad3dI()(kerns, img, out, desc)

    # Standard case: We use GpuDnnConv3d with suitable padding.
    # contig_version will return a gpu_contiguous copy
    # if the img contains negative strides
    img = gpu_contiguous(img)
    kerns = gpu_contiguous(kerns)
    desc = GpuDnnConv3dDesc(border_mode=border_mode, subsample=subsample,
                            conv_mode=conv_mode)(img.shape, kerns.shape)
    desc_op = desc.owner.op
    out_shp = GpuDnnConv3d.get_out_shape(img.shape, kerns.shape,
                                         desc_op.border_mode,
                                         desc_op.subsample)
    out = gpu_alloc_empty(*out_shp)
    return GpuDnnConv3d(workmem=workmem)(img, kerns, out, desc)
class GpuDnnPoolDesc(GpuOp):
"""
This Op builds a pooling descriptor for use in the other
......
......@@ -33,6 +33,60 @@ c_set_tensor4d(CudaNdarray *var, cudnnTensorDescriptor_t desc) {
return 0;
}
/* Fill the cuDNN N-d tensor descriptor `desc` from CudaNdarray `var`
 * with `dim` dimensions. Returns 0 on success, -1 (with a Python error
 * set) on failure. */
static int
c_set_tensorNd(CudaNdarray *var, int dim, cudnnTensorDescriptor_t desc) {
  int strides[dim];
  for (int i = 0; i < dim; ++i)
  {
    if (CudaNdarray_HOST_STRIDES(var)[i])
      strides[i] = CudaNdarray_HOST_STRIDES(var)[i];
    else
    {
      /* Stride 0 (broadcastable/size-1 dimension): substitute the stride
         a C-contiguous layout would have, since cuDNN rejects zero
         strides. */
      strides[i] = 1;
      for (int j = i + 1; j < dim; ++j)
        strides[i] *= CudaNdarray_HOST_DIMS(var)[j];
    }
  }
  cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, CUDNN_DATA_FLOAT, dim,
                                                 CudaNdarray_HOST_DIMS(var),
                                                 strides);
  if (err != CUDNN_STATUS_SUCCESS) {
    /* BUG FIX: the two message fragments concatenated with no separator
       ("...: %sdim=%d"). */
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set tensorNd descriptor: %s, dim=%d",
                 cudnnGetErrorString(err), dim);
    return -1;
  }
  return 0;
}
/* Fill the cuDNN N-d filter descriptor `desc` from CudaNdarray `var`
 * with `dim` dimensions. Returns 0 on success, -1 (with a Python error
 * set) on failure. */
static int
c_set_filterNd(CudaNdarray *var, int dim, cudnnFilterDescriptor_t desc) {
  /* cudnnSetFilterNdDescriptor takes no strides, so the filter data must
     be C-contiguous. */
  if (!CudaNdarray_is_c_contiguous(var)) {
    PyErr_SetString(PyExc_ValueError,
                    "Only contiguous filters (kernels) are supported.");
    return -1;
  }
  cudnnStatus_t err = cudnnSetFilterNdDescriptor(desc, CUDNN_DATA_FLOAT, dim,
                                                 CudaNdarray_HOST_DIMS(var));
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set filter descriptor: %s."
                 " dims= %d",
                 cudnnGetErrorString(err), dim);
    return -1;
  }
  return 0;
}
static int
c_set_filter(CudaNdarray *var, cudnnFilterDescriptor_t desc) {
if (!CudaNdarray_is_c_contiguous(var)) {
......
......@@ -7,9 +7,9 @@ cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);
and the algorithms, if any, that were selected according to these dimensions
and according to the amount of memory available at that time.
*/
int APPLY_SPECIFIC(previous_input_shape)[4];
int APPLY_SPECIFIC(previous_kerns_shape)[4];
int APPLY_SPECIFIC(previous_output_shape)[4];
int APPLY_SPECIFIC(previous_input_shape)[5];
int APPLY_SPECIFIC(previous_kerns_shape)[5];
int APPLY_SPECIFIC(previous_output_shape)[5];
cudnnConvolutionFwdAlgo_t APPLY_SPECIFIC(previous_algo);
cudnnConvolutionBwdFilterAlgo_t APPLY_SPECIFIC(previous_bwd_f_algo);
cudnnConvolutionBwdDataAlgo_t APPLY_SPECIFIC(previous_bwd_d_algo);
......@@ -21,12 +21,12 @@ APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(kerns) = NULL;
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor "
PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL;
}
......@@ -36,7 +36,7 @@ if ((APPLY_SPECIFIC(err) = cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns)))
FAIL;
}
for (int i = 0; i < 4; i++)
for (int i = 0; i < 5; i++)
{
APPLY_SPECIFIC(previous_input_shape)[i] = 0;
APPLY_SPECIFIC(previous_kerns_shape)[i] = 0;
......
......@@ -3,7 +3,8 @@
int
APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
CudaNdarray *om, cudnnConvolutionDescriptor_t desc,
float alpha, float beta, CudaNdarray **output) {
float alpha, float beta, int nb_dim, CudaNdarray **output) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError,
......@@ -11,37 +12,49 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
return 1;
}
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1)
if (c_set_tensorNd(input, nb_dim, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
if (c_set_filterNd(kerns, nb_dim, APPLY_SPECIFIC(kerns)) == -1)
return 1;
/* if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1) */
/* return 1; */
/* if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) */
/* return 1; */
#ifdef CONV_INPLACE
Py_XDECREF(*output);
*output = om;
Py_INCREF(*output);
#else
if (CudaNdarray_prep_output(output, 4, CudaNdarray_HOST_DIMS(om)) != 0)
if (CudaNdarray_prep_output(output, nb_dim, CudaNdarray_HOST_DIMS(om)) != 0)
return 1;
if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*output, om))
return 1;
#endif
if (c_set_tensor4d(*output, APPLY_SPECIFIC(output)) == -1)
return 1;
if (c_set_tensorNd(*output, nb_dim, APPLY_SPECIFIC(output)) == -1)
return 1;
/* if (c_set_tensor4d(*output, APPLY_SPECIFIC(output)) == -1) */
/* return 1; */
{
size_t worksize;
void *workspace;
cudnnConvolutionFwdAlgo_t chosen_algo;
for (int i = 0; (i < nb_dim); i++)
std::cout << i << "/" << nb_dim << ", "
<< CudaNdarray_HOST_DIMS(input)[i] << ", "
<< CudaNdarray_HOST_DIMS(kerns)[i] << std::endl;
if (CHOOSE_ALGO)
{
// Check if the input and the kernels have the same shape as they have
// last time the apply node was executed
bool same_shapes = true;
for (int i = 0; (i < 4) && same_shapes; i++)
for (int i = 0; (i < nb_dim) && same_shapes; i++)
{
same_shapes &= (CudaNdarray_HOST_DIMS(input)[i] !=
APPLY_SPECIFIC(previous_input_shape)[i]);
......@@ -115,7 +128,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
// Store the shapes of the inputs and kernels as well as the chosen
// algorithm for future use.
APPLY_SPECIFIC(previous_algo) = chosen_algo;
for (int i = 0; i < 4; i++)
for (int i = 0; i < nb_dim; i++)
{
APPLY_SPECIFIC(previous_input_shape)[i] =
CudaNdarray_HOST_DIMS(input)[i];
......@@ -142,7 +155,8 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
// If the chosen implementation is FFT, validate that it can be used
// on the current data and default on a safe implementation if it
// can't.
if (chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT)
// Following code is 2d-specific, but it is fine as ftt is define only for 2d-filters
if (chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT && nb_dim == 4)
{
// Extract the properties of the convolution descriptor
......@@ -186,12 +200,12 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
chosen_algo,
&worksize);
if (err != CUDNN_STATUS_SUCCESS) {
std::cout << "here" << std::endl;
PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv: error getting worksize: %s",
cudnnGetErrorString(err));
cudnnGetErrorString(err));
return 1;
}
workspace = get_work_mem(worksize);
if (workspace == NULL && worksize != 0)
return 1;
......@@ -208,6 +222,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(*output));
}
if (err != CUDNN_STATUS_SUCCESS) {
std::cout << "here2" << std::endl;
PyErr_Format(PyExc_RuntimeError, "GpuDnnConv: error doing operation: %s",
cudnnGetErrorString(err));
return 1;
......
......@@ -3,7 +3,7 @@
int
APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
CudaNdarray *im, cudnnConvolutionDescriptor_t desc,
float alpha, float beta, CudaNdarray **input) {
float alpha, float beta, int nb_dim, CudaNdarray **input) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (CudaNdarray_HOST_DIMS(im)[1] != CudaNdarray_HOST_DIMS(kerns)[1]) {
......@@ -12,9 +12,14 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
return 1;
}
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1)
/* if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1) */
/* return 1; */
/* if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) */
/* return 1; */
if (c_set_tensorNd(output, nb_dim, APPLY_SPECIFIC(output)) == -1)
return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1)
if (c_set_filterNd(kerns, nb_dim, APPLY_SPECIFIC(kerns)) == -1)
return 1;
#ifdef CONV_INPLACE
......@@ -22,13 +27,16 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
*input = im;
Py_INCREF(*input);
#else
if (CudaNdarray_prep_output(input, 4, CudaNdarray_HOST_DIMS(im)) != 0)
if (CudaNdarray_prep_output(input, nb_dim, CudaNdarray_HOST_DIMS(im)) != 0)
return 1;
if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*input, im))
return 1;
#endif
if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1)
/* if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1) */
/* return 1; */
if (c_set_tensorNd(*input, nb_dim, APPLY_SPECIFIC(input)) == -1)
return 1;
{
......
......@@ -3,7 +3,7 @@
int
APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
CudaNdarray *km, cudnnConvolutionDescriptor_t desc,
float alpha, float beta, CudaNdarray **kerns) {
float alpha, float beta, int nb_dim, CudaNdarray **kerns) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(km)[1]) {
......@@ -12,9 +12,14 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
return 1;
}
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1)
/* if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1) */
/* return 1; */
/* if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1) */
/* return 1; */
if (c_set_tensorNd(input, nb_dim, APPLY_SPECIFIC(input)) == -1)
return 1;
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1)
if (c_set_tensorNd(output, nb_dim, APPLY_SPECIFIC(output)) == -1)
return 1;
#ifdef CONV_INPLACE
......@@ -22,13 +27,15 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
*kerns = km;
Py_INCREF(*kerns);
#else
if (CudaNdarray_prep_output(kerns, 4, CudaNdarray_HOST_DIMS(km)) != 0)
if (CudaNdarray_prep_output(kerns, nb_dim, CudaNdarray_HOST_DIMS(km)) != 0)
return 1;
if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*kerns, km))
return 1;
#endif
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1)
/* if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1) */
/* return 1; */
if (c_set_filterNd(*kerns, nb_dim, APPLY_SPECIFIC(kerns)) == -1)
return 1;
{
......
......@@ -13,6 +13,7 @@ from theano.tensor.signal.downsample import max_pool_2d
from theano.tensor.signal.downsample import DownsampleFactorMaxGrad
import theano.sandbox.cuda.dnn as dnn
from theano.sandbox.cuda.basic_ops import GpuAllocEmpty, gpu_alloc_empty
from theano.sandbox.cuda import float32_shared_constructor as shared
# Skip test if cuda_ndarray is not available.
import theano.sandbox.cuda as cuda
......@@ -763,6 +764,58 @@ def test_dnn_conv_grad():
utt.verify_grad(dconvw, [img_val, kern_val, out_val])
def test_conv3d_valid():
    """Compare cudnn's 3d "valid" convolution against the reference
    theano.tensor.nnet.conv3D implementation on random float32 data.

    conv3D expects inputs/filters in (batch, depth, row, col, channel)
    order while dnn_conv3d uses (batch, channel, depth, row, col), hence
    the dimshuffles around the reference op.  conv_mode='cross' matches
    conv3D, which does correlation rather than convolution.
    """
    if not cuda.dnn.dnn_available():
        raise SkipTest('3D conv not supported in cudnn v1')

    def run_conv3d_valid(inputs_shape, filters_shape,
                         subsample=(1, 1, 1)):

        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')

        inputs = shared(inputs_val)
        filters = shared(filters_val)
        # conv3D requires an explicit bias term; use zeros so the two
        # implementations compute the same thing.
        bias = shared(numpy.zeros(filters_shape[0]).astype('float32'))

        conv_ref = theano.tensor.nnet.conv3D(
            V=inputs.dimshuffle(0, 2, 3, 4, 1),
            W=filters.dimshuffle(0, 2, 3, 4, 1),
            b=bias, d=subsample)

        conv = dnn.dnn_conv3d(img=inputs, kerns=filters,
                              border_mode="valid", subsample=subsample,
                              conv_mode='cross')

        # Shuffle the reference output back to (b, c, d, r, c) order so
        # both functions return comparable arrays.
        f_ref = theano.function([], conv_ref.dimshuffle(0, 4, 1, 2, 3))
        f = theano.function([], conv, mode=mode_with_gpu)

        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)

    # No subsampling, filter dims not equal to each other.
    run_conv3d_valid(inputs_shape=(128, 3, 5, 5, 5),
                     filters_shape=(64, 3, 1, 2, 4))
    # Uniform subsampling.
    run_conv3d_valid(inputs_shape=(16, 4, 20, 12, 15),
                     filters_shape=(10, 4, 6, 12, 4),
                     subsample=(2, 2, 2))
    run_conv3d_valid(inputs_shape=(16, 1, 20, 12, 15),
                     filters_shape=(10, 1, 6, 12, 4),
                     subsample=(3, 3, 3))
    run_conv3d_valid(inputs_shape=(16, 2, 20, 12, 15),
                     filters_shape=(10, 2, 6, 12, 4),
                     subsample=(3, 3, 3))
    # Non-uniform subsampling, both orderings.
    run_conv3d_valid(inputs_shape=(16, 1, 20, 12, 15),
                     filters_shape=(10, 1, 6, 12, 4),
                     subsample=(3, 2, 1))
    run_conv3d_valid(inputs_shape=(16, 1, 20, 12, 15),
                     filters_shape=(10, 1, 6, 12, 4),
                     subsample=(1, 2, 3))
def test_version():
if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论