提交 6117f98b authored 作者: Nicolas Ballas's avatar Nicolas Ballas 提交者: --global

add cudnnv3 conv3d

上级 34223240
...@@ -127,6 +127,30 @@ if (%(err)s != CUDNN_STATUS_SUCCESS) { ...@@ -127,6 +127,30 @@ if (%(err)s != CUDNN_STATUS_SUCCESS) {
""" % dict(var=var, err=err, desc=desc, fail=fail) """ % dict(var=var, err=err, desc=desc, fail=fail)
def c_set_tensorNd(var, nb_dim, desc, err, fail):
    """Return C code filling the cudnnTensorNdDescriptor `desc` from `var`.

    :param var: name of the CudaNdarray C variable
    :param nb_dim: number of dimensions of the tensor (Python int)
    :param desc: name of the cudnnTensorDescriptor_t C variable
    :param err: name of the cudnnStatus_t C variable receiving the status
    :param fail: code to run on error (usually sub['fail'])
    """
    # Note: %% escapes survive the Python %-formatting below so that the
    # generated C still contains a literal %s for PyErr_Format.
    return """
{
    %(err)s = cudnnSetTensorNdDescriptor(
        %(desc)s, CUDNN_DATA_FLOAT, %(nb_dim)d,
        CudaNdarray_HOST_DIMS(%(var)s),
        CudaNdarray_HOST_STRIDES(%(var)s));
    if (%(err)s != CUDNN_STATUS_SUCCESS) {
        PyErr_Format(PyExc_RuntimeError,
                     "could not set tensorNd descriptor: %%s, nb_dim=%(nb_dim)d",
                     cudnnGetErrorString(%(err)s));
        %(fail)s
    }
}
""" % dict(var=var, nb_dim=nb_dim, err=err, desc=desc, fail=fail)
class DnnBase(GpuOp, COp): class DnnBase(GpuOp, COp):
""" """
...@@ -359,10 +383,146 @@ AddConfigVar('dnn.conv.workmem_bwd', ...@@ -359,10 +383,146 @@ AddConfigVar('dnn.conv.workmem_bwd',
EnumStr('deterministic', 'none', 'fft', 'guess'), EnumStr('deterministic', 'none', 'fft', 'guess'),
in_c_key=False) in_c_key=False)
class GpuDnnConv3dDesc(GpuOp):
    """This Op builds a 3d convolution descriptor for use in the other
    3d convolution operations.

    See the doc of :func:`dnn_conv` for a description of the parameters.
    """

    __props__ = ('border_mode', 'subsample', 'conv_mode')

    def c_headers(self):
        return ['cudnn.h', 'cudnn_helper.h']

    def c_header_dirs(self):
        return [os.path.dirname(__file__)]

    def c_libraries(self):
        return ['cudnn']

    def c_compiler(self):
        return NVCC_compiler

    def __init__(self, border_mode, subsample=(1, 1, 1), conv_mode='conv'):
        # Fixed: the default subsample was (1, 1), which always tripped
        # the len(subsample) == 3 assertion below.
        # A single int means the same padding on all three dimensions.
        if isinstance(border_mode, int):
            border_mode = (border_mode, border_mode, border_mode)
        if isinstance(border_mode, tuple):
            pad_d, pad_h, pad_w = map(int, border_mode)
            border_mode = (pad_d, pad_h, pad_w)
        if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or
                border_mode in ('valid', 'full')):
            raise ValueError(
                'invalid border_mode {}, which must be either '
                '"valid", "full", an integer or a pair of'
                ' integers'.format(border_mode))
        self.border_mode = border_mode
        assert len(subsample) == 3
        self.subsample = subsample
        assert conv_mode in ('conv', 'cross')
        self.conv_mode = conv_mode

    def make_node(self, img_shape, kern_shape):
        if img_shape.type.ndim != 1 or img_shape.type.dtype != 'int64':
            raise TypeError('img must be 1D shape tensor')
        if kern_shape.type.ndim != 1 or kern_shape.type.dtype != 'int64':
            raise TypeError('kern must be 1D shape tensor')
        return Apply(self, [img_shape, kern_shape],
                     [CDataType("cudnnConvolutionDescriptor_t")()])

    def c_code(self, node, name, inputs, outputs, sub):
        img_shape, kern_shape = inputs
        desc, = outputs

        if isinstance(self.border_mode, tuple):
            pad_d_spec, pad_h_spec, pad_w_spec = map(int, self.border_mode)
            # Fixed: `assert pad_d_spec, pad_h_spec >= 0 and ...` used the
            # comma as an assert *message*, so no padding value was checked.
            assert pad_d_spec >= 0 and pad_h_spec >= 0 and pad_w_spec >= 0
            bmode = 2
        else:
            pad_d_spec = pad_h_spec = pad_w_spec = 0
            if self.border_mode == "valid":
                bmode = 1
            else:
                assert self.border_mode == "full"
                bmode = 0

        if self.conv_mode == 'conv':
            conv_flag = 'CUDNN_CONVOLUTION'
        else:
            conv_flag = 'CUDNN_CROSS_CORRELATION'

        return """
{
  cudnnStatus_t err;
  int pad[3];

  if ((err = cudnnCreateConvolutionDescriptor(&%(desc)s)) != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_MemoryError, "could not allocate convolution "
                 "descriptor: %%s", cudnnGetErrorString(err));
    %(fail)s
  }

  if (%(bmode)d == 2) {
    pad[0] = %(pad_d_spec)d;
    pad[1] = %(pad_h_spec)d;
    pad[2] = %(pad_w_spec)d;
  } else if (%(bmode)d == 1) {
    pad[0] = 0;
    pad[1] = 0;
    pad[2] = 0;
  } else if (%(bmode)d == 0) {
    /* 'full' mode: pad each spatial dim by (kernel size - 1). */
    pad[0] = *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 2) - 1;
    pad[1] = *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 3) - 1;
    pad[2] = *(npy_int64 *)PyArray_GETPTR1(%(kern_shape)s, 4) - 1;
  } else {
    PyErr_SetString(PyExc_ValueError, "bad border mode");
    %(fail)s
  }

#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 30
  int subsample[3];
  subsample[0] = %(subsd)d;
  subsample[1] = %(subsx)d;
  subsample[2] = %(subsy)d;
  int upscale[3] = {1, 1, 1};

  err = cudnnSetConvolutionNdDescriptor(
  %(desc)s,
  3,
  pad, subsample, upscale,
  %(conv_flag)s
  );
#else
  PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: CUDNN_VERSION must be >= 30");
  /* Fixed: fail here; previously execution fell through with err still
     CUDNN_STATUS_SUCCESS from the allocation above, hiding the error. */
  %(fail)s
#endif

  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError, "could not set op descriptor: %%s",
                 cudnnGetErrorString(err));
    %(fail)s
  }
}
""" % dict(name=name, img_shape=img_shape, kern_shape=kern_shape, desc=desc,
           bmode=bmode, conv_flag=conv_flag, fail=sub['fail'],
           subsd=self.subsample[0], subsx=self.subsample[1],
           subsy=self.subsample[2],
           pad_d_spec=pad_d_spec, pad_h_spec=pad_h_spec, pad_w_spec=pad_w_spec)

    def c_code_cache_version(self):
        # Caching deliberately disabled while this Op is in development.
        return None
        # return (2, version())
# scalar constants # scalar constants
_zero = constant(numpy.asarray(0.0, dtype='float32')) _zero = constant(numpy.asarray(0.0, dtype='float32'))
_one = constant(numpy.asarray(1.0, dtype='float32')) _one = constant(numpy.asarray(1.0, dtype='float32'))
# Scalar int32 constants: default `nb_dim` values for the conv Ops
# (4 -> 2d convolution on 4D tensors, 5 -> 3d convolution on 5D tensors).
_ifour = constant(numpy.asarray(4, dtype='int32'))
_ifive = constant(numpy.asarray(5, dtype='int32'))
def ensure_float(val, default, name): def ensure_float(val, default, name):
if val is None: if val is None:
...@@ -377,6 +537,19 @@ def ensure_float(val, default, name): ...@@ -377,6 +537,19 @@ def ensure_float(val, default, name):
raise TypeError("%s: type is not float32" % (name,)) raise TypeError("%s: type is not float32" % (name,))
return val return val
def ensure_int(val, default, name):
    """Return `val` coerced to an int32 scalar Variable.

    When `val` is None, a clone of `default` is returned instead.
    Raises TypeError when the value is not a scalar or not int32.
    Mirrors :func:`ensure_float`.
    """
    if val is None:
        return default.clone()
    if not isinstance(val, Variable):
        val = constant(val)
    if getattr(val, 'ndim', None) == 0:
        val = as_scalar(val)
    if not isinstance(val.type, theano.scalar.Scalar):
        raise TypeError("%s: expected a scalar value" % (name,))
    if not val.type.dtype == 'int32':
        raise TypeError("%s: type is not int32" % (name,))
    return val
class GpuDnnConv(DnnBase, COp): class GpuDnnConv(DnnBase, COp):
""" """
...@@ -449,7 +622,7 @@ class GpuDnnConv(DnnBase, COp): ...@@ -449,7 +622,7 @@ class GpuDnnConv(DnnBase, COp):
return [alg_def, alg_choose_def, alg_choose_time_def] + inpl_def return [alg_def, alg_choose_def, alg_choose_time_def] + inpl_def
def make_node(self, img, kern, output, desc, alpha=None, beta=None): def make_node(self, img, kern, output, desc, alpha=None, beta=None, nb_dim=None):
img = as_cuda_ndarray_variable(img) img = as_cuda_ndarray_variable(img)
kern = as_cuda_ndarray_variable(kern) kern = as_cuda_ndarray_variable(kern)
output = as_cuda_ndarray_variable(output) output = as_cuda_ndarray_variable(output)
...@@ -466,12 +639,13 @@ class GpuDnnConv(DnnBase, COp): ...@@ -466,12 +639,13 @@ class GpuDnnConv(DnnBase, COp):
alpha = ensure_float(alpha, _one, 'alpha') alpha = ensure_float(alpha, _one, 'alpha')
beta = ensure_float(beta, _zero, 'beta') beta = ensure_float(beta, _zero, 'beta')
nb_dim = ensure_int(nb_dim, _ifour, 'nb_dim')
return Apply(self, [img, kern, output, desc, alpha, beta], return Apply(self, [img, kern, output, desc, alpha, beta, nb_dim],
[output.type()]) [output.type()])
def grad(self, inp, grads): def grad(self, inp, grads):
img, kerns, output, desc, alpha, beta = inp img, kerns, output, desc, alpha, beta, nb_dim = inp
top, = grads top, = grads
top = gpu_contiguous(top) top = gpu_contiguous(top)
...@@ -480,13 +654,14 @@ class GpuDnnConv(DnnBase, COp): ...@@ -480,13 +654,14 @@ class GpuDnnConv(DnnBase, COp):
d_kerns = GpuDnnConvGradW()(img, top, gpu_alloc_empty(*kerns.shape), desc) d_kerns = GpuDnnConvGradW()(img, top, gpu_alloc_empty(*kerns.shape), desc)
d_alpha = grad_not_implemented(self, 4, alpha) d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta) d_beta = grad_not_implemented(self, 5, beta)
d_nb_dim = grad_not_implemented(self, 6, nb_dim)
return [d_img * alpha, d_kerns * alpha, top * beta, return [d_img * alpha, d_kerns * alpha, top * beta,
DisconnectedType()(), d_alpha, d_beta] DisconnectedType()(), d_alpha, d_beta, d_nb_dim]
def connection_pattern(self, node): def connection_pattern(self, node):
# not connected to desc # not connected to desc
return [[1], [1], [1], [0], [1], [1]] return [[1], [1], [1], [0], [1], [1], [1]]
@staticmethod @staticmethod
def get_out_shape(ishape, kshape, border_mode, subsample): def get_out_shape(ishape, kshape, border_mode, subsample):
...@@ -523,6 +698,101 @@ class GpuDnnConv(DnnBase, COp): ...@@ -523,6 +698,101 @@ class GpuDnnConv(DnnBase, COp):
return [shape[2]] return [shape[2]]
class GpuDnnConv3d(GpuDnnConv):
    """
    The forward 3d convolution.

    :param image:
    :param kernel:
    :param descr: the convolution descriptor
    """
    __props__ = ('workmem', 'inplace')
    __input_name__ = ('image', 'kernel', 'output',
                      'descriptor', 'alpha', 'beta')

    def __init__(self, workmem=None, inplace=False):
        """
        :param workmem: accepted for interface compatibility but ignored:
            only 'none' works with cudnn conv 3d, so it is always forced.
        """
        # Only workmem = 'none' works with cudnn conv 3d.
        super(GpuDnnConv3d, self).__init__(workmem='none', inplace=inplace)

    def make_node(self, img, kern, output, desc, alpha=None, beta=None,
                  nb_dim=None):
        img = as_cuda_ndarray_variable(img)
        kern = as_cuda_ndarray_variable(kern)
        output = as_cuda_ndarray_variable(output)
        if img.type.ndim != 5:
            raise TypeError('img must be 5D tensor')
        if kern.type.ndim != 5:
            raise TypeError('kern must be 5D tensor')
        if output.type.ndim != 5:
            raise TypeError('output must be a 5D tensor')

        if not isinstance(desc.type, CDataType) \
                or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
            raise TypeError('desc must be cudnnConvolutionDescriptor_t')

        alpha = ensure_float(alpha, _one, 'alpha')
        beta = ensure_float(beta, _zero, 'beta')
        # 3d convolution works on 5D tensors.
        nb_dim = ensure_int(nb_dim, _ifive, 'nb_dim')

        return Apply(self, [img, kern, output, desc, alpha, beta, nb_dim],
                     [output.type()])

    def grad(self, inp, grads):
        img, kerns, output, desc, alpha, beta, nb_dim = inp
        top, = grads

        top = gpu_contiguous(top)

        d_img = GpuDnnConvGrad3dI()(kerns, top, gpu_alloc_empty(*img.shape),
                                    desc)
        # Fixed: the gradient-on-weights op is named GpuDnnConv3dGradW;
        # the previous name GpuDnnConvGrad3dW does not exist in this file.
        d_kerns = GpuDnnConv3dGradW()(img, top, gpu_alloc_empty(*kerns.shape),
                                      desc)
        d_alpha = grad_not_implemented(self, 4, alpha)
        d_beta = grad_not_implemented(self, 5, beta)
        d_nb_dim = grad_not_implemented(self, 6, nb_dim)

        return [d_img * alpha, d_kerns * alpha, top * beta,
                DisconnectedType()(), d_alpha, d_beta, d_nb_dim]

    @staticmethod
    def get_out_shape(ishape, kshape, border_mode, subsample):
        """
        This function computes the output shape for a 3d convolution with
        the specified parameters. `ishape` and `kshape` can be symbolic
        or scalar.
        """
        b = ishape[0]   # Number of inputs
        d = ishape[2]   # Depth of input feature maps
        h = ishape[3]   # Height of input feature maps
        w = ishape[4]   # Width of input feature maps
        nb = kshape[0]  # Number of output feature maps
        kd = kshape[2]  # Depth of each filter
        kh = kshape[3]  # Height of each filter
        kw = kshape[4]  # Width of each filter

        sd, sh, sw = subsample

        if border_mode == 'full':
            padd = kd - 1
            padh = kh - 1
            padw = kw - 1
        elif isinstance(border_mode, tuple):
            padd, padh, padw = border_mode
        else:
            assert border_mode == 'valid'
            padd = 0
            padh = 0
            padw = 0

        return (
            b, nb,
            (d + 2 * padd - kd) // sd + 1,
            (h + 2 * padh - kh) // sh + 1,
            (w + 2 * padw - kw) // sw + 1,
        )
class GpuDnnConvGradW(DnnBase, COp): class GpuDnnConvGradW(DnnBase, COp):
""" """
The convolution gradient with respect to the weights. The convolution gradient with respect to the weights.
...@@ -554,7 +824,7 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -554,7 +824,7 @@ class GpuDnnConvGradW(DnnBase, COp):
self.inplace = False self.inplace = False
def grad(self, inp, grads): def grad(self, inp, grads):
img, top, output, desc, alpha, beta = inp img, top, output, desc, alpha, beta, nb_dim = inp
kerns, = grads kerns, = grads
kerns = gpu_contiguous(kerns) kerns = gpu_contiguous(kerns)
...@@ -563,13 +833,14 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -563,13 +833,14 @@ class GpuDnnConvGradW(DnnBase, COp):
d_top = GpuDnnConv()(img, kerns, gpu_alloc_empty(*top.shape), desc) d_top = GpuDnnConv()(img, kerns, gpu_alloc_empty(*top.shape), desc)
d_alpha = grad_not_implemented(self, 4, alpha) d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta) d_beta = grad_not_implemented(self, 5, beta)
d_nb_dim = grad_not_implemented(self, 6, nb_dim)
return (d_img * alpha, d_top * alpha, kerns * beta, return (d_img * alpha, d_top * alpha, kerns * beta,
DisconnectedType()(), d_alpha, d_beta) DisconnectedType()(), d_alpha, d_beta, d_nb_dim)
def connection_pattern(self, node): def connection_pattern(self, node):
# not connected to desc # not connected to desc
return [[1], [1], [1], [0], [1], [1]] return [[1], [1], [1], [0], [1], [1], [1]]
def get_op_params(self): def get_op_params(self):
if self.inplace: if self.inplace:
...@@ -597,7 +868,7 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -597,7 +868,7 @@ class GpuDnnConvGradW(DnnBase, COp):
return inplace_def + [alg_def, alg_choose_def] return inplace_def + [alg_def, alg_choose_def]
def make_node(self, img, topgrad, output, desc, alpha=None, beta=None): def make_node(self, img, topgrad, output, desc, alpha=None, beta=None, nb_dim=None):
img = as_cuda_ndarray_variable(img) img = as_cuda_ndarray_variable(img)
topgrad = as_cuda_ndarray_variable(topgrad) topgrad = as_cuda_ndarray_variable(topgrad)
output = as_cuda_ndarray_variable(output) output = as_cuda_ndarray_variable(output)
...@@ -614,13 +885,70 @@ class GpuDnnConvGradW(DnnBase, COp): ...@@ -614,13 +885,70 @@ class GpuDnnConvGradW(DnnBase, COp):
alpha = ensure_float(alpha, _one, 'alpha') alpha = ensure_float(alpha, _one, 'alpha')
beta = ensure_float(beta, _zero, 'beta') beta = ensure_float(beta, _zero, 'beta')
nb_dim = ensure_int(nb_dim, _ifour, 'nb_dim')
return Apply(self, [img, topgrad, output, desc, alpha, beta], return Apply(self, [img, topgrad, output, desc, alpha, beta, nb_dim],
[output.type()]) [output.type()])
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
return [shape[2]] return [shape[2]]
class GpuDnnConv3dGradW(GpuDnnConvGradW):
    """
    The 3d convolution gradient with respect to the weights.

    :param image:
    :param kernel:
    :param descr: the convolution descriptor
    """
    __props__ = ('workmem', 'inplace',)
    __input_name__ = ('image', 'grad', 'output', 'descriptor', 'alpha', 'beta')

    def __init__(self, inplace=False, workmem=None):
        # Only workmem = 'none' works with cudnn conv 3d.
        # Fixed: the previous call was `__init` (missing the trailing
        # underscores), which raised AttributeError at construction time.
        super(GpuDnnConv3dGradW, self).__init__(inplace=inplace,
                                                workmem='none')

    def grad(self, inp, grads):
        img, top, output, desc, alpha, beta, nb_dim = inp
        kerns, = grads
        kerns = gpu_contiguous(kerns)
        d_img = GpuDnnConvGrad3dI()(kerns, top, gpu_alloc_empty(*img.shape),
                                    desc)
        d_top = GpuDnnConv3d()(img, kerns, gpu_alloc_empty(*top.shape), desc)
        d_alpha = grad_not_implemented(self, 4, alpha)
        d_beta = grad_not_implemented(self, 5, beta)
        d_nb_dim = grad_not_implemented(self, 6, nb_dim)

        return (d_img * alpha, d_top * alpha, kerns * beta,
                DisconnectedType()(), d_alpha, d_beta, d_nb_dim)

    def make_node(self, img, topgrad, output, desc, alpha=None, beta=None,
                  nb_dim=None):
        img = as_cuda_ndarray_variable(img)
        topgrad = as_cuda_ndarray_variable(topgrad)
        output = as_cuda_ndarray_variable(output)
        if img.type.ndim != 5:
            raise TypeError('img must be 5D tensor')
        if topgrad.type.ndim != 5:
            raise TypeError('topgrad must be 5D tensor')
        if output.type.ndim != 5:
            raise TypeError('output must be 5D tensor')

        if not isinstance(desc.type, CDataType) \
                or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
            raise TypeError('desc must be cudnnConvolutionDescriptor_t')

        alpha = ensure_float(alpha, _one, 'alpha')
        beta = ensure_float(beta, _zero, 'beta')
        # 3d convolution works on 5D tensors.
        nb_dim = ensure_int(nb_dim, _ifive, 'nb_dim')

        return Apply(self, [img, topgrad, output, desc, alpha, beta, nb_dim],
                     [output.type()])
class GpuDnnConvGradI(DnnBase, COp): class GpuDnnConvGradI(DnnBase, COp):
""" """
...@@ -652,7 +980,7 @@ class GpuDnnConvGradI(DnnBase, COp): ...@@ -652,7 +980,7 @@ class GpuDnnConvGradI(DnnBase, COp):
self.workmem = 'none' self.workmem = 'none'
def grad(self, inp, grads): def grad(self, inp, grads):
kerns, top, output, desc, alpha, beta = inp kerns, top, output, desc, alpha, beta, nb_dim = inp
img, = grads img, = grads
img = gpu_contiguous(img) img = gpu_contiguous(img)
...@@ -661,13 +989,14 @@ class GpuDnnConvGradI(DnnBase, COp): ...@@ -661,13 +989,14 @@ class GpuDnnConvGradI(DnnBase, COp):
d_top = GpuDnnConv()(img, kerns, gpu_alloc_empty(*top.shape), desc) d_top = GpuDnnConv()(img, kerns, gpu_alloc_empty(*top.shape), desc)
d_alpha = grad_not_implemented(self, 4, alpha) d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta) d_beta = grad_not_implemented(self, 5, beta)
d_nb_dim = grad_not_implemented(self, 6, nb_dim)
return (d_kerns * alpha, d_top * alpha, img * beta, return (d_kerns * alpha, d_top * alpha, img * beta,
DisconnectedType()(), d_alpha, d_beta) DisconnectedType()(), d_alpha, d_beta, d_nb_dim)
def connection_pattern(self, node): def connection_pattern(self, node):
# not connected to desc # not connected to desc
return [[1], [1], [1], [0], [1], [1]] return [[1], [1], [1], [0], [1], [1], [1]]
def get_op_params(self): def get_op_params(self):
if self.inplace: if self.inplace:
...@@ -695,7 +1024,7 @@ class GpuDnnConvGradI(DnnBase, COp): ...@@ -695,7 +1024,7 @@ class GpuDnnConvGradI(DnnBase, COp):
return inplace_def + [alg_def, alg_choose_def] return inplace_def + [alg_def, alg_choose_def]
def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None): def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None, nb_dim=None):
kern = as_cuda_ndarray_variable(kern) kern = as_cuda_ndarray_variable(kern)
topgrad = as_cuda_ndarray_variable(topgrad) topgrad = as_cuda_ndarray_variable(topgrad)
output = as_cuda_ndarray_variable(output) output = as_cuda_ndarray_variable(output)
...@@ -712,14 +1041,72 @@ class GpuDnnConvGradI(DnnBase, COp): ...@@ -712,14 +1041,72 @@ class GpuDnnConvGradI(DnnBase, COp):
alpha = ensure_float(alpha, _one, 'alpha') alpha = ensure_float(alpha, _one, 'alpha')
beta = ensure_float(beta, _zero, 'beta') beta = ensure_float(beta, _zero, 'beta')
nb_dim = ensure_int(nb_dim, _ifour, 'nb_dim')
return Apply(self, [kern, topgrad, output, desc, alpha, beta], return Apply(self, [kern, topgrad, output, desc, alpha, beta, nb_dim],
[output.type()]) [output.type()])
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
return [shape[2]] return [shape[2]]
class GpuDnnConvGrad3dI(GpuDnnConvGradI):
    """
    The 3d convolution gradient with respect to the inputs.

    :param image:
    :param kernel:
    :param descr: the convolution descriptor
    """
    __props__ = ('inplace',)
    __input_name__ = ('kernel', 'grad', 'output',
                      'descriptor', 'alpha', 'beta')

    def __init__(self, inplace=False):
        # Fixed: super() must be given this class, not its parent;
        # super(GpuDnnConvGradI, self) skipped GpuDnnConvGradI.__init__.
        super(GpuDnnConvGrad3dI, self).__init__(inplace)

    def grad(self, inp, grads):
        kerns, top, output, desc, alpha, beta, nb_dim = inp
        img, = grads
        img = gpu_contiguous(img)

        # Fixed: the gradient-on-weights op is named GpuDnnConv3dGradW;
        # the previous name GpuDnnConvGrad3dW does not exist in this file.
        d_kerns = GpuDnnConv3dGradW()(img, top, gpu_alloc_empty(*kerns.shape),
                                      desc)
        d_top = GpuDnnConv3d()(img, kerns, gpu_alloc_empty(*top.shape), desc)
        d_alpha = grad_not_implemented(self, 4, alpha)
        d_beta = grad_not_implemented(self, 5, beta)
        d_nb_dim = grad_not_implemented(self, 6, nb_dim)

        return (d_kerns * alpha, d_top * alpha, img * beta,
                DisconnectedType()(), d_alpha, d_beta, d_nb_dim)

    def make_node(self, kern, topgrad, output, desc, alpha=None, beta=None,
                  nb_dim=None):
        kern = as_cuda_ndarray_variable(kern)
        topgrad = as_cuda_ndarray_variable(topgrad)
        output = as_cuda_ndarray_variable(output)
        if kern.type.ndim != 5:
            raise TypeError('kern must be 5D tensor')
        if topgrad.type.ndim != 5:
            raise TypeError('topgrad must be 5D tensor')
        # Fixed: this op works on 5D tensors; the previous check required
        # ndim != 4 while its own message demanded a 5D tensor.
        if output.type.ndim != 5:
            raise TypeError('output must be 5D tensor')

        if not isinstance(desc.type, CDataType) \
                or desc.type.ctype != 'cudnnConvolutionDescriptor_t':
            raise TypeError('desc must be cudnnConvolutionDescriptor_t')

        alpha = ensure_float(alpha, _one, 'alpha')
        beta = ensure_float(beta, _zero, 'beta')
        # 3d convolution works on 5D tensors.
        nb_dim = ensure_int(nb_dim, _ifive, 'nb_dim')

        return Apply(self, [kern, topgrad, output, desc, alpha, beta, nb_dim],
                     [output.type()])
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode='conv', direction_hint=None, workmem=None): conv_mode='conv', direction_hint=None, workmem=None):
""" """
...@@ -804,6 +1191,179 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -804,6 +1191,179 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
return GpuDnnConv(workmem=workmem)(img, kerns, out, desc) return GpuDnnConv(workmem=workmem)(img, kerns, out, desc)
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
             conv_mode='conv', direction_hint=None, workmem=None):
    """
    GPU convolution using cuDNN from NVIDIA.
    The memory layout to use is 'bc01', that is 'batch', 'channel',
    'first dim', 'second dim' in that order.
    :param img: images to do the convolution over
    :param kerns: convolution filters
    :param border_mode: one of 'valid', 'full'; additionally, the padding size
        could be directly specified by an integer or a pair of integers
    :param subsample: perform subsampling of the output (default: (1, 1))
    :param conv_mode: perform convolution (kernels flipped) or cross-correlation.
        One of 'conv', 'cross'. (default: 'conv')
    :param direction_hint: Used by graph optimizers to change algorithm choice.
        By default, GpuDnnConv will be used to carry out the convolution.
        If border_mode is 'valid', subsample is (1,1) and direction_hint is
        'bprop weights', it will use GpuDnnConvGradW.
        If border_mode is 'full', subsample is (1,1) and direction_hint is
        *not* 'forward!', it will use GpuDnnConvGradI.
        This parameter is used internally by graph optimizers and may be
        removed at any time without a deprecation period. You have been warned.
    :param workmem: Specify the amount of working memory allowed.
        More memory is usually faster. One of 'none', 'small' or
        'large'. (default is None which takes its value from
        :attr:`config.dnn.conv.workmem`)
    :warning: The cuDNN library only works with GPU that have a compute
      capability of 3.0 or higer.  This means that older GPU will not
      work with this Op.
    """
    # NOTE(review): this definition repeats the dnn_conv shown earlier in the
    # diff and would shadow it at import time — confirm the duplication is
    # intended before merging.
    fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None)
    if (border_mode == 'valid' and subsample == (1, 1) and
        direction_hint == 'bprop weights'):
        # Special case: We are asked to use GpuDnnConvGradW. We need to set
        # up a suitable 'fake' convolution to compute the gradient for.
        # Swap batch and channel axes so the "weights" of the fake conv are
        # the quantity whose gradient we want.
        img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3))
        if conv_mode == 'conv':
            # We need to flip manually. These 'kerns' are not the kernels
            # that would be flipped by conv_mode='conv' in GpuDnnConvGradW.
            kerns = kerns[:, :, ::-1, ::-1]
        kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
        # Output spatial size of a 'valid' convolution: in - kern + 1.
        shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
        shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
        out = gpu_alloc_empty(shape_i(kerns, 1, fgraph),
                              shape_i(img, 1, fgraph), shape2, shape3)
        desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
                              conv_mode='cross')(img.shape, out.shape)
        conv = GpuDnnConvGradW()(img, kerns, out, desc)
        return as_cuda_ndarray_variable(conv.dimshuffle(1, 0, 2, 3))
    elif (border_mode == 'full' and subsample == (1, 1) and
          direction_hint != 'forward!'):
        # Special case: We can be faster by using GpuDnnConvGradI to compute
        # the full convolution as the backward pass of a valid convolution.
        # We just need to set up a suitable 'fake' valid convolution.
        img = gpu_contiguous(img)  # cudnn v1 and v2 rc3 need contiguous data
        kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3))
        conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
        # Output spatial size of a 'full' convolution: in + kern - 1.
        shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
        shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
        out = gpu_alloc_empty(shape_i(img, 0, fgraph),
                              shape_i(kerns, 1, fgraph), shape2, shape3)
        desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
                              conv_mode=conv_mode)(out.shape, kerns.shape)
        return GpuDnnConvGradI()(kerns, img, out, desc)
    # Standard case: We use GpuDnnConv with suitable padding.
    # contig_version will return a gpu_contiguous copy
    # if the img contains negative strides
    img = gpu_contiguous(img)
    kerns = gpu_contiguous(kerns)
    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
                          conv_mode=conv_mode)(img.shape, kerns.shape)
    desc_op = desc.owner.op
    out_shp = GpuDnnConv.get_out_shape(img.shape, kerns.shape,
                                       desc_op.border_mode,
                                       desc_op.subsample)
    out = gpu_alloc_empty(*out_shp)
    return GpuDnnConv(workmem=workmem)(img, kerns, out, desc)
def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
               conv_mode='conv', direction_hint=None, workmem=None):
    """
    GPU 3d convolution using cuDNN from NVIDIA.

    The memory layout to use is 'bct01', that is 'batch', 'channel',
    'first dim', 'second dim', 'third dim' in that order.

    :param img: images to do the convolution over
    :param kerns: convolution filters
    :param border_mode: one of 'valid', 'full'; additionally, the padding
        size could be directly specified by an integer or a triple of
        integers
    :param subsample: perform subsampling of the output (default: (1, 1, 1))
    :param conv_mode: perform convolution (kernels flipped) or
        cross-correlation. One of 'conv', 'cross'. (default: 'conv')
    :param direction_hint: Used by graph optimizers to change algorithm choice.
        By default, GpuDnnConv3d will be used to carry out the convolution.
        If border_mode is 'valid', subsample is (1,1,1) and direction_hint is
        'bprop weights', it will use GpuDnnConv3dGradW.
        If border_mode is 'full', subsample is (1,1,1) and direction_hint is
        *not* 'forward!', it will use GpuDnnConvGrad3dI.
        This parameter is used internally by graph optimizers and may be
        removed at any time without a deprecation period. You have been warned.
    :param workmem: Specify the amount of working memory allowed.
        More memory is usually faster. One of 'none', 'small' or
        'large'. (default is None which takes its value from
        :attr:`config.dnn.conv.workmem`). Note that GpuDnnConv3d currently
        forces 'none' whatever is passed here.

    :warning: The cuDNN library only works with GPU that have a compute
      capability of 3.0 or higher. This means that older GPU will not
      work with this Op.
    :warning: dnn_conv3d only works with cuDNN library 3.0
    """
    fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None)
    if (border_mode == 'valid' and subsample == (1, 1, 1) and
            direction_hint == 'bprop weights'):
        # Special case: We are asked to use GpuDnnConv3dGradW. We need to set
        # up a suitable 'fake' convolution to compute the gradient for.
        img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4))
        if conv_mode == 'conv':
            # We need to flip manually. These 'kerns' are not the kernels
            # that would be flipped by conv_mode='conv' in GpuDnnConv3dGradW.
            kerns = kerns[:, :, ::-1, ::-1, ::-1]
        kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3, 4))
        shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
        shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
        shape4 = shape_i(img, 4, fgraph) - shape_i(kerns, 4, fgraph) + 1
        # Fixed: the last dimension previously reused shape3 instead of
        # shape4, allocating a wrongly-shaped output buffer.
        out = gpu_alloc_empty(shape_i(kerns, 1, fgraph),
                              shape_i(img, 1, fgraph), shape2, shape3, shape4)
        desc = GpuDnnConv3dDesc(border_mode='valid', subsample=(1, 1, 1),
                                conv_mode='cross')(img.shape, out.shape)
        conv = GpuDnnConv3dGradW()(img, kerns, out, desc)
        return as_cuda_ndarray_variable(conv.dimshuffle(1, 0, 2, 3, 4))
    elif (border_mode == 'full' and subsample == (1, 1, 1) and
          direction_hint != 'forward!'):
        # Special case: We can be faster by using GpuDnnConvGrad3dI to compute
        # the full convolution as the backward pass of a valid convolution.
        # We just need to set up a suitable 'fake' valid convolution.
        img = gpu_contiguous(img)  # cudnn v1 and v2 rc3 need contiguous data
        kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3, 4))
        conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
        shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
        shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
        shape4 = shape_i(img, 4, fgraph) + shape_i(kerns, 4, fgraph) - 1
        out = gpu_alloc_empty(shape_i(img, 0, fgraph),
                              shape_i(kerns, 1, fgraph), shape2, shape3, shape4)
        desc = GpuDnnConv3dDesc(border_mode='valid', subsample=(1, 1, 1),
                                conv_mode=conv_mode)(out.shape, kerns.shape)
        # Fixed: the gradient-on-inputs op is named GpuDnnConvGrad3dI;
        # GpuDnnConv3dGradI does not exist in this file.
        return GpuDnnConvGrad3dI()(kerns, img, out, desc)
    # Standard case: We use GpuDnnConv3d with suitable padding.
    # contig_version will return a gpu_contiguous copy
    # if the img contains negative strides
    img = gpu_contiguous(img)
    kerns = gpu_contiguous(kerns)
    desc = GpuDnnConv3dDesc(border_mode=border_mode, subsample=subsample,
                            conv_mode=conv_mode)(img.shape, kerns.shape)
    desc_op = desc.owner.op
    out_shp = GpuDnnConv3d.get_out_shape(img.shape, kerns.shape,
                                         desc_op.border_mode,
                                         desc_op.subsample)
    out = gpu_alloc_empty(*out_shp)
    return GpuDnnConv3d(workmem=workmem)(img, kerns, out, desc)
class GpuDnnPoolDesc(GpuOp): class GpuDnnPoolDesc(GpuOp):
""" """
This Op builds a pooling descriptor for use in the other This Op builds a pooling descriptor for use in the other
......
...@@ -33,6 +33,60 @@ c_set_tensor4d(CudaNdarray *var, cudnnTensorDescriptor_t desc) { ...@@ -33,6 +33,60 @@ c_set_tensor4d(CudaNdarray *var, cudnnTensorDescriptor_t desc) {
return 0; return 0;
} }
static int
c_set_tensorNd(CudaNdarray *var, int dim, cudnnTensorDescriptor_t desc) {
  /* Fill the cudnn tensor descriptor `desc` from `var`'s dims/strides.
     Returns 0 on success, -1 (with a Python error set) on failure. */
  int strides[dim];
  for (int i = 0; i < dim; ++i)
  {
    if (CudaNdarray_HOST_STRIDES(var)[i])
      strides[i] = CudaNdarray_HOST_STRIDES(var)[i];
    else
    {
      /* Stride is 0 (presumably a broadcastable dim -- confirm with
         CudaNdarray semantics): substitute the C-contiguous stride,
         since cudnn requires a fully-specified positive stride array. */
      strides[i] = 1;
      for (int j = i + 1; j < dim; ++j)
        strides[i] *= CudaNdarray_HOST_DIMS(var)[j];
    }
  }
  cudnnStatus_t err = cudnnSetTensorNdDescriptor(desc, CUDNN_DATA_FLOAT, dim,
                                                 CudaNdarray_HOST_DIMS(var),
                                                 strides);
  if (err != CUDNN_STATUS_SUCCESS) {
    /* Fixed: the two string literals previously concatenated to
       "...: %sdim=%d" with no separator, producing a garbled message. */
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set tensorNd descriptor: %s, dim=%d",
                 cudnnGetErrorString(err), dim);
    return -1;
  }
  return 0;
}
static int
c_set_filterNd(CudaNdarray *var, int dim, cudnnFilterDescriptor_t desc) {
  /* Fill the cudnn filter descriptor `desc` from `var`'s dims.
     Only dims are passed (no strides), so the filter must be C-contiguous.
     Returns 0 on success, -1 (with a Python error set) on failure. */
  if (!CudaNdarray_is_c_contiguous(var)) {
    PyErr_SetString(PyExc_ValueError,
                    "Only contiguous filters (kernels) are supported.");
    return -1;
  }
  cudnnStatus_t err = cudnnSetFilterNdDescriptor(desc, CUDNN_DATA_FLOAT, dim,
                                                 CudaNdarray_HOST_DIMS(var));
  if (err != CUDNN_STATUS_SUCCESS) {
    PyErr_Format(PyExc_RuntimeError,
                 "Could not set filter descriptor: %s."
                 " dims= %d",
                 cudnnGetErrorString(err), dim);
    return -1;
  }
  return 0;
}
static int static int
c_set_filter(CudaNdarray *var, cudnnFilterDescriptor_t desc) { c_set_filter(CudaNdarray *var, cudnnFilterDescriptor_t desc) {
if (!CudaNdarray_is_c_contiguous(var)) { if (!CudaNdarray_is_c_contiguous(var)) {
......
...@@ -7,9 +7,9 @@ cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns); ...@@ -7,9 +7,9 @@ cudnnFilterDescriptor_t APPLY_SPECIFIC(kerns);
and the algorithms, if any, that were selected according to these dimensions and the algorithms, if any, that were selected according to these dimensions
and according to the amount of memory available at that time. and according to the amount of memory available at that time.
*/ */
int APPLY_SPECIFIC(previous_input_shape)[4]; int APPLY_SPECIFIC(previous_input_shape)[5];
int APPLY_SPECIFIC(previous_kerns_shape)[4]; int APPLY_SPECIFIC(previous_kerns_shape)[5];
int APPLY_SPECIFIC(previous_output_shape)[4]; int APPLY_SPECIFIC(previous_output_shape)[5];
cudnnConvolutionFwdAlgo_t APPLY_SPECIFIC(previous_algo); cudnnConvolutionFwdAlgo_t APPLY_SPECIFIC(previous_algo);
cudnnConvolutionBwdFilterAlgo_t APPLY_SPECIFIC(previous_bwd_f_algo); cudnnConvolutionBwdFilterAlgo_t APPLY_SPECIFIC(previous_bwd_f_algo);
cudnnConvolutionBwdDataAlgo_t APPLY_SPECIFIC(previous_bwd_d_algo); cudnnConvolutionBwdDataAlgo_t APPLY_SPECIFIC(previous_bwd_d_algo);
...@@ -21,12 +21,12 @@ APPLY_SPECIFIC(input) = NULL; ...@@ -21,12 +21,12 @@ APPLY_SPECIFIC(input) = NULL;
APPLY_SPECIFIC(output) = NULL; APPLY_SPECIFIC(output) = NULL;
APPLY_SPECIFIC(kerns) = NULL; APPLY_SPECIFIC(kerns) = NULL;
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) { if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(input))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor " PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err))); "(inp): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL; FAIL;
} }
if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) { if ((APPLY_SPECIFIC(err) = cudnnCreateTensorDescriptor(&APPLY_SPECIFIC(output))) != CUDNN_STATUS_SUCCESS) {
PyErr_Format(PyExc_MemoryError, "could not allocate tensor4d descriptor " PyErr_Format(PyExc_MemoryError, "could not allocate tensor descriptor "
"(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err))); "(out): %s", cudnnGetErrorString(APPLY_SPECIFIC(err)));
FAIL; FAIL;
} }
...@@ -36,7 +36,7 @@ if ((APPLY_SPECIFIC(err) = cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns))) ...@@ -36,7 +36,7 @@ if ((APPLY_SPECIFIC(err) = cudnnCreateFilterDescriptor(&APPLY_SPECIFIC(kerns)))
FAIL; FAIL;
} }
for (int i = 0; i < 4; i++) for (int i = 0; i < 5; i++)
{ {
APPLY_SPECIFIC(previous_input_shape)[i] = 0; APPLY_SPECIFIC(previous_input_shape)[i] = 0;
APPLY_SPECIFIC(previous_kerns_shape)[i] = 0; APPLY_SPECIFIC(previous_kerns_shape)[i] = 0;
......
...@@ -3,7 +3,8 @@ ...@@ -3,7 +3,8 @@
int int
APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
CudaNdarray *om, cudnnConvolutionDescriptor_t desc, CudaNdarray *om, cudnnConvolutionDescriptor_t desc,
float alpha, float beta, CudaNdarray **output) { float alpha, float beta, int nb_dim, CudaNdarray **output) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(kerns)[1]) { if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(kerns)[1]) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
...@@ -11,37 +12,49 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -11,37 +12,49 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
return 1; return 1;
} }
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1) if (c_set_tensorNd(input, nb_dim, APPLY_SPECIFIC(input)) == -1)
return 1; return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filterNd(kerns, nb_dim, APPLY_SPECIFIC(kerns)) == -1)
return 1; return 1;
/* if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1) */
/* return 1; */
/* if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) */
/* return 1; */
#ifdef CONV_INPLACE #ifdef CONV_INPLACE
Py_XDECREF(*output); Py_XDECREF(*output);
*output = om; *output = om;
Py_INCREF(*output); Py_INCREF(*output);
#else #else
if (CudaNdarray_prep_output(output, 4, CudaNdarray_HOST_DIMS(om)) != 0) if (CudaNdarray_prep_output(output, nb_dim, CudaNdarray_HOST_DIMS(om)) != 0)
return 1; return 1;
if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*output, om)) if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*output, om))
return 1; return 1;
#endif #endif
if (c_set_tensor4d(*output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensorNd(*output, nb_dim, APPLY_SPECIFIC(output)) == -1)
return 1; return 1;
/* if (c_set_tensor4d(*output, APPLY_SPECIFIC(output)) == -1) */
/* return 1; */
{ {
size_t worksize; size_t worksize;
void *workspace; void *workspace;
cudnnConvolutionFwdAlgo_t chosen_algo; cudnnConvolutionFwdAlgo_t chosen_algo;
for (int i = 0; (i < nb_dim); i++)
std::cout << i << "/" << nb_dim << ", "
<< CudaNdarray_HOST_DIMS(input)[i] << ", "
<< CudaNdarray_HOST_DIMS(kerns)[i] << std::endl;
if (CHOOSE_ALGO) if (CHOOSE_ALGO)
{ {
// Check if the input and the kernels have the same shape as they have // Check if the input and the kernels have the same shape as they have
// last time the apply node was executed // last time the apply node was executed
bool same_shapes = true; bool same_shapes = true;
for (int i = 0; (i < 4) && same_shapes; i++) for (int i = 0; (i < nb_dim) && same_shapes; i++)
{ {
same_shapes &= (CudaNdarray_HOST_DIMS(input)[i] != same_shapes &= (CudaNdarray_HOST_DIMS(input)[i] !=
APPLY_SPECIFIC(previous_input_shape)[i]); APPLY_SPECIFIC(previous_input_shape)[i]);
...@@ -115,7 +128,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -115,7 +128,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
// Store the shapes of the inputs and kernels as well as the chosen // Store the shapes of the inputs and kernels as well as the chosen
// algorithm for future use. // algorithm for future use.
APPLY_SPECIFIC(previous_algo) = chosen_algo; APPLY_SPECIFIC(previous_algo) = chosen_algo;
for (int i = 0; i < 4; i++) for (int i = 0; i < nb_dim; i++)
{ {
APPLY_SPECIFIC(previous_input_shape)[i] = APPLY_SPECIFIC(previous_input_shape)[i] =
CudaNdarray_HOST_DIMS(input)[i]; CudaNdarray_HOST_DIMS(input)[i];
...@@ -142,7 +155,8 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -142,7 +155,8 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
// If the chosen implementation is FFT, validate that it can be used // If the chosen implementation is FFT, validate that it can be used
// on the current data and default on a safe implementation if it // on the current data and default on a safe implementation if it
// can't. // can't.
if (chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) // Following code is 2d-specific, but it is fine as ftt is define only for 2d-filters
if (chosen_algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT && nb_dim == 4)
{ {
// Extract the properties of the convolution descriptor // Extract the properties of the convolution descriptor
...@@ -186,12 +200,12 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -186,12 +200,12 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
chosen_algo, chosen_algo,
&worksize); &worksize);
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
std::cout << "here" << std::endl;
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuDnnConv: error getting worksize: %s", "GpuDnnConv: error getting worksize: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
} }
workspace = get_work_mem(worksize); workspace = get_work_mem(worksize);
if (workspace == NULL && worksize != 0) if (workspace == NULL && worksize != 0)
return 1; return 1;
...@@ -208,6 +222,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns, ...@@ -208,6 +222,7 @@ APPLY_SPECIFIC(conv_fwd)(CudaNdarray *input, CudaNdarray *kerns,
APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(*output)); APPLY_SPECIFIC(output), CudaNdarray_DEV_DATA(*output));
} }
if (err != CUDNN_STATUS_SUCCESS) { if (err != CUDNN_STATUS_SUCCESS) {
std::cout << "here2" << std::endl;
PyErr_Format(PyExc_RuntimeError, "GpuDnnConv: error doing operation: %s", PyErr_Format(PyExc_RuntimeError, "GpuDnnConv: error doing operation: %s",
cudnnGetErrorString(err)); cudnnGetErrorString(err));
return 1; return 1;
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
int int
APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output, APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
CudaNdarray *im, cudnnConvolutionDescriptor_t desc, CudaNdarray *im, cudnnConvolutionDescriptor_t desc,
float alpha, float beta, CudaNdarray **input) { float alpha, float beta, int nb_dim, CudaNdarray **input) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (CudaNdarray_HOST_DIMS(im)[1] != CudaNdarray_HOST_DIMS(kerns)[1]) { if (CudaNdarray_HOST_DIMS(im)[1] != CudaNdarray_HOST_DIMS(kerns)[1]) {
...@@ -12,9 +12,14 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output, ...@@ -12,9 +12,14 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
return 1; return 1;
} }
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1) /* if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1) */
/* return 1; */
/* if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) */
/* return 1; */
if (c_set_tensorNd(output, nb_dim, APPLY_SPECIFIC(output)) == -1)
return 1; return 1;
if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) if (c_set_filterNd(kerns, nb_dim, APPLY_SPECIFIC(kerns)) == -1)
return 1; return 1;
#ifdef CONV_INPLACE #ifdef CONV_INPLACE
...@@ -22,13 +27,16 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output, ...@@ -22,13 +27,16 @@ APPLY_SPECIFIC(conv_gi)(CudaNdarray *kerns, CudaNdarray *output,
*input = im; *input = im;
Py_INCREF(*input); Py_INCREF(*input);
#else #else
if (CudaNdarray_prep_output(input, 4, CudaNdarray_HOST_DIMS(im)) != 0) if (CudaNdarray_prep_output(input, nb_dim, CudaNdarray_HOST_DIMS(im)) != 0)
return 1; return 1;
if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*input, im)) if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*input, im))
return 1; return 1;
#endif #endif
if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1) /* if (c_set_tensor4d(*input, APPLY_SPECIFIC(input)) == -1) */
/* return 1; */
if (c_set_tensorNd(*input, nb_dim, APPLY_SPECIFIC(input)) == -1)
return 1; return 1;
{ {
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
int int
APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output, APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
CudaNdarray *km, cudnnConvolutionDescriptor_t desc, CudaNdarray *km, cudnnConvolutionDescriptor_t desc,
float alpha, float beta, CudaNdarray **kerns) { float alpha, float beta, int nb_dim, CudaNdarray **kerns) {
cudnnStatus_t err = CUDNN_STATUS_SUCCESS; cudnnStatus_t err = CUDNN_STATUS_SUCCESS;
if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(km)[1]) { if (CudaNdarray_HOST_DIMS(input)[1] != CudaNdarray_HOST_DIMS(km)[1]) {
...@@ -12,9 +12,14 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output, ...@@ -12,9 +12,14 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
return 1; return 1;
} }
if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1) /* if (c_set_tensor4d(input, APPLY_SPECIFIC(input)) == -1) */
/* return 1; */
/* if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1) */
/* return 1; */
if (c_set_tensorNd(input, nb_dim, APPLY_SPECIFIC(input)) == -1)
return 1; return 1;
if (c_set_tensor4d(output, APPLY_SPECIFIC(output)) == -1) if (c_set_tensorNd(output, nb_dim, APPLY_SPECIFIC(output)) == -1)
return 1; return 1;
#ifdef CONV_INPLACE #ifdef CONV_INPLACE
...@@ -22,13 +27,15 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output, ...@@ -22,13 +27,15 @@ APPLY_SPECIFIC(conv_gw)(CudaNdarray *input, CudaNdarray *output,
*kerns = km; *kerns = km;
Py_INCREF(*kerns); Py_INCREF(*kerns);
#else #else
if (CudaNdarray_prep_output(kerns, 4, CudaNdarray_HOST_DIMS(km)) != 0) if (CudaNdarray_prep_output(kerns, nb_dim, CudaNdarray_HOST_DIMS(km)) != 0)
return 1; return 1;
if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*kerns, km)) if (beta != 0.0 && CudaNdarray_CopyFromCudaNdarray(*kerns, km))
return 1; return 1;
#endif #endif
if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1) /* if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1) */
/* return 1; */
if (c_set_filterNd(*kerns, nb_dim, APPLY_SPECIFIC(kerns)) == -1)
return 1; return 1;
{ {
......
...@@ -13,6 +13,7 @@ from theano.tensor.signal.downsample import max_pool_2d ...@@ -13,6 +13,7 @@ from theano.tensor.signal.downsample import max_pool_2d
from theano.tensor.signal.downsample import DownsampleFactorMaxGrad from theano.tensor.signal.downsample import DownsampleFactorMaxGrad
import theano.sandbox.cuda.dnn as dnn import theano.sandbox.cuda.dnn as dnn
from theano.sandbox.cuda.basic_ops import GpuAllocEmpty, gpu_alloc_empty from theano.sandbox.cuda.basic_ops import GpuAllocEmpty, gpu_alloc_empty
from theano.sandbox.cuda import float32_shared_constructor as shared
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
import theano.sandbox.cuda as cuda import theano.sandbox.cuda as cuda
...@@ -763,6 +764,58 @@ def test_dnn_conv_grad(): ...@@ -763,6 +764,58 @@ def test_dnn_conv_grad():
utt.verify_grad(dconvw, [img_val, kern_val, out_val]) utt.verify_grad(dconvw, [img_val, kern_val, out_val])
def test_conv3d_valid():
    """Compare dnn_conv3d in 'valid' mode against the reference conv3D.

    Skipped when cudnn (with 3d convolution support) is unavailable.
    """
    # Fixes vs. original: removed leftover Python-2 debug prints, used the
    # standard dnn_available.msg skip message (the old one had a stray quote),
    # and dropped an exact duplicate of one parameter set.
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)

    def run_conv3d_valid(inputs_shape, filters_shape,
                         subsample=(1, 1, 1)):
        # Run one case: build random inputs/filters, evaluate both the
        # reference CPU conv3D and the cudnn conv3d, and compare results.
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')

        inputs = shared(inputs_val)
        filters = shared(filters_val)
        # conv3D requires an explicit bias; zeros make it a no-op.
        bias = shared(numpy.zeros(filters_shape[0]).astype('float32'))

        # Reference conv3D uses channel-last layout (b, d, h, w, c),
        # hence the dimshuffles on input/filters and on its output.
        conv_ref = theano.tensor.nnet.conv3D(V=inputs.dimshuffle(0, 2, 3, 4, 1),
                                             W=filters.dimshuffle(0, 2, 3, 4, 1),
                                             b=bias, d=subsample)
        # conv_mode='cross' matches conv3D, which does not flip the filters.
        conv = dnn.dnn_conv3d(img=inputs, kerns=filters,
                              border_mode="valid", subsample=subsample,
                              conv_mode='cross')

        f_ref = theano.function([], conv_ref.dimshuffle(0, 4, 1, 2, 3))
        f = theano.function([], conv, mode=mode_with_gpu)

        utt.assert_allclose(f_ref(), f())

    run_conv3d_valid(inputs_shape=(128, 3, 5, 5, 5),
                     filters_shape=(64, 3, 1, 2, 4))
    run_conv3d_valid(inputs_shape=(16, 4, 20, 12, 15),
                     filters_shape=(10, 4, 6, 12, 4),
                     subsample=(2, 2, 2))
    run_conv3d_valid(inputs_shape=(16, 1, 20, 12, 15),
                     filters_shape=(10, 1, 6, 12, 4),
                     subsample=(3, 3, 3))
    run_conv3d_valid(inputs_shape=(16, 2, 20, 12, 15),
                     filters_shape=(10, 2, 6, 12, 4),
                     subsample=(3, 3, 3))
    run_conv3d_valid(inputs_shape=(16, 1, 20, 12, 15),
                     filters_shape=(10, 1, 6, 12, 4),
                     subsample=(3, 2, 1))
    run_conv3d_valid(inputs_shape=(16, 1, 20, 12, 15),
                     filters_shape=(10, 1, 6, 12, 4),
                     subsample=(1, 2, 3))
def test_version(): def test_version():
if not cuda.dnn.dnn_available(): if not cuda.dnn.dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg) raise SkipTest(cuda.dnn.dnn_available.msg)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论