提交 07bc5550 authored 作者: Gijs van Tulder's avatar Gijs van Tulder

Add dnn_conv3d and friends to the gpuarray backend.

上级 6452bbd4
......@@ -962,6 +962,122 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
return gpu_dnn_conv(algo=algo)(img, kerns, out, desc)
def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
               conv_mode='conv', direction_hint=None,
               algo='none', precision=None):
    """
    GPU 3D convolution using cuDNN from NVIDIA.

    The memory layout to use is 'bc012', that is 'batch', 'channel',
    'first dim', 'second dim', 'third dim' in that order.

    Parameters
    ----------
    img
        Images to do the convolution over.
    kerns
        Convolution filters.
    border_mode
        One of 'valid', 'full', 'half'; additionally, the padding size
        could be directly specified by an integer or a triple of integers.
    subsample
        Perform subsampling of the output (default: (1, 1, 1)).
    conv_mode
        Perform convolution (kernels flipped) or cross-correlation.
        One of 'conv', 'cross' (default: 'conv').
    direction_hint
        Used by graph optimizers to change algorithm choice.
        By default, GpuDnnConv will be used to carry out the convolution.
        If border_mode is 'valid', subsample is (1, 1, 1) and direction_hint
        is 'bprop weights', it will use GpuDnnConvGradW.
        If border_mode is 'full', subsample is (1, 1, 1) and direction_hint
        is *not* 'forward!', it will use GpuDnnConvGradI.
        This parameter is used internally by graph optimizers and may be
        removed at any time without a deprecation period. You have been
        warned.
    algo
        Convolution implementation to use. Only 'none' is implemented
        for the conv3d (default: 'none').
    precision : {'as_input_f32', 'as_input', 'float16', 'float32', 'float64'}
        Description of the dtype in which the computation of the convolution
        should be done. With 'as_input' the upcast of the img and kerns
        dtypes is used; 'as_input_f32' does the same but computes in
        'float32' when that upcast is 'float16'. Default is the value of
        :attr:`config.dnn.conv.precision`.

    Returns
    -------
    The symbolic output of the cuDNN op selected for the requested
    convolution (GpuDnnConv, GpuDnnConvGradW or GpuDnnConvGradI).

    .. warning:: The cuDNN library only works with GPUs that have a compute
        capability of 3.0 or higher. This means that older GPUs will not
        work with this Op.

    """
    # Establish dtype in which to perform the computation of the convolution
    if precision is None:
        precision = theano.config.dnn.conv.precision
    if precision == 'as_input' or precision == 'as_input_f32':
        nprec = theano.scalar.upcast(img.dtype, kerns.dtype)
        if nprec == 'float16' and precision == 'as_input_f32':
            precision = 'float32'
        else:
            precision = nprec

    fgraph = getattr(img, 'fgraph', None) or getattr(kerns, 'fgraph', None)
    ctx_name = infer_context_name(img, kerns)
    if (border_mode == 'valid' and subsample == (1, 1, 1) and
            direction_hint == 'bprop weights'):
        # Special case: We are asked to use GpuDnnConvGradW. We need to set
        # up a suitable 'fake' convolution to compute the gradient for.
        img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3, 4))
        if conv_mode == 'conv':
            # We need to flip manually. These 'kerns' are not the kernels
            # that would be flipped by conv_mode='conv' in GpuDnnConvGradW.
            # All three spatial axes must be reversed for a 3D convolution.
            kerns = kerns[:, :, ::-1, ::-1, ::-1]
        kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3, 4))
        # Output extent of a 'valid' convolution along each spatial axis.
        shape2 = shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1
        shape3 = shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1
        shape4 = shape_i(img, 4, fgraph) - shape_i(kerns, 4, fgraph) + 1
        out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(
            shape_i(kerns, 1, fgraph),
            shape_i(img, 1, fgraph), shape2, shape3, shape4)
        desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1),
                              conv_mode='cross', precision=precision)(out.shape)
        conv = gpu_dnn_conv_gradW()(img, kerns, out, desc)
        # Swap the first two axes back to undo the dimshuffle above.
        return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3, 4), ctx_name)

    elif (border_mode == 'full' and subsample == (1, 1, 1) and
          direction_hint != 'forward!'):
        # Special case: We can be faster by using GpuDnnConvGradI to compute
        # the full convolution as the backward pass of a valid convolution.
        # We just need to set up a suitable 'fake' valid convolution.
        img = gpu_contiguous(img)  # cudnn v2 rc3 need contiguous data
        kerns = gpu_contiguous(kerns.dimshuffle(1, 0, 2, 3, 4))
        conv_mode = 'cross' if conv_mode == 'conv' else 'conv'
        # Output extent of a 'full' convolution along each spatial axis.
        shape2 = shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1
        shape3 = shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1
        shape4 = shape_i(img, 4, fgraph) + shape_i(kerns, 4, fgraph) - 1
        out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(
            shape_i(img, 0, fgraph),
            shape_i(kerns, 1, fgraph),
            shape2, shape3, shape4)
        desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1),
                              conv_mode=conv_mode, precision=precision)(kerns.shape)
        return gpu_dnn_conv_gradI()(kerns, img, out, desc)

    # Standard case: We use GpuDnnConv with suitable padding.
    # Both operands are passed through gpu_contiguous before being handed
    # to the op.
    img = gpu_contiguous(img)
    kerns = gpu_contiguous(kerns)
    desc = gpu_dnn_conv_desc(border_mode=border_mode, subsample=subsample,
                             conv_mode=conv_mode, precision=precision)(kerns.shape)
    desc_op = desc.owner.op
    # We can use Shape_i and bypass the infer_shape here as this is on
    # the input of node and it will always be present.
    ishape = [shape_i_op(i)(img) for i in range(img.ndim)]
    kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
    out_shp = get_conv_output_shape(ishape, kshape,
                                    desc_op.border_mode,
                                    desc_op.subsample)
    out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*out_shp)
    return gpu_dnn_conv(algo=algo)(img, kerns, out, desc)
def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
subsample=(1, 1), conv_mode='conv'):
ctx_name = infer_context_name(img, topgrad)
......@@ -976,6 +1092,20 @@ def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
return gpu_dnn_conv_gradW()(img, topgrad, out, desc)
def dnn_gradweight3d(img, topgrad, kerns_shp, border_mode='valid',
                     subsample=(1, 1, 1), conv_mode='conv'):
    """
    Build the cuDNN filter-gradient op (GpuDnnConvGradW) for a 3D convolution.

    Parameters
    ----------
    img
        Input images; moved to the inferred GPU context and made contiguous.
    topgrad
        Gradient flowing in from the convolution output; treated like `img`.
    kerns_shp
        Shape of the filters. It is converted with `as_tensor_variable`,
        used to build the convolution descriptor and unpacked as the
        dimensions of the output buffer.
    border_mode, subsample, conv_mode
        Forwarded unchanged to the convolution descriptor.

    Returns
    -------
    The symbolic output of `gpu_dnn_conv_gradW`.

    """
    context = infer_context_name(img, topgrad)

    # Both operands must live on the same context and be contiguous.
    gpu_img = gpu_contiguous(as_gpuarray_variable(img, context))
    gpu_topgrad = gpu_contiguous(as_gpuarray_variable(topgrad, context))

    filter_shape = as_tensor_variable(kerns_shp)
    make_desc = gpu_dnn_conv_desc(border_mode=border_mode,
                                  subsample=subsample,
                                  conv_mode=conv_mode)
    conv_desc = make_desc(filter_shape)

    # Uninitialized buffer with the filters' shape that receives the gradient.
    grad_buffer = gpu_alloc_empty(context, dtype=gpu_img.dtype)(*filter_shape)
    return gpu_dnn_conv_gradW()(gpu_img, gpu_topgrad, grad_buffer, conv_desc)
def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
subsample=(1, 1), conv_mode='conv'):
ctx_name = infer_context_name(kerns, topgrad)
......@@ -990,6 +1120,20 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
return gpu_dnn_conv_gradI()(kerns, topgrad, out, desc)
def dnn_gradinput3d(kerns, topgrad, img_shp, border_mode='valid',
                    subsample=(1, 1, 1), conv_mode='conv'):
    """
    Build the cuDNN input-gradient op (GpuDnnConvGradI) for a 3D convolution.

    Parameters
    ----------
    kerns
        Convolution filters; moved to the inferred GPU context and made
        contiguous. Their symbolic shape is used to build the descriptor.
    topgrad
        Gradient flowing in from the convolution output; treated like
        `kerns`.
    img_shp
        Shape of the images. It is converted with `as_tensor_variable` and
        unpacked as the dimensions of the output buffer.
    border_mode, subsample, conv_mode
        Forwarded unchanged to the convolution descriptor.

    Returns
    -------
    The symbolic output of `gpu_dnn_conv_gradI`.

    """
    context = infer_context_name(kerns, topgrad)

    # Both operands must live on the same context and be contiguous.
    gpu_kerns = gpu_contiguous(as_gpuarray_variable(kerns, context))
    gpu_topgrad = gpu_contiguous(as_gpuarray_variable(topgrad, context))

    image_shape = as_tensor_variable(img_shp)
    make_desc = gpu_dnn_conv_desc(border_mode=border_mode,
                                  subsample=subsample,
                                  conv_mode=conv_mode)
    conv_desc = make_desc(gpu_kerns.shape)

    # Uninitialized buffer with the images' shape that receives the gradient.
    grad_buffer = gpu_alloc_empty(context, gpu_kerns.dtype)(*image_shape)
    return gpu_dnn_conv_gradI()(gpu_kerns, gpu_topgrad, grad_buffer, conv_desc)
class GpuDnnPoolDesc(Op):
"""
......
......@@ -779,6 +779,201 @@ def test_dnn_conv_grad():
utt.verify_grad(dconvw, [img_val, kern_val, out_val])
def get_conv3d_test_cases():
    """
    Return an iterator over the 3D convolution test configurations.

    Each item is a tuple ``(shapes, border_mode, conv_mode)`` where
    ``shapes`` is a list ``[input_shape, filter_shape, subsample]``.
    The general shapes are combined with every border mode; the extra
    "kernel bigger than image" shapes are only combined with 'full'.
    """
    unit_stride = (1, 1, 1)
    volume = (8, 1, 20, 12, 15)
    volume_filter = (5, 1, 6, 12, 4)

    # [input_shape, filter_shape, subsample]
    general_shapes = [
        [(128, 3, 5, 5, 5), (64, 3, 1, 2, 4), unit_stride],
        [(8, 4, 20, 12, 15), (5, 4, 6, 12, 4), (2, 2, 2)],
        [volume, volume_filter, (3, 3, 3)],
        # NOTE: the next two entries are identical; both are kept so the
        # generated sequence of cases stays unchanged.
        [volume, volume_filter, (3, 2, 1)],
        [volume, volume_filter, (3, 2, 1)],
        # Test with 1x1x1 filters
        [(8, 1, 10, 10, 10), (10, 1, 1, 1, 1), unit_stride],
        # Test with dimensions larger than 1024 (thread block dim)
        [(1025, 1, 2, 3, 4), (5, 1, 1, 2, 3), unit_stride],
        [(8, 1, 2, 3, 4), (1025, 1, 1, 2, 3), unit_stride],
        [(8, 1025, 2, 3, 4), (5, 1025, 1, 1, 2), unit_stride],
        [(8, 1, 1030, 3, 4), (5, 1, 1025, 1, 1), unit_stride],
        [(8, 1, 2, 1030, 4), (5, 1, 2, 1025, 1), unit_stride],
        [(8, 1, 2, 3, 1030), (5, 1, 1, 2, 1025), unit_stride],
        # The equivalent of this caused a crash with conv2d
        [(1, 1, 1, 44800, 1), (6, 1, 1, 1, 1), unit_stride],
    ]

    # With border mode 'full', test with kernel bigger than image in some/all
    # dimensions
    tiny_volume = (6, 2, 2, 2, 2)
    oversized_kernel_shapes = [
        [tiny_volume, (4, 2, 3, 1, 1), unit_stride],
        [tiny_volume, (4, 2, 1, 3, 1), unit_stride],
        [tiny_volume, (4, 2, 1, 1, 3), unit_stride],
        [tiny_volume, (4, 2, 5, 5, 5), unit_stride],
    ]

    all_border_modes = ['valid', 'full', 'half', (1, 2, 3), (3, 2, 1), 1, 2]
    all_conv_modes = ['conv', 'cross']

    return chain(
        product(general_shapes, all_border_modes, all_conv_modes),
        product(oversized_kernel_shapes, ['full'], all_conv_modes))
def test_conv3d_fwd():
    """
    Check the forward pass of `dnn.dnn_conv3d` against `conv3D`.

    Generator test: yields one `run_conv3d_fwd` call per configuration
    returned by `get_conv3d_test_cases`. Skipped when cuDNN is unavailable.
    """
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)

    def run_conv3d_fwd(inputs_shape, filters_shape, subsample,
                       border_mode, conv_mode):
        # Random data, scaled down to prevent very large absolute errors
        # due to float rounding.
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')
        inputs_val /= 10
        filters_val /= 10

        inputs = theano.shared(inputs_val)
        filters = theano.shared(filters_val)
        bias = theano.shared(numpy.zeros(filters_shape[0]).astype('float32'))

        # cuDNN implementation under test.
        cudnn_out = dnn.dnn_conv3d(img=inputs, kerns=filters,
                                   border_mode=border_mode,
                                   subsample=subsample,
                                   conv_mode=conv_mode)
        f = theano.function([], cudnn_out, mode=mode_with_gpu)

        # The reference gets the filters flipped along the three spatial
        # axes when a true convolution ('conv') is requested.
        ref_filters = (filters[:, :, ::-1, ::-1, ::-1]
                       if conv_mode == 'conv' else filters)

        # The reference is applied to explicitly zero-padded inputs for
        # every border mode other than 'valid'.
        if border_mode == 'valid':
            ref_inputs = inputs
        else:
            if border_mode == 'full':
                pads = [filters_shape[axis] - 1 for axis in (2, 3, 4)]
            elif border_mode == 'half':
                pads = [filters_shape[axis] // 2 for axis in (2, 3, 4)]
            elif isinstance(border_mode, int):
                pads = [border_mode] * 3
            else:
                pads = border_mode
            # No padding on the batch and channel axes, symmetric elsewhere.
            pad_widths = [(0, 0), (0, 0)] + [(p, p) for p in pads]
            ref_inputs = theano.shared(
                numpy.pad(inputs_val, pad_widths, 'constant'))

        # Reference implementation; it is fed with the channel axis moved
        # last and its result is shuffled back to the 'bc012' layout.
        ref_out = theano.tensor.nnet.conv3D(
            V=ref_inputs.dimshuffle(0, 2, 3, 4, 1),
            W=ref_filters.dimshuffle(0, 2, 3, 4, 1),
            b=bias, d=subsample)
        f_ref = theano.function([], ref_out.dimshuffle(0, 4, 1, 2, 3),
                                mode="FAST_RUN")

        # Compare the results of the two implementations (reference first).
        expected = f_ref()
        actual = f()
        utt.assert_allclose(expected, actual)

    for shapes, border_mode, conv_mode in get_conv3d_test_cases():
        i_shape, f_shape, subsample = shapes
        yield (run_conv3d_fwd, i_shape, f_shape, subsample, border_mode,
               conv_mode)
def test_conv3d_bwd():
    """
    Check the gradients of `dnn.dnn_conv3d` against those of `conv3D`.

    Generator test: yields one `run_conv3d_bwd` call per configuration
    returned by `get_conv3d_test_cases`. Skipped when cuDNN is unavailable.
    """
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)

    def run_conv3d_bwd(inputs_shape, filters_shape, subsample,
                       border_mode, conv_mode):
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')

        inputs = theano.shared(inputs_val)
        filters = theano.shared(filters_val)
        bias = theano.shared(numpy.zeros(filters_shape[0]).astype('float32'))

        # Gradients of the cuDNN implementation w.r.t. inputs and filters.
        cudnn_out = dnn.dnn_conv3d(img=inputs, kerns=filters,
                                   border_mode=border_mode,
                                   subsample=subsample,
                                   conv_mode=conv_mode)
        grad_i, grad_w = theano.tensor.grad(cudnn_out.sum(),
                                            [inputs, filters])
        f = theano.function([], [grad_i, grad_w], mode=mode_with_gpu)

        # The reference gets the filters flipped along the three spatial
        # axes when a true convolution ('conv') is requested.
        ref_filters = (filters[:, :, ::-1, ::-1, ::-1]
                       if conv_mode == 'conv' else filters)

        # The reference is applied to explicitly zero-padded inputs for
        # every border mode other than 'valid'.
        needs_padding = border_mode != 'valid'
        if needs_padding:
            if border_mode == 'full':
                pads = [filters_shape[axis] - 1 for axis in (2, 3, 4)]
            elif border_mode == 'half':
                pads = [filters_shape[axis] // 2 for axis in (2, 3, 4)]
            elif isinstance(border_mode, int):
                pads = [border_mode] * 3
            else:
                pads = border_mode
            # No padding on the batch and channel axes, symmetric elsewhere.
            pad_widths = [(0, 0), (0, 0)] + [(p, p) for p in pads]
            ref_inputs = theano.shared(
                numpy.pad(inputs_val, pad_widths, 'constant'))
        else:
            ref_inputs = inputs

        # Reference implementation and its gradients.
        ref_out = theano.tensor.nnet.conv3D(
            V=ref_inputs.dimshuffle(0, 2, 3, 4, 1),
            W=ref_filters.dimshuffle(0, 2, 3, 4, 1),
            b=bias, d=subsample)
        grad_padded_i_ref, grad_w_ref = theano.tensor.grad(
            ref_out.sum(), [ref_inputs, filters])

        # Strip the padding border from the input gradient so that it lines
        # up with the gradient w.r.t. the unpadded inputs.
        if needs_padding:
            padded_shape = grad_padded_i_ref.shape
            grad_i_ref = grad_padded_i_ref[
                :, :,
                pads[0]:padded_shape[2] - pads[0],
                pads[1]:padded_shape[3] - pads[1],
                pads[2]:padded_shape[4] - pads[2]]
        else:
            grad_i_ref = grad_padded_i_ref

        f_ref = theano.function([], [grad_i_ref, grad_w_ref],
                                mode="FAST_RUN")

        # Compare the results of the two implementations (reference first).
        expected = f_ref()
        actual = f()
        # Needed for big size for some seed
        # raise rtol to make the test pass with more seed.
        utt.assert_allclose(expected[0], actual[0], rtol=2e-5)
        utt.assert_allclose(expected[1], actual[1], rtol=2e-5)

    for shapes, border_mode, conv_mode in get_conv3d_test_cases():
        i_shape, f_shape, subsample = shapes
        yield (run_conv3d_bwd, i_shape, f_shape, subsample, border_mode,
               conv_mode)
def test_version():
if not dnn.dnn_available(test_ctx_name):
raise SkipTest(dnn.dnn_available.msg)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论