Commit 4736c9b3 authored by Pascal Lamblin

Merge pull request #2665 from ballasn/conv2d_interface

New conv2d interface (work in progress)
...@@ -13,6 +13,9 @@ from theano.compile.ops import shape_i ...@@ -13,6 +13,9 @@ from theano.compile.ops import shape_i
from theano.tensor.nnet import SoftmaxGrad from theano.tensor.nnet import SoftmaxGrad
from theano.tensor.signal.downsample import ( from theano.tensor.signal.downsample import (
DownsampleFactorMax, MaxPoolGrad, AveragePoolGrad) DownsampleFactorMax, MaxPoolGrad, AveragePoolGrad)
from theano.tensor.opt import register_specialize_device
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable, from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
host_from_gpu, host_from_gpu,
...@@ -27,6 +30,12 @@ from theano.sandbox.cuda import gpu_seqopt, register_opt ...@@ -27,6 +30,12 @@ from theano.sandbox.cuda import gpu_seqopt, register_opt
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
from theano.tensor.nnet.abstract_conv2d import (AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs)
from theano.tensor.opt import register_specialize_device
def dnn_available(): def dnn_available():
if dnn_available.avail is None: if dnn_available.avail is None:
...@@ -1276,6 +1285,58 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), ...@@ -1276,6 +1285,58 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
return GpuDnnConv3d(algo=algo)(img, kerns, out, desc) return GpuDnnConv3d(algo=algo)(img, kerns, out, desc)
def dnn_gradweight(img, topgrad,
                   kerns_shp,
                   border_mode='valid', subsample=(1, 1),
                   conv_mode='conv'):
    """
    GPU convolution gradient with respect to weight using cuDNN from NVIDIA.

    The memory layout to use is 'bc01', that is 'batch', 'channel',
    'first dim', 'second dim' in that order.

    :param img: images of the forward convolution (4D, bc01 layout).
    :param topgrad: gradient w.r.t. the output of the forward convolution
        (4D, bc01 layout).
    :param kerns_shp: shape of the kernels whose gradient is computed;
        used both to build the convolution descriptor and to allocate
        the output buffer.
    :param border_mode: padding of the forward convolution, forwarded to
        :class:`GpuDnnConvDesc`.
    :param subsample: strides of the forward convolution, pair of int.
    :param conv_mode: 'conv' (kernels flipped) or 'cross' (correlation).

    :warning: The cuDNN library only works with GPU that have a compute
        capability of 3.0 or higher. This means that older GPU will not
        work with this Op.
    """
    # cuDNN kernels require contiguous inputs.
    img = gpu_contiguous(img)
    topgrad = gpu_contiguous(topgrad)
    kerns_shp = theano.tensor.as_tensor_variable(kerns_shp)
    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
                          conv_mode=conv_mode)(img.shape, kerns_shp)
    out = gpu_alloc_empty(*kerns_shp)
    return GpuDnnConvGradW()(img, topgrad, out, desc)
def dnn_gradinput(kerns, topgrad,
                  img_shp,
                  border_mode='valid', subsample=(1, 1),
                  conv_mode='conv'):
    """
    GPU convolution gradient with respect to input using cuDNN from NVIDIA.

    The memory layout to use is 'bc01', that is 'batch', 'channel',
    'first dim', 'second dim' in that order.

    :param kerns: kernels of the forward convolution (4D, bc01 layout).
    :param topgrad: gradient w.r.t. the output of the forward convolution
        (4D, bc01 layout).
    :param img_shp: shape of the images whose gradient is computed;
        used both to build the convolution descriptor and to allocate
        the output buffer.
    :param border_mode: padding of the forward convolution, forwarded to
        :class:`GpuDnnConvDesc`.
    :param subsample: strides of the forward convolution, pair of int.
    :param conv_mode: 'conv' (kernels flipped) or 'cross' (correlation).

    :warning: The cuDNN library only works with GPU that have a compute
        capability of 3.0 or higher. This means that older GPU will not
        work with this Op.
    """
    # cuDNN kernels require contiguous inputs.
    kerns = gpu_contiguous(kerns)
    topgrad = gpu_contiguous(topgrad)
    img_shp = theano.tensor.as_tensor_variable(img_shp)
    desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
                          conv_mode=conv_mode)(img_shp, kerns.shape)
    out = gpu_alloc_empty(*img_shp)
    return GpuDnnConvGradI()(kerns, topgrad, out, desc)
class GpuDnnPoolDesc(GpuOp): class GpuDnnPoolDesc(GpuOp):
""" """
This Op builds a pooling descriptor for use in the other pooling operations. This Op builds a pooling descriptor for use in the other pooling operations.
...@@ -2383,3 +2444,47 @@ if True: ...@@ -2383,3 +2444,47 @@ if True:
gpu_contiguous(ins[1]) gpu_contiguous(ins[1])
) )
return [out.dimshuffle(0, 1)] return [out.dimshuffle(0, 1)]
### AbstractConv Optimizations
@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights,
                  AbstractConv2d_gradInputs])
def local_abstractconv_cudnn(node):
    """
    Replace an abstract conv op (forward, gradWeights or gradInputs)
    by its cuDNN implementation when both inputs are on the GPU and
    cuDNN is available. Returns None when it does not apply.
    """
    inp1 = node.inputs[0]
    inp2 = node.inputs[1]
    # BUG FIX: the original test used `or` between the three negated
    # isinstance checks, which is always true (no op is an instance of
    # all three classes), so the optimizer never applied. Bail out only
    # when the op is none of the handled classes.
    if not isinstance(node.op, (AbstractConv2d,
                                AbstractConv2d_gradWeights,
                                AbstractConv2d_gradInputs)):
        return None
    if not isinstance(inp1.type, CudaNdarrayType) or \
            not isinstance(inp2.type, CudaNdarrayType):
        return None
    if not dnn_available():
        return None
    # BUG FIX: the op attribute defined by BaseAbstractConv2d is
    # `filter_flip`; the original read `filters_flip`, which raised
    # AttributeError.
    if node.op.filter_flip:
        conv_mode = 'conv'
    else:
        conv_mode = 'cross'
    if isinstance(node.op, AbstractConv2d):
        rval = dnn_conv(inp1, inp2,
                        border_mode=node.op.border_mode,
                        subsample=node.op.subsample,
                        direction_hint='forward',
                        conv_mode=conv_mode)
        return [rval]
    if isinstance(node.op, AbstractConv2d_gradWeights):
        # Kernel shape: (output channels, input channels, rows, cols)
        shape = (inp2.shape[1], inp1.shape[1],
                 node.inputs[2][0], node.inputs[2][1])
        rval = dnn_gradweight(inp1, inp2, shape,
                              border_mode=node.op.border_mode,
                              subsample=node.op.subsample,
                              conv_mode=conv_mode)
        return [rval]
    if isinstance(node.op, AbstractConv2d_gradInputs):
        # Image shape: (batch size, input channels, rows, cols)
        shape = (inp2.shape[0], inp1.shape[1],
                 node.inputs[2][0], node.inputs[2][1])
        rval = dnn_gradinput(inp1, inp2, shape,
                             border_mode=node.op.border_mode,
                             subsample=node.op.subsample,
                             conv_mode=conv_mode)
        return [rval]
...@@ -75,6 +75,12 @@ from theano.tensor import slinalg ...@@ -75,6 +75,12 @@ from theano.tensor import slinalg
from theano.tensor.nnet.Conv3D import Conv3D from theano.tensor.nnet.Conv3D import Conv3D
from theano.tests.breakpoint import PdbBreakpoint from theano.tests.breakpoint import PdbBreakpoint
from theano.tensor.nnet.abstract_conv2d import (BaseAbstractConv2d, AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs)
from theano.tensor.opt import register_specialize_device
try: try:
# We need to be able to import this file even if cuda isn't avail. # We need to be able to import this file even if cuda isn't avail.
from theano.sandbox.cuda import device_properties from theano.sandbox.cuda import device_properties
...@@ -2622,3 +2628,179 @@ optdb.register('local_inplace_gpu_sparse_block_outer', ...@@ -2622,3 +2628,179 @@ optdb.register('local_inplace_gpu_sparse_block_outer',
import theano.sandbox.cuda.extra_ops import theano.sandbox.cuda.extra_ops
### Move to Gpu optimization
@local_optimizer([gpu_from_host,
                  AbstractConv2d, AbstractConv2d_gradWeights, AbstractConv2d_gradInputs])
def local_conv2d_gpu_conv(node):
    """
    gpu_from_host(AbstractConv) -> AbstractConv(gpu_from_host)
    AbstractConv(host_from_gpu) -> host_from_gpu(AbstractConv)
    """
    if isinstance(node.op, GpuFromHost):
        # Case 1: the conv runs on the host and its result is transferred
        # to the GPU; move the conv itself to the GPU instead.
        host_input = node.inputs[0]
        if host_input.owner and isinstance(host_input.owner.op, BaseAbstractConv2d):
            conv = host_input.owner.op
            inps = list(host_input.owner.inputs)
            inps[0] = as_cuda_ndarray_variable(inps[0])
            inps[1] = as_cuda_ndarray_variable(inps[1])
            out = conv(*inps)
            # out is on the GPU because both inputs are.
            # Restore the broadcast pattern of the node being replaced.
            out = theano.tensor.patternbroadcast(out,
                                                 node.outputs[0].broadcastable)
            # GPU results are compared with a looser tolerance.
            out.values_eq_approx = values_eq_approx_high_tol
            return [out]
    if isinstance(node.op, BaseAbstractConv2d):
        # Case 2: conv(host_from_gpu) -> host_from_gpu(gpu_conv)
        inp1 = node.inputs[0]
        inp2 = node.inputs[1]
        if ((isinstance(inp1.type, CudaNdarrayType) and
             isinstance(inp2.type, CudaNdarrayType))):
            # Both inputs are already directly on the GPU, nothing to do
            return
        # An input counts as "on the GPU" if it is a GPU variable or if
        # it is a host transfer of a GPU variable.
        inp1_on_gpu = (isinstance(inp1.type, CudaNdarrayType) or
                       (inp1.owner and isinstance(inp1.owner.op, HostFromGpu)))
        inp2_on_gpu = (isinstance(inp2.type, CudaNdarrayType) or
                       (inp2.owner and isinstance(inp2.owner.op, HostFromGpu)))
        if inp1_on_gpu or inp2_on_gpu:
            conv = node.op
            inps = list(node.inputs)
            inps[0] = as_cuda_ndarray_variable(inps[0])
            inps[1] = as_cuda_ndarray_variable(inps[1])
            out = conv(*inps)
            # out is on the GPU because both inputs are.
            out = theano.tensor.patternbroadcast(
                out,
                node.outputs[0].broadcastable)
            out.values_eq_approx = values_eq_approx_high_tol
            # If the original output was on CPU, we have to transfer it
            if isinstance(node.outputs[0].type, tensor.TensorType):
                return [tensor.as_tensor_variable(out)]
            else:
                return [out]
register_opt()(local_conv2d_gpu_conv)
### Corrmm opt
@local_optimizer([AbstractConv2d])
def local_abstractconv_gemm(node):
    """
    Replace a forward AbstractConv2d whose inputs are on the GPU by a
    GEMM-based correlation (GpuCorrMM family), choosing among the
    forward, gradInputs and gradWeights kernels for speed.
    """
    if not isinstance(node.op, AbstractConv2d):
        return None
    img, kern = node.inputs
    if (not isinstance(img.type, CudaNdarrayType) or
            not isinstance(kern.type, CudaNdarrayType)):
        return None
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    if (border_mode == 'full') and (subsample == (1, 1)):
        # Full, unstrided convolution is computed as the input-gradient
        # of a valid correlation.
        if not node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1]
        # need to dimshuffle the kernel for full convolution
        kern = kern.dimshuffle(1, 0, 2, 3)
        # call GpuCorrMM_gradInputs
        rval = GpuCorrMM_gradInputs('valid', subsample)(
            gpu_contiguous(kern), gpu_contiguous(img))
    else:
        # need to flip the kernel if necessary
        if node.op.filter_flip:
            kern = kern[:, :, ::-1, ::-1]
        # By default use GpuCorrMM
        rval = GpuCorrMM(border_mode, subsample)(gpu_contiguous(img),
                                                 gpu_contiguous(kern))
        # call GpuCorrMM_gradWeights if good
        # (the latter is faster if batchsize * kernelHeight * kernelWidth
        # is larger than inputChannels * outputHeight * outputWidth.
        # GpuConv does not always store information on the batchsize and
        # channels, though, so we only use what information we have.)
        if ((subsample == (1,1)) and
                (node.op.imshp is not None) and
                (None not in node.op.imshp[-2:]) and
                (node.op.kshp is not None) and
                (None not in node.op.kshp)):
            # we know the kernel and output size
            prod1 = node.op.kshp[0] * node.op.kshp[1]
            prod2 = ((node.op.imshp[-2] - node.op.kshp[0] + 1) *
                     (node.op.imshp[-1] - node.op.kshp[1] + 1))
            if (None not in node.op.imshp[:1]):
                # we also know batchsize and input channels
                prod1 *= node.op.imshp[0]
                prod2 *= node.op.imshp[1]
            # compare to decide
            if prod1 > prod2:
                # (we need to wrap the result in as_cuda_ndarray_variable,
                # because we are not allowed to replace a CudaNdarray with
                # a DimShuffle instance in a graph optimization)
                rval = theano.sandbox.cuda.as_cuda_ndarray_variable(
                    GpuCorrMM_gradWeights(border_mode, subsample)(
                        gpu_contiguous(img.dimshuffle(1, 0, 2, 3)),
                        gpu_contiguous(kern.dimshuffle(1, 0, 2, 3))
                    ).dimshuffle(1, 0, 2, 3))
    return [rval]
@local_optimizer([AbstractConv2d_gradWeights])
def local_abstractconv_gradweight_gemm(node):
    """
    Lower an AbstractConv2d_gradWeights whose inputs live on the GPU
    to the GEMM-based GpuCorrMM_gradWeights implementation.
    """
    op = node.op
    if not isinstance(op, AbstractConv2d_gradWeights):
        return None
    img, topgrad, shape = node.inputs
    both_on_gpu = (isinstance(img.type, CudaNdarrayType) and
                   isinstance(topgrad.type, CudaNdarrayType))
    if not both_on_gpu:
        return None
    corr_op = GpuCorrMM_gradWeights(border_mode=op.border_mode,
                                    subsample=op.subsample)
    res = corr_op(gpu_contiguous(img), gpu_contiguous(topgrad), shape)
    if op.filter_flip:
        # Correlation produces the unflipped weight gradient; flip it
        # back to match the convolution convention.
        res = res[:, :, ::-1, ::-1]
    res = tensor.patternbroadcast(res, node.outputs[0].broadcastable)
    res = as_cuda_ndarray_variable(res)
    return [res]
@local_optimizer([AbstractConv2d_gradInputs])
def local_abstractconv_gradinputs_gemm(node):
    """
    Lower an AbstractConv2d_gradInputs whose inputs live on the GPU
    to the GEMM-based GpuCorrMM_gradInputs implementation.
    """
    op = node.op
    if not isinstance(op, AbstractConv2d_gradInputs):
        return None
    kern, topgrad, shape = node.inputs
    both_on_gpu = (isinstance(kern.type, CudaNdarrayType) and
                   isinstance(topgrad.type, CudaNdarrayType))
    if not both_on_gpu:
        return None
    if op.filter_flip:
        # Correlation expects unflipped kernels; flip them first.
        kern = kern[:, :, ::-1, ::-1]
    corr_op = GpuCorrMM_gradInputs(border_mode=op.border_mode,
                                   subsample=op.subsample)
    res = corr_op(gpu_contiguous(kern), gpu_contiguous(topgrad), shape)
    return [res]
# Register GPU convolution implementation
# They are tried in a specific order so we can control
# which ones take precedence over others.
abstractconv_groupopt = theano.gof.optdb.LocalGroupDB()
abstractconv_groupopt.__name__ = "gpu_abstractconv_opts"
register_specialize_device(abstractconv_groupopt, 'gpu', 'fast_compile')
# NOTE(review): the optimizers below are registered into `conv_groupopt`
# (defined elsewhere in this module), so `abstractconv_groupopt` is created
# and registered but never populated here — confirm whether these
# registrations should target `abstractconv_groupopt` instead.
# cuDNN is first, but only registered if cuDNN is available.
conv_groupopt.register('local_abstractconv_dnn', dnn.local_abstractconv_cudnn, 20,
                       'conv_dnn',
                       'gpu', 'fast_compile', 'fast_run', 'cudnn')
# The GEMM-based convolution comes last to catch all remaining cases.
# It can be disabled by excluding 'conv_gemm'.
conv_groupopt.register('local_abstractconv_gemm', local_abstractconv_gemm, 30,
                       'conv_gemm',
                       'gpu', 'fast_compile', 'fast_run')
conv_groupopt.register('local_abstractconv_gradweight_gemm',
                       local_abstractconv_gradweight_gemm, 30,
                       'conv_gemm',
                       'gpu', 'fast_compile', 'fast_run')
conv_groupopt.register('local_abstractconv_gradinputs_gemm',
                       local_abstractconv_gradinputs_gemm, 30,
                       'conv_gemm',
                       'gpu', 'fast_compile', 'fast_run')
import unittest
import numpy
import itertools
import theano
from theano.tests import unittest_tools as utt
import theano.tensor.nnet.abstract_conv2d as conv
from theano.sandbox.cuda import float32_shared_constructor as gpu_shared
from theano.compile import shared as cpu_shared
from theano.sandbox.cuda.dnn import dnn_available, dnn_conv, dnn_gradweight, dnn_gradinput
from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda
# Skip this whole test module when CUDA is not available.
if not cuda.cuda_available:
    raise SkipTest('Optional package cuda disabled')

# Always test with FAST_RUN optimizations; only the GPU inclusion differs.
if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
else:
    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
    # NOTE(review): this line uses theano.compile.get_default_mode while
    # the line above uses theano.compile.mode.get_default_mode —
    # presumably aliases of the same function; confirm.
    mode_without_gpu = theano.compile.get_default_mode().excluding('gpu')
class TestConv2d(unittest.TestCase):
    """
    Compare the conv2d abstract-op implementations (cuDNN, GEMM/corrMM,
    CPU) against a reference implementation over a grid of input/filter
    shapes, strides, border modes and filter-flip settings.
    """

    def setUp(self):
        super(TestConv2d, self).setUp()
        # inputs_shapes and filters_shapes are paired element-wise (via
        # zip) in the tests, not combined as a full product.
        self.inputs_shapes = [(8, 1, 12, 12), (8, 1, 18, 18), (2, 1, 4, 4),
                              (6, 1, 10, 11), (2, 1, 6, 5), (1, 5, 9, 9)]
        self.filters_shapes = [(5, 1, 2, 2), (4, 1, 3, 3), (2, 1, 3, 3),
                               (1, 1, 2, 5), (4, 1, 2, 2), (4, 5, 2, 2)]
        self.subsamples = [(1, 1), (2, 2), (2, 4)]
        self.border_modes = ["valid", "full", (0, 0), (1, 1), (5, 5), (5, 2)]
        self.filter_flip = [True, False]

    def get_output_shape(self, inputs_shape, filters_shape, subsample, border_mode):
        """Expected (batch, nfilters, rows, cols) of the forward conv."""
        # Normalize the symbolic border modes into explicit padding pairs.
        if border_mode == "valid":
            border_mode = (0, 0)
        if border_mode == "full":
            border_mode = (filters_shape[2] - 1, filters_shape[3] - 1)
        batch_size = inputs_shape[0]
        num_filters = filters_shape[0]
        return (batch_size, num_filters,) \
            + tuple(None if i is None or k is None
                    else ((i + 2 * pad - k) // d + 1)
                    for i, k, d, pad in zip(inputs_shape[2:], filters_shape[2:],
                                            subsample, border_mode))

    def run_fwd(self, inputs_shape, filters_shape, ref=dnn_conv,
                subsample=(1, 1), verify_grad=True, mode=mode_without_gpu,
                border_mode='valid', filter_flip=True, device='cpu', provide_shape=False):
        """Run the forward convolution and compare against `ref`."""
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')
        if device == 'gpu':
            inputs = gpu_shared(inputs_val)
            filters = gpu_shared(filters_val)
        else:
            inputs = theano.tensor.as_tensor_variable(cpu_shared(inputs_val))
            filters = theano.tensor.as_tensor_variable(cpu_shared(filters_val))
        if provide_shape:
            imshp = inputs_shape
            kshp = filters_shape
        else:
            imshp = None
            kshp = None
        # filter_flip=True corresponds to a true convolution ('conv'),
        # False to a cross-correlation ('cross') in the reference API.
        if filter_flip:
            conv_mode = 'conv'
        else:
            conv_mode = 'cross'
        c_ref = ref(inputs, filters,
                    border_mode=border_mode,
                    subsample=subsample,
                    conv_mode=conv_mode)
        c = conv.conv2d(inputs, filters,
                        border_mode=border_mode,
                        subsample=subsample,
                        filter_flip=filter_flip,
                        input_shape=imshp,
                        filter_shape=kshp)
        f_ref = theano.function([], c_ref, mode=mode)
        f = theano.function([], c, mode)
        res_ref = numpy.array(f_ref())
        res = numpy.array(f())
        utt.assert_allclose(res_ref, res)
        if verify_grad:
            # NOTE(review): verify_grad always builds the op with
            # border_mode="valid" and the default filter_flip, regardless
            # of the parameters under test — confirm this is intentional.
            utt.verify_grad(conv.AbstractConv2d(border_mode="valid", imshp=imshp, kshp=kshp,
                                                subsample=subsample),
                            [inputs_val, filters_val],
                            mode=mode)

    def run_gradweight(self, inputs_shape, filters_shape, output_shape,
                       ref=dnn_gradweight, subsample=(1, 1), filter_flip=True,
                       verify_grad=True, mode=mode_without_gpu, border_mode='valid',
                       device='cpu', provide_shape=False):
        """Run the gradient w.r.t. weights and compare against `ref`."""
        inputs_val = numpy.random.random(inputs_shape).astype('float32')
        output_val = numpy.random.random(output_shape).astype('float32')
        if device == 'gpu':
            inputs = gpu_shared(inputs_val)
            output = gpu_shared(output_val)
        else:
            inputs = theano.tensor.as_tensor_variable(cpu_shared(inputs_val))
            output = theano.tensor.as_tensor_variable(cpu_shared(output_val))
        if provide_shape:
            imshp = inputs_shape
            kshp = filters_shape
        else:
            imshp = None
            kshp = None
        if filter_flip:
            conv_mode = 'conv'
        else:
            conv_mode = 'cross'
        c = conv.AbstractConv2d_gradWeights(border_mode=border_mode,
                                            filter_flip=filter_flip,
                                            subsample=subsample,
                                            imshp=imshp, kshp=kshp)
        # The op also takes the spatial size of the filters as input.
        c = c(inputs, output, filters_shape[-2:])
        c_ref = ref(inputs, output,
                    filters_shape,
                    border_mode=border_mode,
                    subsample=subsample,
                    conv_mode=conv_mode)
        f = theano.function([], c, mode)
        f_ref = theano.function([], c_ref, mode)
        res_ref = numpy.array(f_ref())
        res = numpy.array(f())
        utt.assert_allclose(res_ref, res)

        def abstract_conv2d_gradweight(inputs_val, output_val):
            conv_op = conv.AbstractConv2d_gradWeights(border_mode=border_mode, subsample=subsample)
            return conv_op(inputs_val, output_val, filters_shape[-2:])

        if verify_grad:
            utt.verify_grad(abstract_conv2d_gradweight, [inputs_val, output_val],
                            mode=mode, eps=1)

    def run_gradinput(self, inputs_shape, filters_shape, output_shape, ref=dnn_gradinput,
                      subsample=(1, 1), filter_flip=True, verify_grad=True, mode=mode_without_gpu,
                      border_mode='valid', device='cpu', provide_shape=False):
        """Run the gradient w.r.t. inputs and compare against `ref`."""
        output_val = numpy.random.random(output_shape).astype('float32')
        filters_val = numpy.random.random(filters_shape).astype('float32')
        if device == 'gpu':
            output = gpu_shared(output_val)
            filters = gpu_shared(filters_val)
        else:
            output = theano.tensor.as_tensor_variable(cpu_shared(output_val))
            filters = theano.tensor.as_tensor_variable(cpu_shared(filters_val))
        if provide_shape:
            imshp = inputs_shape
            kshp = filters_shape
        else:
            imshp = None
            kshp = None
        if filter_flip:
            conv_mode = 'conv'
        else:
            conv_mode = 'cross'
        c = conv.AbstractConv2d_gradInputs(border_mode=border_mode,
                                           subsample=subsample,
                                           filter_flip=filter_flip,
                                           imshp=imshp, kshp=kshp)
        # The op also takes the spatial size of the inputs as input.
        c = c(filters, output, inputs_shape[-2:])
        c_ref = ref(filters, output, inputs_shape,
                    border_mode=border_mode, subsample=subsample,
                    conv_mode=conv_mode)
        f = theano.function([], c, mode)
        f_ref = theano.function([], c_ref, mode)
        res_ref = numpy.array(f_ref())
        res = numpy.array(f())
        utt.assert_allclose(res_ref, res)

        def abstract_conv2d_gradinputs(filters_val, output_val):
            conv_op = conv.AbstractConv2d_gradInputs(border_mode=border_mode, subsample=subsample)
            return conv_op(filters_val, output_val, inputs_shape[-2:])

        if verify_grad:
            utt.verify_grad(abstract_conv2d_gradinputs, [filters_val, output_val],
                            mode=mode, eps=1)

    def test_dnn_conv(self):
        """Exercise the cuDNN lowering over the parameter grid."""
        if not dnn_available():
            raise SkipTest(cuda.dnn.dnn_available.msg)
        mode = mode_with_gpu
        # provide_shape is not used by the CuDNN implementation
        provide_shape = False
        for (i, f), s, b, flip in itertools.product(
                zip(self.inputs_shapes, self.filters_shapes),
                self.subsamples,
                self.border_modes,
                self.filter_flip):
            o = self.get_output_shape(i, f, s, b)
            self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
                         verify_grad=True, mode=mode, device='gpu',
                         provide_shape=provide_shape, border_mode=b,
                         filter_flip=flip)
            self.run_gradweight(inputs_shape=i, filters_shape=f,
                                output_shape=o, subsample=s,
                                verify_grad=True, mode=mode, device='gpu',
                                provide_shape=provide_shape, border_mode=b,
                                filter_flip=flip)
            self.run_gradinput(inputs_shape=i, filters_shape=f,
                               output_shape=o, subsample=s,
                               verify_grad=True, mode=mode, device='gpu',
                               provide_shape=provide_shape, border_mode=b,
                               filter_flip=flip)

    def test_cormm_conv(self):
        """Exercise the GEMM/corrMM lowering (cuDNN excluded)."""
        # NOTE(review): this corrMM test is skipped when cuDNN is not
        # available, although it excludes 'cudnn' below — the reference
        # (dnn_conv) needs cuDNN; confirm that is the reason.
        if not dnn_available():
            raise SkipTest(cuda.dnn.dnn_available.msg)
        mode = mode_with_gpu.excluding('cudnn')
        for (i, f), s, b, flip, provide_shape in itertools.product(
                zip(self.inputs_shapes, self.filters_shapes),
                self.subsamples,
                self.border_modes,
                self.filter_flip,
                [False, True]):
            o = self.get_output_shape(i, f, s, b)
            self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
                         verify_grad=True, mode=mode, device='gpu',
                         provide_shape=provide_shape, border_mode=b,
                         filter_flip=flip)
            self.run_gradweight(inputs_shape=i, filters_shape=f,
                                output_shape=o, subsample=s,
                                verify_grad=True, mode=mode, device='gpu',
                                provide_shape=provide_shape, border_mode=b,
                                filter_flip=flip)
            self.run_gradinput(inputs_shape=i, filters_shape=f,
                               output_shape=o, subsample=s,
                               verify_grad=True, mode=mode, device='gpu',
                               provide_shape=provide_shape, border_mode=b,
                               filter_flip=flip)

    def test_cpu_conv(self):
        """Exercise the CPU lowering, checking NotImplementedError on
        unsupported parameter combinations."""
        # NOTE(review): requiring dnn_available for a CPU test looks
        # unintended (the reference runs through dnn_* functions) —
        # confirm whether this skip is really needed here.
        if not dnn_available():
            raise SkipTest(cuda.dnn.dnn_available.msg)
        mode = mode_without_gpu
        for (i, f), s, b, flip, provide_shape in itertools.product(
                zip(self.inputs_shapes, self.filters_shapes),
                self.subsamples,
                self.border_modes,
                self.filter_flip,
                [False, True]):
            o = self.get_output_shape(i, f, s, b)
            # Flags tracking which directions the CPU path supports for
            # this parameter combination; unsupported ones must raise.
            fwd_OK = True
            gradweight_OK = True
            gradinput_OK = True
            if not flip:
                fwd_OK = False
                gradweight_OK = False
                gradinput_OK = False
            if b not in ('valid', 'full'):
                fwd_OK = False
                gradweight_OK = False
                gradinput_OK = False
            if (not provide_shape) and (s != (1, 1)) and (b == 'full'):
                gradweight_OK = False
                gradinput_OK = False
            if ((s[0] not in (1, 2)) or (s[1] not in (1, 2))) and (b == 'full'):
                gradweight_OK = False
                gradinput_OK = False
            if fwd_OK:
                self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
                             verify_grad=True, mode=mode, device='cpu',
                             provide_shape=provide_shape, border_mode=b,
                             filter_flip=flip)
            else:
                self.assertRaises(NotImplementedError,
                                  self.run_fwd,
                                  inputs_shape=i,
                                  filters_shape=f,
                                  subsample=s,
                                  verify_grad=False,
                                  mode=mode,
                                  device='cpu',
                                  provide_shape=provide_shape,
                                  border_mode=b,
                                  filter_flip=flip)
            if gradweight_OK:
                self.run_gradweight(inputs_shape=i, filters_shape=f,
                                    output_shape=o, subsample=s,
                                    verify_grad=False, mode=mode, device='cpu',
                                    provide_shape=provide_shape, border_mode=b,
                                    filter_flip=flip)
            else:
                self.assertRaises(NotImplementedError,
                                  self.run_gradweight,
                                  inputs_shape=i,
                                  filters_shape=f,
                                  output_shape=o,
                                  subsample=s,
                                  verify_grad=False,
                                  mode=mode,
                                  device='cpu',
                                  provide_shape=provide_shape,
                                  border_mode=b,
                                  filter_flip=flip)
            if gradinput_OK:
                self.run_gradinput(inputs_shape=i, filters_shape=f,
                                   output_shape=o, subsample=s,
                                   verify_grad=False, mode=mode, device='cpu',
                                   provide_shape=provide_shape, border_mode=b,
                                   filter_flip=flip)
            else:
                self.assertRaises(NotImplementedError,
                                  self.run_gradinput,
                                  inputs_shape=i,
                                  filters_shape=f,
                                  output_shape=o,
                                  subsample=s,
                                  verify_grad=False,
                                  mode=mode,
                                  device='cpu',
                                  provide_shape=provide_shape,
                                  border_mode=b,
                                  filter_flip=flip)
...@@ -158,9 +158,9 @@ class Conv3D(theano.Op): ...@@ -158,9 +158,9 @@ class Conv3D(theano.Op):
vidDur = V_shape[3] vidDur = V_shape[3]
filterDur = W_shape[3] filterDur = W_shape[3]
output_height = T.floor((vidHeight - filterHeight) // dr) + 1 output_height = ((vidHeight - filterHeight) // dr) + 1
output_width = T.floor((vidWidth - filterWidth) // dc) + 1 output_width = ((vidWidth - filterWidth) // dc) + 1
output_dur = T.floor((vidDur - filterDur) // dt) + 1 output_dur = ((vidDur - filterDur) // dt) + 1
rval = (batch_size, output_height, output_width, output_dur, output_channels) rval = (batch_size, output_height, output_width, output_dur, output_channels)
......
"""
Define abstract conv2d interface
"""
import logging
import theano
from theano.tensor import (as_tensor_variable, patternbroadcast)
from theano.tensor import TensorType
from theano.gof import Apply, Op
from theano.gof import local_optimizer
from theano.tensor.opt import register_specialize_device
# Cpu implementation
from theano.tensor.nnet import conv2d as cpu_conv2d, ConvOp
from theano.tensor.nnet.ConvGrad3D import convGrad3D
from theano.tensor.nnet.ConvTransp3D import convTransp3D
__docformat__ = "restructuredtext en"
_logger = logging.getLogger("theano.tensor.nnet.conv2d")
def conv2d(input,
           filters,
           input_shape=None,
           filter_shape=None,
           border_mode='valid',
           subsample=(1, 1),
           filter_flip=True):
    """
    Build the symbolic graph that convolves a mini-batch of 2D input
    maps with a set of 2D filters, as used in convolutional neural
    networks.

    :type input: symbolic 4D tensor
    :param input: mini-batch of feature map stacks, of shape
        (batch size, input channels, input rows, input columns).
        See also ``input_shape``.

    :type filters: symbolic 4D tensor
    :param filters: set of filters, of shape
        (output channels, input channels, filter rows, filter columns).
        See also ``filter_shape``.

    :type input_shape: None, tuple/list of len 4 of int or Constant variable
    :param input_shape: optional shape of ``input``, possibly used to
        select an optimal implementation. Any element may be ``None``
        if unknown at compile time.

    :type filter_shape: None, tuple/list of len 4 of int or Constant variable
    :param filter_shape: optional shape of ``filters``, possibly used to
        select an optimal implementation. Any element may be ``None``
        if unknown at compile time.

    :type border_mode: str, int or tuple of two int
    :param border_mode: one of:

        * ``'valid'``: filter applied wherever it completely overlaps
          the input; output shape: input shape - filter shape + 1.
        * ``'full'``: filter applied wherever it partly overlaps the
          input; output shape: input shape + filter shape - 1.
        * ``'half'``: zero-pad by ``filter rows // 2`` rows and
          ``filter columns // 2`` columns, then do a valid convolution;
          with odd filter sizes the output shape equals the input shape.
        * ``int``: symmetric zero-padding of that width, then a valid
          convolution.
        * ``(int1, int2)``: zero-pad ``int1`` rows and ``int2`` columns,
          then a valid convolution.

    :type subsample: tuple of len 2
    :param subsample: output subsampling factors (strides).

    :type filter_flip: bool
    :param filter_flip: ``True`` (default) flips filter rows and columns
        before sliding them over the input (a convolution); ``False``
        leaves them unflipped (a cross-correlation).

    :rtype: symbolic 4D tensor
    :return: feature maps of shape
        (batch size, output channels, output rows, output columns).
    """
    # Emit the abstract op; a graph optimization later replaces it with
    # a concrete implementation.
    op = AbstractConv2d(imshp=input_shape,
                        kshp=filter_shape,
                        border_mode=border_mode,
                        subsample=subsample,
                        filter_flip=filter_flip)
    return op(input, filters)
class BaseAbstractConv2d(Op):
    """
    Base class for AbstractConv

    Define an abstract convolution op that will be replaced with the
    appropriate implementation by a graph optimization.

    :type imshp: None, tuple/list of len 4 of int or Constant variable
    :param imshp: The shape of the input parameter.
        Optional, possibly used to choose an optimal implementation.
        You can give ``None`` for any element of the list to specify that this
        element is not known at compile time.
        imshp is defined w.r.t the forward conv.

    :type kshp: None, tuple/list of len 4 of int or Constant variable
    :param kshp: The shape of the filters parameter.
        Optional, possibly used to choose an optimal implementation.
        You can give ``None`` for any element of the list to specify that this
        element is not known at compile time.
        kshp is defined w.r.t the forward conv.

    :type border_mode: str, int or tuple of two int
    :param border_mode: Either of the following:

        * ``'valid'``: apply filter wherever it completely overlaps with the
          input. Generates output of shape: input shape - filter shape + 1
        * ``'full'``: apply filter wherever it partly overlaps with the input.
          Generates output of shape: input shape + filter shape - 1
        * ``'half'``: pad input with a symmetric border of ``filter rows // 2``
          rows and ``filter columns // 2`` columns, then perform a valid
          convolution. For filters with an odd number of rows and columns, this
          leads to the output shape being equal to the input shape.
        * ``int``: pad input with a symmetric border of zeros of the given
          width, then perform a valid convolution.
        * ``(int1, int2)``: pad input with a symmetric border of ``int1`` rows
          and ``int2`` columns, then perform a valid convolution.

    :type subsample: tuple of len 2
    :param subsample: factor by which to subsample the output.
        Also called strides elsewhere.

    :type filter_flip: bool
    :param filter_flip: If ``True``, will flip the filter rows and columns
        before sliding them over the input. This operation is normally referred
        to as a convolution, and this is the default. If ``False``, the filters
        are not flipped and the operation is referred to as a cross-correlation.
    """
    check_broadcast = False
    __props__ = ('border_mode', 'subsample', 'filter_flip', 'imshp', 'kshp')

    def __init__(self,
                 imshp=None, kshp=None,
                 border_mode="valid", subsample=(1, 1),
                 filter_flip=True):
        # Normalize integer / pair border modes to a canonical int pair.
        if isinstance(border_mode, int):
            border_mode = (border_mode, border_mode)
        if isinstance(border_mode, tuple):
            pad_h, pad_w = map(int, border_mode)
            border_mode = (pad_h, pad_w)
        if not ((isinstance(border_mode, tuple) and min(border_mode) >= 0) or
                border_mode in ('valid', 'full', 'half')):
            raise ValueError(
                'invalid border_mode {}, which must be either '
                '"valid", "full", "half", an integer or a pair of'
                ' integers'.format(border_mode))

        self.imshp = imshp
        self.kshp = kshp
        self.border_mode = border_mode
        self.filter_flip = filter_flip

        if len(subsample) != 2:
            raise ValueError("subsample must have two elements")
        # BUG FIX: store subsample as a tuple. `subsample` is part of
        # __props__, so storing a caller-supplied list would make the Op
        # unhashable and break op comparison/caching.
        self.subsample = tuple(subsample)

    def flops(self, inp, outp):
        """ Useful with the hack in profilemode to print the MFlops"""
        # if the output shape is correct, then this gives the correct
        # flops for any direction, sampling, padding, and border mode
        inputs, filters = inp
        outputs, = outp
        assert inputs[1] == filters[1]
        # nb mul and add by output pixel
        flops = filters[2] * filters[3] * 2
        # nb flops by output image
        flops *= outputs[2] * outputs[3]
        # nb patch multiplied
        flops *= inputs[1] * filters[0] * inputs[0]
        return flops
class AbstractConv2d(BaseAbstractConv2d):
    """
    Abstract Op for the forward convolution.

    See `BaseAbstractConv2d` for the meaning of the constructor
    parameters. The op itself has no implementation (`perform` raises);
    a graph optimization must replace it with a concrete one.
    """

    def __init__(self,
                 imshp=None,
                 kshp=None,
                 border_mode="valid",
                 subsample=(1, 1),
                 filter_flip=True):
        super(AbstractConv2d, self).__init__(imshp, kshp,
                                             border_mode, subsample, filter_flip)

    def make_node(self, img, kern):
        """Build the Apply node for img (4D) convolved with kern (4D)."""
        if img.type.ndim != 4:
            raise TypeError('img must be 4D tensor')
        if kern.type.ndim != 4:
            raise TypeError('kern must be 4D tensor')

        # Output: (batch, output channels, rows, cols); only the batch
        # and channel dims can keep a broadcastable flag.
        broadcastable = [img.broadcastable[0],
                         kern.broadcastable[0],
                         False, False]
        output = img.type.clone(broadcastable=broadcastable)()
        return Apply(self, [img, kern], [output])

    def perform(self, node, inp, out_):
        # Abstract op: must have been replaced by an optimization.
        raise NotImplementedError('AbstractConv2d theano optimization failed')

    def grad(self, inp, grads):
        """Gradients w.r.t. the image and the weights, as abstract ops."""
        bottom, weights = inp
        top, = grads
        d_bottom = AbstractConv2d_gradInputs(self.imshp, self.kshp,
                                             self.border_mode,
                                             self.subsample,
                                             self.filter_flip)(
            weights, top, bottom.shape[-2:])
        d_weights = AbstractConv2d_gradWeights(self.imshp, self.kshp,
                                               self.border_mode,
                                               self.subsample,
                                               self.filter_flip)(
            bottom, top, weights.shape[-2:])
        return d_bottom, d_weights
class AbstractConv2d_gradWeights(BaseAbstractConv2d):
    """Gradient wrt. filters for `AbstractConv2d`.

    :note: You will not want to use this directly, but rely on
           Theano's automatic differentiation or graph optimization to
           use it as needed.
    """

    def __init__(self,
                 imshp=None,
                 kshp=None,
                 border_mode="valid",
                 subsample=(1, 1),
                 filter_flip=True):
        super(AbstractConv2d_gradWeights, self).__init__(imshp, kshp,
                                                         border_mode, subsample, filter_flip)

    # Update shape/height_width
    def make_node(self, img, topgrad, shape):
        """Build the Apply node; `shape` is the filters' spatial size."""
        if img.type.ndim != 4:
            raise TypeError('img must be 4D tensor')
        if topgrad.type.ndim != 4:
            raise TypeError('topgrad must be 4D tensor')

        shape = as_tensor_variable(shape)
        # Output is the weight gradient:
        # (output channels, input channels, rows, cols).
        broadcastable = [topgrad.broadcastable[1],
                         img.broadcastable[1],
                         False, False]
        output = img.type.clone(broadcastable=broadcastable)()
        return Apply(self, [img, topgrad, shape], [output])

    def perform(self, node, inp, out_):
        # Abstract op: must have been replaced by an optimization.
        # NOTE(review): message says 'gradWeight' (missing trailing 's').
        raise NotImplementedError('AbstractConv2d_gradWeight theano optimization failed')

    def grad(self, inp, grads):
        """Gradients w.r.t. the image and the top gradient; the shape
        input is disconnected."""
        bottom, top = inp[:2]
        weights, = grads
        d_bottom = AbstractConv2d_gradInputs(self.imshp, self.kshp,
                                             self.border_mode,
                                             self.subsample,
                                             self.filter_flip)(weights, top, bottom.shape[-2:])
        d_top = AbstractConv2d(self.imshp,
                               self.kshp,
                               self.border_mode,
                               self.subsample,
                               self.filter_flip)(bottom, weights)
        d_height_width = (theano.gradient.DisconnectedType()(),)
        return (d_bottom, d_top) + d_height_width

    def connection_pattern(self, node):
        return [[1], [1], [0]]  # no connection to height, width
class AbstractConv2d_gradInputs(BaseAbstractConv2d):
    """Gradient wrt. inputs for `AbstractConv2d`.

    :note: You will not want to use this directly, but rely on
           Theano's automatic differentiation or graph optimization to
           use it as needed.
    """

    def __init__(self,
                 imshp=None,
                 kshp=None,
                 border_mode="valid",
                 subsample=(1, 1),
                 filter_flip=True):
        super(AbstractConv2d_gradInputs, self).__init__(imshp, kshp,
                                                        border_mode,
                                                        subsample,
                                                        filter_flip)

    # Update shape/height_width
    def make_node(self, kern, topgrad, shape):
        """Build the Apply node.

        `shape` is the spatial (height, width) of the image to recover;
        it is taken as a symbolic input so it can be provided at runtime.
        """
        if kern.type.ndim != 4:
            raise TypeError('kern must be 4D tensor')
        if topgrad.type.ndim != 4:
            raise TypeError('topgrad must be 4D tensor')

        shape = as_tensor_variable(shape)
        # Output is the image gradient, laid out as
        # (batch, input channels, rows, columns).
        broadcastable = [topgrad.type.broadcastable[0],
                         kern.type.broadcastable[1],
                         False, False]
        output = kern.type.clone(broadcastable=broadcastable)()
        return Apply(self, [kern, topgrad, shape], [output])

    def perform(self, node, inp, out_):
        # Abstract op: a graph optimization must replace it with a
        # concrete implementation before execution.
        # Fixed message: it used to name the wrong class
        # ('AbstractConv2d_gradWeight').
        raise NotImplementedError(
            'AbstractConv2d_gradInputs theano optimization failed')

    def grad(self, inp, grads):
        """Gradients wrt. the filters (weights) and the output grad (top)."""
        weights, top = inp[:2]
        bottom, = grads
        # Bug fix: forward self.filter_flip to both gradient ops.
        # Previously it was omitted, so the default (True) was silently
        # used, yielding wrong gradients when the op was built with
        # filter_flip=False (cross-correlation).
        d_weights = AbstractConv2d_gradWeights(self.imshp, self.kshp,
                                               self.border_mode,
                                               self.subsample,
                                               self.filter_flip)(
                                                   bottom, top,
                                                   weights.shape[-2:])
        d_top = AbstractConv2d(self.imshp, self.kshp,
                               self.border_mode, self.subsample,
                               self.filter_flip)(bottom, weights)
        # The shape input carries no gradient.
        d_height_width = (theano.gradient.DisconnectedType()(),)
        return (d_weights, d_top) + d_height_width

    def connection_pattern(self, node):
        # No gradient connection to the height/width shape input.
        return [[1], [1], [0]]
# CPU optimization
@local_optimizer([AbstractConv2d])
def local_conv2d_cpu(node):
    """Lower `AbstractConv2d` to the legacy CPU `cpu_conv2d` op.

    Returns None (so other optimizers may apply) when the inputs are not
    plain CPU tensors, the border mode is unsupported, or the filters
    are not flipped.
    """
    if not isinstance(node.op, AbstractConv2d):
        return None
    img, kern = node.inputs
    both_cpu_tensors = (isinstance(img.type, TensorType) and
                        isinstance(kern.type, TensorType))
    if not both_cpu_tensors:
        return None
    if node.op.border_mode not in ('full', 'valid'):
        return None
    if not node.op.filter_flip:
        # Unflipped filters (cross-correlation) are not tested with
        # cpu_conv2d yet.
        return None
    return [cpu_conv2d(img, kern,
                       node.op.imshp, node.op.kshp,
                       border_mode=node.op.border_mode,
                       subsample=node.op.subsample)]


register_specialize_device(local_conv2d_cpu, 'fast_compile')
@local_optimizer([AbstractConv2d_gradWeights])
def local_conv2d_gradweight_cpu(node):
    """Lower `AbstractConv2d_gradWeights` to concrete CPU ops.

    Uses a conv3D-based gradient for strided 'valid' convolutions, and
    the legacy `ConvOp` (with direction_hint='bprop weights') otherwise.
    Returns None when the rewrite does not apply so other optimizers may
    try.
    """
    img, topgrad, shape = node.inputs
    # Only applies when both inputs are plain CPU tensors.
    if ((not isinstance(img.type, TensorType) or
         not isinstance(topgrad.type, TensorType))):
        return None
    if node.op.border_mode not in ['full', 'valid']:
        return None
    if not node.op.filter_flip:
        # Not tested yet
        return

    if node.op.border_mode == 'valid' and \
            (node.op.subsample != (1, 1)):
        # Use the gradient as defined in conv3D, because the implementation
        # by Conv is slow (about 3x slower than conv3D, and probably 10x
        # slower than it could be), and incorrect when subsample > 2.
        # build a "node", that should be equivalent to the one given by
        # self.make_node, but using convGrad3D instead.
        # Insert a dummy depth axis ('x') to reuse the 3D gradient op.
        shuffled_img = img.dimshuffle(0, 2, 3, 'x', 1)
        shuffled_topgrad = topgrad.dimshuffle(0, 2, 3, 'x', 1)
        rval = convGrad3D(V=shuffled_img,
                          d=(node.op.subsample[0], node.op.subsample[1], 1),
                          WShape=(shuffled_topgrad.shape[4],
                                  shape[0], shape[1], 1,
                                  shuffled_img.shape[4]),
                          dCdH=shuffled_topgrad)
        # Drop the dummy depth axis and restore the bc01 layout.
        rval = theano.tensor.addbroadcast(rval, 3)
        rval = rval.dimshuffle(0, 4, 1, 2)
        # Flip kernels back (convolution vs. cross-correlation).
        rval = rval[:, :, ::-1, ::-1]
        rval = patternbroadcast(rval, node.outputs[0].broadcastable)
        return [rval]

    dx, dy = node.op.subsample
    if dx not in (1, 2) or dy not in (1, 2):
        # Not implemented in the gradient of ConvOp
        return None

    # Fall back to 4-tuples of None when shapes were not provided.
    if node.op.imshp is None:
        op_imshp = (None, None, None, None)
    else:
        op_imshp = node.op.imshp

    if node.op.kshp is None:
        op_kshp = (None, None, None, None)
    else:
        op_kshp = node.op.kshp

    if None in op_imshp or None in op_kshp:
        if (dx, dy) != (1, 1):
            # We cannot infer the shapes
            return None

    # Determine gradient on kernels
    assert len(op_imshp) == 4 and len(op_kshp) == 4

    outshp = ConvOp.getOutputShape(op_imshp[2:],
                                   op_kshp[2:], node.op.subsample,
                                   node.op.border_mode)
    fulloutshp = ConvOp.getOutputShape(op_imshp[2:],
                                       op_kshp[2:], (1, 1),
                                       node.op.border_mode)
    # The weight gradient is itself a convolution with the batch and
    # channel roles exchanged, hence the axis swap.
    newimg = img.dimshuffle((1, 0, 2, 3))
    newtopgrad = topgrad.dimshuffle((1, 0, 2, 3))

    if node.op.border_mode == 'valid':
        (img, filters) = (newimg, newtopgrad)
        kshp_logical = fulloutshp
        kshp_logical_top_aligned = False
        imshp_logical = None
        (bsize, nkern) = (op_imshp[1], op_kshp[0])
        imshp = (op_imshp[0], op_imshp[2], op_imshp[3])
        kshp = outshp
    elif node.op.border_mode == 'full':
        # In 'full' mode the roles of image and filters are exchanged
        # relative to 'valid'.
        (img, filters) = (newtopgrad, newimg)
        kshp_logical = None
        kshp_logical_top_aligned = True
        imshp_logical = (op_imshp[0],
                         fulloutshp[0],
                         fulloutshp[1])
        (bsize, nkern) = (op_kshp[0], op_imshp[1])
        imshp = (op_imshp[0], outshp[0], outshp[1])
        kshp = op_imshp[2:]
    else:
        raise NotImplementedError(
            'Only [full,valid] modes are currently supported.')

    # Flip the kernels
    filters = filters[:, :, ::-1, ::-1]

    dw = ConvOp(imshp, kshp, nkern, bsize, 1, 1, output_mode='valid',
                unroll_batch=None, unroll_kern=None, unroll_patch=None,
                imshp_logical=imshp_logical,
                kshp_logical=kshp_logical,
                kshp_logical_top_aligned=kshp_logical_top_aligned,
                direction_hint='bprop weights')
    res = dw(img, filters)

    if node.op.border_mode == 'valid':
        # Undo the axis swap and the kernel flip applied above.
        res = res.dimshuffle((1, 0, 2, 3))
        res = res[:, :, ::-1, ::-1]

    res = patternbroadcast(res, node.outputs[0].broadcastable)
    return [res]


register_specialize_device(local_conv2d_gradweight_cpu, 'fast_compile')
@local_optimizer([AbstractConv2d_gradInputs])
def local_conv2d_gradinputs_cpu(node):
    """Lower `AbstractConv2d_gradInputs` to concrete CPU ops.

    Uses a convTransp3D-based path for strided 'valid' convolutions, and
    the legacy `ConvOp` (with direction_hint='bprop inputs') otherwise.
    Returns None when the rewrite does not apply so other optimizers may
    try.
    """
    kern, topgrad, shape = node.inputs
    # Only applies when both inputs are plain CPU tensors.
    if ((not isinstance(kern.type, TensorType) or
         not isinstance(topgrad.type, TensorType))):
        return None
    if node.op.border_mode not in ['full', 'valid']:
        return None
    if not node.op.filter_flip:
        # Not tested yet
        return None

    # Conv 3d implementation, needed when subsample > 2
    if node.op.border_mode == 'valid' and node.op.subsample != (1, 1):
        # Flip the kernels (convolution vs. cross-correlation) and insert
        # a dummy depth axis ('x') to reuse the transposed 3D conv op.
        kern = kern[:, :, ::-1, ::-1]
        shuffled_kern = kern.dimshuffle(0, 2, 3, 'x', 1)
        shuffled_topgrad = topgrad.dimshuffle(0, 2, 3, 'x', 1)
        # convTransp3D requires a bias; use zeros of the right length.
        b = theano.tensor.zeros_like(shuffled_kern[0, 0, 0, 0, :])
        rval = convTransp3D(W=shuffled_kern, b=b,
                            d=(node.op.subsample[0], node.op.subsample[1], 1),
                            H=shuffled_topgrad,
                            RShape=(shape[0], shape[1], 1))
        # Drop the dummy depth axis and restore the bc01 layout.
        rval = theano.tensor.addbroadcast(rval, 3)
        rval = rval.dimshuffle(0, 4, 1, 2)
        rval = patternbroadcast(rval, node.outputs[0].broadcastable)
        return [rval]

    # Conv2d Implementation
    dx, dy = node.op.subsample
    if dx not in (1, 2) or dy not in (1, 2):
        # Not implemented in the gradient of ConvOp
        return None

    # Fall back to 4-tuples of None when shapes were not provided.
    if node.op.imshp is None:
        op_imshp = (None, None, None, None)
    else:
        op_imshp = node.op.imshp
    if node.op.kshp is None:
        op_kshp = (None, None, None, None)
    else:
        op_kshp = node.op.kshp
    if None in op_imshp or None in op_kshp:
        if (dx, dy) != (1, 1):
            # We cannot infer the shapes.
            return None

    # The transpose of a 'valid' convolution is a 'full' one and vice
    # versa, hence the mode swap below.
    mode = 'valid'
    if not node.op.border_mode == 'full':
        mode = 'full'
    # Swap the channel axes and flip the kernels for the backward pass.
    filters = kern.dimshuffle((1, 0, 2, 3))
    filters = filters[:, :, ::-1, ::-1]

    outshp = ConvOp.getOutputShape(op_imshp[2:],
                                   op_kshp[2:], node.op.subsample,
                                   node.op.border_mode)
    fulloutshp = ConvOp.getOutputShape(op_imshp[2:],
                                       op_kshp[2:], (1, 1),
                                       node.op.border_mode)
    nkern = op_imshp[1]
    imshp = (op_kshp[0], outshp[0], outshp[1])
    imshp_logical = (op_kshp[0], fulloutshp[0], fulloutshp[1])
    din = ConvOp(imshp,
                 op_kshp[2:],
                 nkern,
                 op_imshp[0],
                 1, 1, output_mode=mode,
                 unroll_batch=None, unroll_kern=None,
                 unroll_patch=None,
                 imshp_logical=imshp_logical,
                 kshp_logical=None,
                 version=-1,
                 direction_hint='bprop inputs')
    din = din(topgrad, filters)
    din = patternbroadcast(din, node.outputs[0].broadcastable)
    return [din]


register_specialize_device(local_conv2d_gradinputs_cpu, 'fast_compile')
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论