Commit 850e8902 authored by JesseLivezey

CorrMM optimizations for abstractconv2d

Parent 62ccf59f
@@ -124,9 +124,9 @@ TODO: Give examples on how to use these things! They are pretty complicated.
 This is a CPU-only 2d correlation implementation taken from
 `caffe <https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cpp>`_
 and also used by Torch. It does not flip the kernel. As it provides a gradient,
-you can use it as a replacement for nnet.conv2d. There is currently no
-optimization to move this to GPU. This will be added when the new convolution
-interface is finished.
+you can use it as a replacement for nnet.conv2d. For convolutions done on
+CPU, nnet.conv2d will be replaced by CorrMM. To explicitly disable it, set
+``THEANO_FLAGS=optimizer_excluding=conv_gemm`` in your environment.
 - :func:`dnn_conv <theano.sandbox.cuda.dnn.dnn_conv>` GPU-only
   convolution using NVIDIA's cuDNN library. This requires that you have
   cuDNN installed and available, which in turn requires CUDA 6.5 and a GPU
...
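For reference, a minimal sketch of how the new CPU behaviour documented above can be exercised; this uses the standard Theano API and is not part of this commit. The `excluding` call on a compile mode is the per-function equivalent of the `THEANO_FLAGS` setting:

    import theano
    import theano.tensor as T
    from theano.tensor.nnet import conv2d

    images = T.tensor4('images')
    filters = T.tensor4('filters')
    out = conv2d(images, filters)

    # Default compile mode: on CPU, the conv_gemm optimizer may rewrite
    # this graph to use CorrMM.
    f_corrmm = theano.function([images, filters], out)

    # Per-function opt-out, equivalent to setting
    # THEANO_FLAGS=optimizer_excluding=conv_gemm in the environment.
    mode = theano.compile.get_default_mode().excluding('conv_gemm')
    f_legacy = theano.function([images, filters], out, mode=mode)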
@@ -212,7 +212,7 @@ class TestConv2d(unittest.TestCase):
                          provide_shape=provide_shape, border_mode=b,
                          filter_flip=flip)
 
-    def test_cormm_conv(self):
+    def test_gpucormm_conv(self):
         if not dnn_available():
             raise SkipTest(cuda.dnn.dnn_available.msg)
@@ -240,11 +240,39 @@ class TestConv2d(unittest.TestCase):
                                provide_shape=provide_shape, border_mode=b,
                                filter_flip=flip)
 
-    def test_cpu_conv(self):
+    def test_cormm_conv(self):
         if not dnn_available():
             raise SkipTest(cuda.dnn.dnn_available.msg)
         mode = mode_without_gpu
+        for (i, f), s, b, flip, provide_shape in itertools.product(
+                zip(self.inputs_shapes, self.filters_shapes),
+                self.subsamples,
+                self.border_modes,
+                self.filter_flip,
+                [False, True]):
+            o = self.get_output_shape(i, f, s, b)
+            self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
+                         verify_grad=True, mode=mode, device='cpu',
+                         provide_shape=provide_shape, border_mode=b,
+                         filter_flip=flip)
+            self.run_gradweight(inputs_shape=i, filters_shape=f,
+                                output_shape=o, subsample=s,
+                                verify_grad=True, mode=mode, device='cpu',
+                                provide_shape=provide_shape, border_mode=b,
+                                filter_flip=flip)
+            self.run_gradinput(inputs_shape=i, filters_shape=f,
+                               output_shape=o, subsample=s,
+                               verify_grad=True, mode=mode, device='cpu',
+                               provide_shape=provide_shape, border_mode=b,
+                               filter_flip=flip)
+
+    def test_cpu_conv(self):
+        if not dnn_available():
+            raise SkipTest(cuda.dnn.dnn_available.msg)
+        mode = mode_without_gpu.excluding('conv_gemm')
         for (i, f), s, b, flip, provide_shape in itertools.product(
                 zip(self.inputs_shapes, self.filters_shapes),
                 self.subsamples,
...
@@ -4,16 +4,9 @@ Define abstract conv2d interface
 import logging
 
 import theano
-from theano.tensor import (as_tensor_variable, patternbroadcast)
-from theano.tensor import TensorType
+from theano.tensor import as_tensor_variable
 from theano.gof import Apply, Op
-from theano.gof import local_optimizer
-from theano.tensor.opt import register_specialize_device
-
-# Cpu implementation
-from theano.tensor.nnet import conv2d as cpu_conv2d, ConvOp
-from theano.tensor.nnet.ConvGrad3D import convGrad3D
-from theano.tensor.nnet.ConvTransp3D import convTransp3D
 
 __docformat__ = "restructuredtext en"
@@ -326,218 +319,3 @@ class AbstractConv2d_gradInputs(BaseAbstractConv2d):
     def connection_pattern(self, node):
         return [[1], [1], [0]]  # no connection to height, width
-
-
-# Cpu Optimization
-@local_optimizer([AbstractConv2d])
-def local_conv2d_cpu(node):
-    if not isinstance(node.op, AbstractConv2d):
-        return None
-
-    img, kern = node.inputs
-    if ((not isinstance(img.type, TensorType) or
-         not isinstance(kern.type, TensorType))):
-        return None
-    if node.op.border_mode not in ['full', 'valid']:
-        return None
-    if not node.op.filter_flip:
-        # Not tested yet
-        return None
-
-    rval = cpu_conv2d(img, kern,
-                      node.op.imshp, node.op.kshp,
-                      border_mode=node.op.border_mode,
-                      subsample=node.op.subsample)
-    return [rval]
-register_specialize_device(local_conv2d_cpu, 'fast_compile')
-
-
-@local_optimizer([AbstractConv2d_gradWeights])
-def local_conv2d_gradweight_cpu(node):
-    img, topgrad, shape = node.inputs
-    if ((not isinstance(img.type, TensorType) or
-         not isinstance(topgrad.type, TensorType))):
-        return None
-    if node.op.border_mode not in ['full', 'valid']:
-        return None
-    if not node.op.filter_flip:
-        # Not tested yet
-        return
-
-    if node.op.border_mode == 'valid' and \
-            (node.op.subsample != (1, 1)):
-        # Use the gradient as defined in conv3D, because the implementation
-        # by Conv is slow (about 3x slower than conv3D, and probably 10x
-        # slower than it could be), and incorrect when subsample > 2.
-        # build a "node", that should be equivalent to the one given by
-        # self.make_node, but using convGrad3D instead.
-        shuffled_img = img.dimshuffle(0, 2, 3, 'x', 1)
-        shuffled_topgrad = topgrad.dimshuffle(0, 2, 3, 'x', 1)
-        rval = convGrad3D(V=shuffled_img,
-                          d=(node.op.subsample[0], node.op.subsample[1], 1),
-                          WShape=(shuffled_topgrad.shape[4],
-                                  shape[0], shape[1], 1,
-                                  shuffled_img.shape[4]),
-                          dCdH=shuffled_topgrad)
-        rval = theano.tensor.addbroadcast(rval, 3)
-        rval = rval.dimshuffle(0, 4, 1, 2)
-        rval = rval[:, :, ::-1, ::-1]
-        rval = patternbroadcast(rval, node.outputs[0].broadcastable)
-        return [rval]
-
-    dx, dy = node.op.subsample
-    if dx not in (1, 2) or dy not in (1, 2):
-        # Not implemented in the gradient of ConvOp
-        return None
-
-    if node.op.imshp is None:
-        op_imshp = (None, None, None, None)
-    else:
-        op_imshp = node.op.imshp
-
-    if node.op.kshp is None:
-        op_kshp = (None, None, None, None)
-    else:
-        op_kshp = node.op.kshp
-
-    if None in op_imshp or None in op_kshp:
-        if (dx, dy) != (1, 1):
-            # We cannot infer the shapes
-            return None
-
-    # Determine gradient on kernels
-    assert len(op_imshp) == 4 and len(op_kshp) == 4
-    outshp = ConvOp.getOutputShape(op_imshp[2:],
-                                   op_kshp[2:], node.op.subsample,
-                                   node.op.border_mode)
-    fulloutshp = ConvOp.getOutputShape(op_imshp[2:],
-                                       op_kshp[2:], (1, 1),
-                                       node.op.border_mode)
-    newimg = img.dimshuffle((1, 0, 2, 3))
-    newtopgrad = topgrad.dimshuffle((1, 0, 2, 3))
-
-    if node.op.border_mode == 'valid':
-        (img, filters) = (newimg, newtopgrad)
-        kshp_logical = fulloutshp
-        kshp_logical_top_aligned = False
-        imshp_logical = None
-        (bsize, nkern) = (op_imshp[1], op_kshp[0])
-        imshp = (op_imshp[0], op_imshp[2], op_imshp[3])
-        kshp = outshp
-    elif node.op.border_mode == 'full':
-        (img, filters) = (newtopgrad, newimg)
-        kshp_logical = None
-        kshp_logical_top_aligned = True
-        imshp_logical = (op_imshp[0],
-                         fulloutshp[0],
-                         fulloutshp[1])
-        (bsize, nkern) = (op_kshp[0], op_imshp[1])
-        imshp = (op_imshp[0], outshp[0], outshp[1])
-        kshp = op_imshp[2:]
-    else:
-        raise NotImplementedError(
-            'Only [full,valid] modes are currently supported.')
-
-    # Flip the kernels
-    filters = filters[:, :, ::-1, ::-1]
-
-    dw = ConvOp(imshp, kshp, nkern, bsize, 1, 1, output_mode='valid',
-                unroll_batch=None, unroll_kern=None, unroll_patch=None,
-                imshp_logical=imshp_logical,
-                kshp_logical=kshp_logical,
-                kshp_logical_top_aligned=kshp_logical_top_aligned,
-                direction_hint='bprop weights')
-    res = dw(img, filters)
-    if node.op.border_mode == 'valid':
-        res = res.dimshuffle((1, 0, 2, 3))
-        res = res[:, :, ::-1, ::-1]
-    res = patternbroadcast(res, node.outputs[0].broadcastable)
-    return [res]
-register_specialize_device(local_conv2d_gradweight_cpu, 'fast_compile')
-
-
-@local_optimizer([AbstractConv2d_gradInputs])
-def local_conv2d_gradinputs_cpu(node):
-    kern, topgrad, shape = node.inputs
-    if ((not isinstance(kern.type, TensorType) or
-         not isinstance(topgrad.type, TensorType))):
-        return None
-    if node.op.border_mode not in ['full', 'valid']:
-        return None
-    if not node.op.filter_flip:
-        # Not tested yet
-        return None
-
-    # Conv 3d implementation, needed when subsample > 2
-    if node.op.border_mode == 'valid' and node.op.subsample != (1, 1):
-        kern = kern[:, :, ::-1, ::-1]
-        shuffled_kern = kern.dimshuffle(0, 2, 3, 'x', 1)
-        shuffled_topgrad = topgrad.dimshuffle(0, 2, 3, 'x', 1)
-        b = theano.tensor.zeros_like(shuffled_kern[0, 0, 0, 0, :])
-        rval = convTransp3D(W=shuffled_kern, b=b,
-                            d=(node.op.subsample[0], node.op.subsample[1], 1),
-                            H=shuffled_topgrad,
-                            RShape=(shape[0], shape[1], 1))
-        rval = theano.tensor.addbroadcast(rval, 3)
-        rval = rval.dimshuffle(0, 4, 1, 2)
-        rval = patternbroadcast(rval, node.outputs[0].broadcastable)
-        return [rval]
-
-    # Conv2d Implementation
-    dx, dy = node.op.subsample
-    if dx not in (1, 2) or dy not in (1, 2):
-        # Not implemented in the gradient of ConvOp
-        return None
-
-    if node.op.imshp is None:
-        op_imshp = (None, None, None, None)
-    else:
-        op_imshp = node.op.imshp
-
-    if node.op.kshp is None:
-        op_kshp = (None, None, None, None)
-    else:
-        op_kshp = node.op.kshp
-
-    if None in op_imshp or None in op_kshp:
-        if (dx, dy) != (1, 1):
-            return None
-
-    mode = 'valid'
-    if not node.op.border_mode == 'full':
-        mode = 'full'
-    filters = kern.dimshuffle((1, 0, 2, 3))
-    filters = filters[:, :, ::-1, ::-1]
-
-    outshp = ConvOp.getOutputShape(op_imshp[2:],
-                                   op_kshp[2:], node.op.subsample,
-                                   node.op.border_mode)
-    fulloutshp = ConvOp.getOutputShape(op_imshp[2:],
-                                       op_kshp[2:], (1, 1),
-                                       node.op.border_mode)
-    nkern = op_imshp[1]
-    imshp = (op_kshp[0], outshp[0], outshp[1])
-    imshp_logical = (op_kshp[0], fulloutshp[0], fulloutshp[1])
-    din = ConvOp(imshp,
-                 op_kshp[2:],
-                 nkern,
-                 op_imshp[0],
-                 1, 1, output_mode=mode,
-                 unroll_batch=None, unroll_kern=None,
-                 unroll_patch=None,
-                 imshp_logical=imshp_logical,
-                 kshp_logical=None,
-                 version=-1,
-                 direction_hint='bprop inputs')
-    din = din(topgrad, filters)
-    din = patternbroadcast(din, node.outputs[0].broadcastable)
-    return [din]
-register_specialize_device(local_conv2d_gradinputs_cpu, 'fast_compile')
[diff collapsed]