提交 dfb27303 authored 作者: Pascal Lamblin's avatar Pascal Lamblin

Merge pull request #3618 from JesseLivezey/cormm_opt

CorrMM optimizations
......@@ -124,9 +124,9 @@ TODO: Give examples on how to use these things! They are pretty complicated.
This is a CPU-only 2d correlation implementation taken from
`caffe <https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cpp>`_
and also used by Torch. It does not flip the kernel. As it provides a gradient,
you can use it as a replacement for nnet.conv2d. There is currently no
optimization to move this to GPU. This will be added when the new convolution
interface is finished.
you can use it as a replacement for nnet.conv2d. For convolutions done on
CPU, nnet.conv2d will be replaced by CorrMM. To explicitly disable it, set
``THEANO_FLAGS=optimizer_excluding=conv_gemm`` in your environment.
- :func:`dnn_conv <theano.sandbox.cuda.dnn.dnn_conv>` GPU-only
convolution using NVIDIA's cuDNN library. This requires that you have
cuDNN installed and available, which in turn requires CUDA 6.5 and a GPU
......
......@@ -212,7 +212,7 @@ class TestConv2d(unittest.TestCase):
provide_shape=provide_shape, border_mode=b,
filter_flip=flip)
def test_cormm_conv(self):
def test_gpucormm_conv(self):
if not dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg)
......@@ -240,11 +240,39 @@ class TestConv2d(unittest.TestCase):
provide_shape=provide_shape, border_mode=b,
filter_flip=flip)
def test_cpu_conv(self):
def test_cormm_conv(self):
if not dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg)
mode = mode_without_gpu
for (i, f), s, b, flip, provide_shape in itertools.product(
zip(self.inputs_shapes, self.filters_shapes),
self.subsamples,
self.border_modes,
self.filter_flip,
[False, True]):
o = self.get_output_shape(i, f, s, b)
self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
verify_grad=True, mode=mode, device='cpu',
provide_shape=provide_shape, border_mode=b,
filter_flip=flip)
self.run_gradweight(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode, device='cpu',
provide_shape=provide_shape, border_mode=b,
filter_flip=flip)
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=True, mode=mode, device='cpu',
provide_shape=provide_shape, border_mode=b,
filter_flip=flip)
def test_cpu_conv(self):
if not dnn_available():
raise SkipTest(cuda.dnn.dnn_available.msg)
mode = mode_without_gpu.excluding('conv_gemm')
for (i, f), s, b, flip, provide_shape in itertools.product(
zip(self.inputs_shapes, self.filters_shapes),
self.subsamples,
......
......@@ -4,16 +4,9 @@ Define abstract conv2d interface
import logging
import theano
from theano.tensor import (as_tensor_variable, patternbroadcast)
from theano.tensor import TensorType
from theano.tensor import as_tensor_variable
from theano.gof import Apply, Op
from theano.gof import local_optimizer
from theano.tensor.opt import register_specialize_device
# Cpu implementation
from theano.tensor.nnet import conv2d as cpu_conv2d, ConvOp
from theano.tensor.nnet.ConvGrad3D import convGrad3D
from theano.tensor.nnet.ConvTransp3D import convTransp3D
__docformat__ = "restructuredtext en"
......@@ -326,218 +319,3 @@ class AbstractConv2d_gradInputs(BaseAbstractConv2d):
def connection_pattern(self, node):
return [[1], [1], [0]] # no connection to height, width
# Cpu Optmization
@local_optimizer([AbstractConv2d])
def local_conv2d_cpu(node):
if not isinstance(node.op, AbstractConv2d):
return None
img, kern = node.inputs
if ((not isinstance(img.type, TensorType) or
not isinstance(kern.type, TensorType))):
return None
if node.op.border_mode not in ['full', 'valid']:
return None
if not node.op.filter_flip:
# Not tested yet
return None
rval = cpu_conv2d(img, kern,
node.op.imshp, node.op.kshp,
border_mode=node.op.border_mode,
subsample=node.op.subsample)
return [rval]
register_specialize_device(local_conv2d_cpu, 'fast_compile')
@local_optimizer([AbstractConv2d_gradWeights])
def local_conv2d_gradweight_cpu(node):
img, topgrad, shape = node.inputs
if ((not isinstance(img.type, TensorType) or
not isinstance(topgrad.type, TensorType))):
return None
if node.op.border_mode not in ['full', 'valid']:
return None
if not node.op.filter_flip:
# Not tested yet
return
if node.op.border_mode == 'valid' and \
(node.op.subsample != (1, 1)):
# Use the gradient as defined in conv3D, because the implementation
# by Conv is slow (about 3x slower than conv3D, and probably 10x
# slower than it could be), nad incorrect when subsample > 2.
# build a "node", that should be equivalent to the one given by
# self.make_node, but using convGrad3D instead.
shuffled_img = img.dimshuffle(0, 2, 3, 'x', 1)
shuffled_topgrad = topgrad.dimshuffle(0, 2, 3, 'x', 1)
rval = convGrad3D(V=shuffled_img,
d=(node.op.subsample[0], node.op.subsample[1], 1),
WShape=(shuffled_topgrad.shape[4],
shape[0], shape[1], 1,
shuffled_img.shape[4]),
dCdH=shuffled_topgrad)
rval = theano.tensor.addbroadcast(rval, 3)
rval = rval.dimshuffle(0, 4, 1, 2)
rval = rval[:, :, ::-1, ::-1]
rval = patternbroadcast(rval, node.outputs[0].broadcastable)
return [rval]
dx, dy = node.op.subsample
if dx not in (1, 2) or dy not in (1, 2):
# Not implemented in the gradient of ConvOp
return None
if node.op.imshp is None:
op_imshp = (None, None, None, None)
else:
op_imshp = node.op.imshp
if node.op.kshp is None:
op_kshp = (None, None, None, None)
else:
op_kshp = node.op.kshp
if None in op_imshp or None in op_kshp:
if (dx, dy) != (1, 1):
# We cannot infer the shapes
return None
# Determine gradient on kernels
assert len(op_imshp) == 4 and len(op_kshp) == 4
outshp = ConvOp.getOutputShape(op_imshp[2:],
op_kshp[2:], node.op.subsample,
node.op.border_mode)
fulloutshp = ConvOp.getOutputShape(op_imshp[2:],
op_kshp[2:], (1, 1),
node.op.border_mode)
newimg = img.dimshuffle((1, 0, 2, 3))
newtopgrad = topgrad.dimshuffle((1, 0, 2, 3))
if node.op.border_mode == 'valid':
(img, filters) = (newimg, newtopgrad)
kshp_logical = fulloutshp
kshp_logical_top_aligned = False
imshp_logical = None
(bsize, nkern) = (op_imshp[1], op_kshp[0])
imshp = (op_imshp[0], op_imshp[2], op_imshp[3])
kshp = outshp
elif node.op.border_mode == 'full':
(img, filters) = (newtopgrad, newimg)
kshp_logical = None
kshp_logical_top_aligned = True
imshp_logical = (op_imshp[0],
fulloutshp[0],
fulloutshp[1])
(bsize, nkern) = (op_kshp[0], op_imshp[1])
imshp = (op_imshp[0], outshp[0], outshp[1])
kshp = op_imshp[2:]
else:
raise NotImplementedError(
'Only [full,valid] modes are currently supported.')
# Flip the kernels
filters = filters[:, :, ::-1, ::-1]
dw = ConvOp(imshp, kshp, nkern, bsize, 1, 1, output_mode='valid',
unroll_batch=None, unroll_kern=None, unroll_patch=None,
imshp_logical=imshp_logical,
kshp_logical=kshp_logical,
kshp_logical_top_aligned=kshp_logical_top_aligned,
direction_hint='bprop weights')
res = dw(img, filters)
if node.op.border_mode == 'valid':
res = res.dimshuffle((1, 0, 2, 3))
res = res[:, :, ::-1, ::-1]
res = patternbroadcast(res, node.outputs[0].broadcastable)
return [res]
register_specialize_device(local_conv2d_gradweight_cpu, 'fast_compile')
@local_optimizer([AbstractConv2d_gradInputs])
def local_conv2d_gradinputs_cpu(node):
kern, topgrad, shape = node.inputs
if ((not isinstance(kern.type, TensorType) or
not isinstance(topgrad.type, TensorType))):
return None
if node.op.border_mode not in ['full', 'valid']:
return None
if not node.op.filter_flip:
# Not tested yet
return None
# Conv 3d implementation, needed when subsample > 2
if node.op.border_mode == 'valid' and node.op.subsample != (1, 1):
kern = kern[:, :, ::-1, ::-1]
shuffled_kern = kern.dimshuffle(0, 2, 3, 'x', 1)
shuffled_topgrad = topgrad.dimshuffle(0, 2, 3, 'x', 1)
b = theano.tensor.zeros_like(shuffled_kern[0, 0, 0, 0, :])
rval = convTransp3D(W=shuffled_kern, b=b,
d=(node.op.subsample[0], node.op.subsample[1], 1),
H=shuffled_topgrad,
RShape=(shape[0], shape[1], 1))
rval = theano.tensor.addbroadcast(rval, 3)
rval = rval.dimshuffle(0, 4, 1, 2)
rval = patternbroadcast(rval, node.outputs[0].broadcastable)
return [rval]
# Conv2d Implementation
dx, dy = node.op.subsample
if dx not in (1, 2) or dy not in (1, 2):
# Not implemented in the gradient of ConvOp
return None
if node.op.imshp is None:
op_imshp = (None, None, None, None)
else:
op_imshp = node.op.imshp
if node.op.kshp is None:
op_kshp = (None, None, None, None)
else:
op_kshp = node.op.kshp
if None in op_imshp or None in op_kshp:
if (dx, dy) != (1, 1):
return None
mode = 'valid'
if not node.op.border_mode == 'full':
mode = 'full'
filters = kern.dimshuffle((1, 0, 2, 3))
filters = filters[:, :, ::-1, ::-1]
outshp = ConvOp.getOutputShape(op_imshp[2:],
op_kshp[2:], node.op.subsample,
node.op.border_mode)
fulloutshp = ConvOp.getOutputShape(op_imshp[2:],
op_kshp[2:], (1, 1),
node.op.border_mode)
nkern = op_imshp[1]
imshp = (op_kshp[0], outshp[0], outshp[1])
imshp_logical = (op_kshp[0], fulloutshp[0], fulloutshp[1])
din = ConvOp(imshp,
op_kshp[2:],
nkern,
op_imshp[0],
1, 1, output_mode=mode,
unroll_batch=None, unroll_kern=None,
unroll_patch=None,
imshp_logical=imshp_logical,
kshp_logical=None,
version=-1,
direction_hint='bprop inputs')
din = din(topgrad, filters)
din = patternbroadcast(din, node.outputs[0].broadcastable)
return [din]
register_specialize_device(local_conv2d_gradinputs_cpu, 'fast_compile')
差异被折叠。
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论