提交 36694a6d authored 作者: Pascal Lamblin's avatar Pascal Lamblin

Merge pull request #1454 from nouiz/conv3d2d

[MRG]Conv3d2d
......@@ -28,6 +28,7 @@ env:
- PART="-e test_basic.py theano/tensor/tests"
script:
- "if [ `expr \"$PART\" : '.*sparse'` -gt \"0\" ]; then pip install scipy==0.8 --use-mirrors; fi"
- "if [ `expr \"$PART\" : '.*nnet'` -gt \"0\" ]; then pip install scipy==0.8 --use-mirrors; fi"
- export THEANO_FLAGS=warn.ignore_bug_before=all,on_opt_error=raise,on_shape_error=raise
- python --version
- uname -a
......
......@@ -16,9 +16,6 @@
present in convolutional neural networks (where filters are 3D and pool
over several input channels).
The project `TheanoConv3d2d <https://github.com/jaberg/TheanoConv3d2d>`_
is probably faster then the Conv3d documented here.
.. module:: conv
:platform: Unix, Windows
:synopsis: ops for signal processing
......@@ -31,6 +28,21 @@ TODO: Give examples for how to use these things! They are pretty complicated.
- :func:`signal.conv2d <theano.tensor.signal.conv.conv2d>`.
- :func:`nnet.conv2d <theano.tensor.nnet.conv.conv2d>`.
- :func:`conv3D <theano.tensor.nnet.Conv3D.conv3D>`.
- :func:`conv3d2d <theano.tensor.nnet.conv3d2d.conv3d>`
Another conv3d implementation that use the conv2d with data reshaping.
It is faster in some case then conv3d, specificaly on the GPU.
- `Faster conv2d <http://deeplearning.net/software/pylearn2/library/alex.html>`_
This is in Pylearn2, not very documented and use a different
memory layout for the input. It is important to have the input
in the native memory layout, and not use dimshuffle on the
inputs, otherwise you loose much of the speed up. So this is not
a drop in replacement of conv2d.
Normally those are called from the `linear transfrom
<http://deeplearning.net/software/pylearn2/library/linear.html>`_
implementation.
.. autofunction:: theano.tensor.nnet.conv.conv2d
.. autofunction:: theano.tensor.nnet.Conv3D.conv3D
.. autofunction:: theano.tensor.nnet.conv3d2d.conv3d
......@@ -12,11 +12,12 @@ import theano.tensor as T
# Skip test if cuda_ndarray is not available.
import theano.sandbox.cuda as cuda
from theano.tensor.nnet.tests import test_conv3d2d
if cuda.cuda_available == False:
raise SkipTest('Optional package cuda disabled')
if theano.config.mode=='FAST_COMPILE':
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
else:
......@@ -26,26 +27,28 @@ else:
def test_shape_i():
x = cuda.ftensor3()
v = cuda.CudaNdarray(numpy.zeros((3,4,5),dtype='float32'))
f = theano.function([x],x.shape[1])
v = cuda.CudaNdarray(numpy.zeros((3, 4, 5), dtype='float32'))
f = theano.function([x], x.shape[1])
topo = f.maker.fgraph.toposort()
assert f(v)==4
if theano.config.mode!='FAST_COMPILE':
assert len(topo)==1
assert isinstance(topo[0].op,T.opt.Shape_i)
assert f(v) == 4
if theano.config.mode != 'FAST_COMPILE':
assert len(topo) == 1
assert isinstance(topo[0].op, T.opt.Shape_i)
def test_shape():
x = cuda.ftensor3()
v = cuda.CudaNdarray(numpy.zeros((3,4,5),dtype='float32'))
f = theano.function([x],x.shape)
v = cuda.CudaNdarray(numpy.zeros((3, 4, 5), dtype='float32'))
f = theano.function([x], x.shape)
topo = f.maker.fgraph.toposort()
assert numpy.all(f(v)==(3,4,5))
if theano.config.mode!='FAST_COMPILE':
assert len(topo)==4
assert isinstance(topo[0].op,T.opt.Shape_i)
assert isinstance(topo[1].op,T.opt.Shape_i)
assert isinstance(topo[2].op,T.opt.Shape_i)
assert isinstance(topo[3].op,T.opt.MakeVector)
assert numpy.all(f(v) == (3, 4, 5))
if theano.config.mode != 'FAST_COMPILE':
assert len(topo) == 4
assert isinstance(topo[0].op, T.opt.Shape_i)
assert isinstance(topo[1].op, T.opt.Shape_i)
assert isinstance(topo[2].op, T.opt.Shape_i)
assert isinstance(topo[3].op, T.opt.MakeVector)
def test_softmax_optimizations():
from theano.tensor.nnet.nnet import softmax, crossentropy_categorical_1hot
......@@ -66,16 +69,17 @@ def test_softmax_optimizations():
assert fgraph.outputs[0].owner.inputs[0].owner.op == cuda.host_from_gpu
assert fgraph.outputs[0].owner.inputs[0].owner.inputs[0].owner.op == cuda.nnet.gpu_crossentropy_softmax_argmax_1hot_with_bias
def test_may_share_memory_cuda():
from theano.misc.may_share_memory import may_share_memory
a = cuda.CudaNdarray(numpy.zeros((3,4),dtype='float32'))
b = cuda.CudaNdarray(numpy.zeros((3,4),dtype='float32'))
na = numpy.zeros((3,4))
nb = numpy.zeros((3,4))
a = cuda.CudaNdarray(numpy.zeros((3, 4), dtype='float32'))
b = cuda.CudaNdarray(numpy.zeros((3, 4), dtype='float32'))
na = numpy.zeros((3, 4))
nb = numpy.zeros((3, 4))
va = a.view()
vb = b.view()
ra = a.reshape((4,3))
rb = b.reshape((4,3))
ra = a.reshape((4, 3))
rb = b.reshape((4, 3))
#can't test the transpose as ta._strides = is not implemented
#manual transpose of a
......@@ -84,25 +88,28 @@ def test_may_share_memory_cuda():
#elem_size=elem_size = numpy.zeros(0,dtype=a.dtype).dtype.itemsize
#ta.gpudata += ta.size*elem_size
for a_,b_,rep in [(a,a,True),(b,b,True),(a,b,False),
(a,na,False),(b,nb,False),(na,b,False),(nb,a,False),
(a,va,True),(b,vb,True),(va,b,False),(a,vb,False),
(a,ra,True),(b,rb,True),(ra,b,False),(a,rb,False),
for a_, b_, rep in [(a, a, True), (b, b, True), (a, b, False),
(a, na, False), (b, nb, False),
(na, b, False), (nb, a, False),
(a, va, True), (b, vb, True),
(va, b, False), (a, vb, False),
(a, ra, True), (b, rb, True),
(ra, b, False), (a, rb, False),
]:
assert may_share_memory(a_,b_)==rep
assert may_share_memory(b_,a_)==rep
assert may_share_memory(a_, b_) == rep
assert may_share_memory(b_, a_) == rep
#test that it raise error when needed.
for a_,b_,rep in [(a,(0,),False),(a,1,False),(a,None,False)]:
assert may_share_memory(a_,b_,False)==rep
assert may_share_memory(b_,a_,False)==rep
for a_, b_, rep in [(a, (0,), False), (a, 1, False), (a, None, False)]:
assert may_share_memory(a_, b_, False) == rep
assert may_share_memory(b_, a_, False) == rep
try:
may_share_memory(a_,b_)
may_share_memory(a_, b_)
raise Exception("An error was expected")
except TypeError:
pass
try:
may_share_memory(b_,a_)
may_share_memory(b_, a_)
raise Exception("An error was expected")
except TypeError:
pass
......@@ -127,3 +134,12 @@ def test_deepcopy():
out = f(a_v)
assert out is not a_v
assert numpy.allclose(numpy.asarray(a_v), numpy.asarray(out))
def test_get_diagonal_subtensor_view():
test_conv3d2d.test_get_diagonal_subtensor_view(wrap=cuda.CudaNdarray)
def test_conv3d():
test_conv3d2d.test_conv3d(mode=mode_with_gpu,
shared=cuda.shared_constructor)
......@@ -561,6 +561,11 @@ conv3D = Conv3D()
:note: The order of dimensions does not correspond to the one in `conv2d`.
This is for optimization.
:note: The GPU implementation is very slow. You are better to use
:func:`conv3d2d <theano.tensor.nnet.conv3d2d.conv3d>` that is faster
on GPU.
"""
def computeH(V,W,b,d):
......
import theano
from theano.gradient import DisconnectedType
from theano.gof import Op, Apply
from theano import tensor
import theano.sandbox.cuda as cuda
def get_diagonal_subtensor_view(x, i0, i1):
"""Helper function for DiagonalSubtensor and
IncDiagonalSubtensor
:note: it return a partial view of x, not a partial copy.
"""
if x.shape[i0] < x.shape[i1]:
raise NotImplementedError('is this allowed?')
idx = [slice(None)] * x.ndim
idx[i0] = slice(x.shape[i1] - 1, None, None)
xview = x.__getitem__(tuple(idx))
strides = list(xview.strides)
strides[i1] -= strides[i0]
xview.strides = strides
return xview
class DiagonalSubtensor(Op):
"""Return a form a nd diagonal subtensor.
:param x: n-d tensor
:param i0: axis index in x
:param i1: axis index in x
:note: Work on the GPU.
``x`` is some n-dimensional tensor, but this Op only deals with a
matrix-shaped slice, using axes i0 and i1. Without loss of
generality, suppose that ``i0`` picks out our ``row`` dimension,
and i1 the ``column`` dimension.
So the relevant part of ``x`` is some matrix ``u``. Suppose it has 7 rows
and 4 columns::
[ 0 0 0 0 ]
[ 0 0 0 0 ]
[ 0 0 0 0 ]
[ 0 0 0 0 ]
[ 0 0 0 0 ]
[ 0 0 0 0 ]
The view returned by this function is also a matrix. It's a thick,
diagonal ``stripe`` across u that discards the lower left triangle
and the upper right triangle:
[ x 0 0 0 ]
[ x x 0 0 ]
[ x x x 0 ]
[ 0 x x x ]
[ 0 0 x x ]
[ 0 0 0 x ]
In this case the return value would be this view of shape 3x4. The
returned view has the same number of dimensions as the input
``x``, and the only difference is that the shape along dimension
``i0`` has been reduced by ``shape[i1] - 1`` because of the
triangles that got chopped out.
The NotImplementedError is meant to catch the case where shape[i0]
is too small for the stripe to reach across the matrix, in which
case it's not clear what this function should do. Maybe always
raise an error. I'd look back to the call site in the Conv3D to
see what's necessary at that point.
"""
def __str__(self):
if self.inplace:
return "%s{inplace}" % self.__class__.__name__
return "%s" % self.__class__.__name__
def __init__(self, inplace=False):
self.inplace = inplace
if inplace:
self.view_map = {0: [0]}
def __eq__(self, other):
return type(self) == type(other) and self.inplace == other.inplace
def __hash__(self):
return hash((type(self), self.inplace))
def make_node(self, x, i0, i1):
_i0 = tensor.as_tensor_variable(i0)
_i1 = tensor.as_tensor_variable(i1)
return Apply(self, [x, _i0, _i1], [x.type()])
def perform(self, node, inputs, output_storage):
xview = get_diagonal_subtensor_view(*inputs)
if self.inplace:
output_storage[0][0] = xview
else:
output_storage[0][0] = xview.copy()
def grad(self, inputs, g_outputs):
z = tensor.zeros_like(inputs[0])
gx = inc_diagonal_subtensor(z, inputs[1], inputs[2], g_outputs[0])
return [gx, DisconnectedType()(), DisconnectedType()()]
def connection_pattern(self, node):
rval = [[True], [False], [False]]
return rval
diagonal_subtensor = DiagonalSubtensor(False)
class IncDiagonalSubtensor(Op):
"""
The gradient of DiagonalSubtensor
"""
def __str__(self):
if self.inplace:
return "%s{inplace}" % self.__class__.__name__
return "%s" % self.__class__.__name__
def __init__(self, inplace=False):
self.inplace = inplace
if inplace:
self.destroy_map = {0: [0]}
def __eq__(self, other):
return type(self) == type(other) and self.inplace == other.inplace
def __hash__(self):
return hash((type(self), self.inplace))
def make_node(self, x, i0, i1, amt):
_i0 = tensor.as_tensor_variable(i0)
_i1 = tensor.as_tensor_variable(i1)
return Apply(self, [x, _i0, _i1, amt], [x.type()])
def perform(self, node, inputs, output_storage):
x, i0, i1, amt = inputs
if not self.inplace:
x = x.copy()
xview = get_diagonal_subtensor_view(x, i0, i1)
xview += amt
output_storage[0][0] = x
def grad(self, inputs, g_outputs):
x, i0, i1, amt = inputs
gy = g_outputs[0]
return [gy, DisconnectedType()(), DisconnectedType()(),
diagonal_subtensor(gy, i0, i1)]
def connection_pattern(self, node):
rval = [[True], [False], [False], [True]]
return rval
inc_diagonal_subtensor = IncDiagonalSubtensor(False)
def conv3d(signals, filters,
signals_shape=None, filters_shape=None,
border_mode='valid'):
"""Convolve spatio-temporal filters with a movie.
:param signals: timeseries of images whose pixels have color channels.
shape: [Ns, Ts, C, Hs, Ws]
:param filters: spatio-temporal filters
shape: [Nf, Tf, C, Hf, Wf]
:param signals_shape: None or a tuple/list with the shape of signals
:param filters_shape: None or a tuple/list with the shape of filters
:param border_mode: The only one tested is 'valid'.
:note: Work on the GPU.
"""
if isinstance(border_mode, str):
border_mode = (border_mode, border_mode, border_mode)
_signals_shape_5d = signals.shape if signals_shape is None else signals_shape
_filters_shape_5d = filters.shape if filters_shape is None else filters_shape
_signals_shape_4d = (
_signals_shape_5d[0] * _signals_shape_5d[1],
_signals_shape_5d[2],
_signals_shape_5d[3],
_signals_shape_5d[4],
)
_filters_shape_4d = (
_filters_shape_5d[0] * _filters_shape_5d[1],
_filters_shape_5d[2],
_filters_shape_5d[3],
_filters_shape_5d[4],
)
if border_mode[1] != border_mode[2]:
raise NotImplementedError('height and width bordermodes must match')
conv2d_signal_shape = _signals_shape_4d
conv2d_filter_shape = _filters_shape_4d
if signals_shape is None:
conv2d_signal_shape = None
if filters_shape is None:
conv2d_filter_shape = None
out_4d = tensor.nnet.conv2d(
signals.reshape(_signals_shape_4d),
filters.reshape(_filters_shape_4d),
image_shape=conv2d_signal_shape,
filter_shape=conv2d_filter_shape,
border_mode = border_mode[1]) # ignoring border_mode[2]
# reshape the output to restore its original size
# shape = Ns, Ts, Nf, Tf, W-Wf+1, H-Hf+1
if border_mode[1] == 'valid':
out_tmp = out_4d.reshape((
_signals_shape_5d[0], # Ns
_signals_shape_5d[1], # Ts
_filters_shape_5d[0], # Nf
_filters_shape_5d[1], # Tf
_signals_shape_5d[3] - _filters_shape_5d[3] + 1,
_signals_shape_5d[4] - _filters_shape_5d[4] + 1,
))
elif border_mode[1] == 'full':
out_tmp = out_4d.reshape((
_signals_shape_5d[0], # Ns
_signals_shape_5d[1], # Ts
_filters_shape_5d[0], # Nf
_filters_shape_5d[1], # Tf
_signals_shape_5d[3] + _filters_shape_5d[3] - 1,
_signals_shape_5d[4] + _filters_shape_5d[4] - 1,
))
elif border_mode[1] == 'same':
raise NotImplementedError()
else:
raise ValueError('invalid border mode', border_mode[1])
# now sum out along the Tf to get the output
# but we have to sum on a diagonal through the Tf and Ts submatrix.
if border_mode[0] == 'valid':
out_5d = diagonal_subtensor(out_tmp, 1, 3).sum(axis=3)
elif border_mode[0] in ('full', 'same'):
raise NotImplementedError('sequence border mode', border_mode[0])
else:
raise ValueError('invalid border mode', border_mode[1])
return out_5d
def make_gpu_optimizer(op, to_gpu):
"""This function create optimizer that move some inputs to the GPU
for op that work on both CPU and GPU.
The op object is created by calling op(), so good default value
are needed.
We suppose the same op work with CPU and GPU inputs.
:param op: the op that support GPU inputs
:param to_gpu: a list of op inputs that are moved to the GPU.
"""
@theano.gof.local_optimizer([])
def local_to_gpu(node):
"""
op(host_from_gpu()) -> host_from_gpu(op)
gpu_from_host(op) -> op(gpu_from_host)
"""
if isinstance(node.op, op):
#op(host_from_gpu()) -> host_from_gpu(op)
#If any of the input that go on the GPU are on the GPU,
#move the op to the gpu.
if any(node.inputs[idx].owner and
isinstance(node.inputs[idx].owner.op, cuda.HostFromGpu)
for idx in to_gpu):
new_inp = list(node.inputs)
for idx in to_gpu:
new_inp[idx] = cuda.gpu_from_host(new_inp[idx])
return [cuda.host_from_gpu(op()(*new_inp))]
if node.op == cuda.gpu_from_host:
#gpu_from_host(op) -> op(gpu_from_host)
host_input = node.inputs[0]
if host_input.owner and isinstance(host_input.owner.op,
op):
op_node = host_input.owner
new_inp = list(op_node.inputs)
for idx in to_gpu:
new_inp[idx] = cuda.gpu_from_host(new_inp[idx])
return [op()(*new_inp)]
return False
local_to_gpu.__name__ = "local_to_gpu_" + op.__name__
cuda.opt.register_opt()(local_to_gpu)
if cuda.cuda_available:
make_gpu_optimizer(DiagonalSubtensor, [0])
make_gpu_optimizer(IncDiagonalSubtensor, [0, 3])
import time
import numpy
from scipy import ndimage
import theano
from theano.tensor.nnet.conv3d2d import *
import theano.tests.unittest_tools as utt
if theano.config.mode == 'FAST_COMPILE':
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
else:
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
def test_get_diagonal_subtensor_view(wrap=lambda a: a):
x = numpy.arange(20).reshape(5, 4).astype('float32')
x = wrap(x)
xv01 = get_diagonal_subtensor_view(x, 0, 1)
# test that it works in 2d
assert numpy.all(numpy.asarray(xv01) == [[12, 9, 6, 3], [16, 13, 10, 7]])
x = numpy.arange(24).reshape(4, 3, 2)
xv01 = get_diagonal_subtensor_view(x, 0, 1)
xv02 = get_diagonal_subtensor_view(x, 0, 2)
xv12 = get_diagonal_subtensor_view(x, 1, 2)
#print 'x', x
#print 'xv01', xv01
#print 'xv02', xv02
assert numpy.all(numpy.asarray(xv01) == [
[[12, 13], [8, 9], [4, 5]],
[[18, 19], [14, 15], [10, 11]]])
assert numpy.all(numpy.asarray(xv02) == [
[[6, 1], [8, 3], [10, 5]],
[[12, 7], [14, 9], [16, 11]],
[[18, 13], [20, 15], [22, 17]],
])
# diagonal views of each leading matrix is the same
# as the slices out of the diagonal view of the entire 3d tensor
for xi, xvi in zip(x, xv12):
assert numpy.all(xvi == get_diagonal_subtensor_view(xi, 0, 1))
def pyconv3d(signals, filters):
Ns, Ts, C, Hs, Ws = signals.shape
Nf, Tf, C, Hf, Wf = filters.shape
Tf2 = Tf//2
Hf2 = Hf//2
Wf2 = Wf//2
rval = numpy.zeros((Ns, Ts-Tf+1, Nf, Hs-Hf+1, Ws-Wf+1))
for ns in xrange(Ns):
for nf in xrange(Nf):
for c in xrange(C):
s_i = signals[ns,:,c,:,:]
f_i = filters[nf,:,c,:,:]
r_i = rval[ns, :, nf, :, :]
o_i = ndimage.convolve(s_i, f_i, mode='constant', cval=1)
#print s_i.shape, f_i.shape, r_i.shape, o_i.shape
r_i += o_i[Tf2:-Tf2, Hf2:-Hf2, Wf2:-Wf2]
return rval
def test_conv3d(mode=mode_without_gpu, shared=theano.tensor._shared):
Ns, Ts, C, Hs, Ws = 3, 10, 3, 32, 32
Nf, Tf, C, Hf, Wf = 32, 5 , 3, 5 , 5
signals = numpy.arange(Ns*Ts*C*Hs*Ws).reshape(Ns, Ts, C, Hs, Ws).astype('float32')
filters = numpy.arange(Nf*Tf*C*Hf*Wf).reshape(Nf, Tf, C, Hf, Wf).astype('float32')
t0 = time.time()
pyres = pyconv3d(signals, filters)
print time.time() - t0
s_signals = shared(signals)
s_filters = shared(filters)
s_output = shared(signals*0)
out = conv3d(s_signals, s_filters,
signals_shape=signals.shape,
filters_shape=filters.shape)
newconv3d = theano.function([], [],
updates={s_output: out},
mode=mode)
t0 = time.time()
newconv3d()
print time.time() - t0
utt.assert_allclose(pyres, s_output.get_value(borrow=True))
gsignals, gfilters = theano.grad(out.sum(), [s_signals, s_filters])
gnewconv3d = theano.function([], [],
updates=[(s_filters, gfilters),
(s_signals, gsignals)],
mode=mode,
name='grad')
t0 = time.time()
gnewconv3d()
print 'grad', time.time() - t0
Ns, Ts, C, Hs, Ws = 3, 3, 3, 5, 5
Nf, Tf, C, Hf, Wf = 4, 2, 3, 2, 2
signals = numpy.random.rand(Ns, Ts, C, Hs, Ws).astype('float32')
filters = numpy.random.rand(Nf, Tf, C, Hf, Wf).astype('float32')
utt.verify_grad(conv3d, [signals, filters])
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论