Merge pull request #1454 from nouiz/conv3d2d

[MRG]Conv3d2d

Merge pull request #1454 from nouiz/conv3d2d
36694a6d · Pascal Lamblin · de5e06e7 · 1a3a477e · 36694a6d · 36694a6d
--- a/.travis.yml
+++ b/.travis.yml
@@ -28,6 +28,7 @@ env:
  - PART="-e test_basic.py theano/tensor/tests"
 script:
  - "if [ `expr \"$PART\" : '.*sparse'` -gt \"0\" ]; then pip install scipy==0.8 --use-mirrors; fi"
+  - "if [ `expr \"$PART\" : '.*nnet'` -gt \"0\" ]; then pip install scipy==0.8 --use-mirrors; fi"
  - export THEANO_FLAGS=warn.ignore_bug_before=all,on_opt_error=raise,on_shape_error=raise
  - python --version
  - uname -a

--- a/doc/library/tensor/nnet/conv.txt
+++ b/doc/library/tensor/nnet/conv.txt
@@ -16,9 +16,6 @@
    present in convolutional neural networks (where filters are 3D and pool
    over several input channels).

-    The project `TheanoConv3d2d <https://github.com/jaberg/TheanoConv3d2d>`_
-    is probably faster then the Conv3d documented here.
-
 .. module:: conv
   :platform: Unix, Windows
   :synopsis: ops for signal processing
@@ -31,6 +28,21 @@ TODO: Give examples for how to use these things! They are pretty complicated.
    - :func:`signal.conv2d <theano.tensor.signal.conv.conv2d>`.
    - :func:`nnet.conv2d <theano.tensor.nnet.conv.conv2d>`.
    - :func:`conv3D <theano.tensor.nnet.Conv3D.conv3D>`.
+    - :func:`conv3d2d <theano.tensor.nnet.conv3d2d.conv3d>`
+      Another conv3d implementation that uses conv2d with data reshaping.
+      It is faster than conv3D in some cases, specifically on the GPU.
+    - `Faster conv2d <http://deeplearning.net/software/pylearn2/library/alex.html>`_
+
+      This is in Pylearn2; it is not well documented and uses a different
+      memory layout for the input.  It is important to have the input
+      in the native memory layout, and not to use dimshuffle on the
+      inputs, otherwise you lose much of the speed up. So this is not
+      a drop-in replacement for conv2d.
+
+      Normally those are called from the `linear transform
+      <http://deeplearning.net/software/pylearn2/library/linear.html>`_
+      implementation.

 .. autofunction:: theano.tensor.nnet.conv.conv2d
 .. autofunction:: theano.tensor.nnet.Conv3D.conv3D
+.. autofunction:: theano.tensor.nnet.conv3d2d.conv3d
--- a/theano/sandbox/cuda/tests/test_tensor_op.py
+++ b/theano/sandbox/cuda/tests/test_tensor_op.py
@@ -12,11 +12,12 @@ import theano.tensor as T

 # Skip test if cuda_ndarray is not available.
 import theano.sandbox.cuda as cuda
+from theano.tensor.nnet.tests import test_conv3d2d
 if cuda.cuda_available == False:
    raise SkipTest('Optional package cuda disabled')


-if theano.config.mode=='FAST_COMPILE':
+if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
 else:
@@ -26,26 +27,28 @@ else:

 def test_shape_i():
    x = cuda.ftensor3()
-    v = cuda.CudaNdarray(numpy.zeros((3,4,5),dtype='float32'))
-    f = theano.function([x],x.shape[1])
+    v = cuda.CudaNdarray(numpy.zeros((3, 4, 5), dtype='float32'))
+    f = theano.function([x], x.shape[1])
    topo = f.maker.fgraph.toposort()
-    assert f(v)==4
-    if theano.config.mode!='FAST_COMPILE':
-        assert len(topo)==1
-        assert isinstance(topo[0].op,T.opt.Shape_i)
+    assert f(v) == 4
+    if theano.config.mode != 'FAST_COMPILE':
+        assert len(topo) == 1
+        assert isinstance(topo[0].op, T.opt.Shape_i)
+

 def test_shape():
    x = cuda.ftensor3()
-    v = cuda.CudaNdarray(numpy.zeros((3,4,5),dtype='float32'))
-    f = theano.function([x],x.shape)
+    v = cuda.CudaNdarray(numpy.zeros((3, 4, 5), dtype='float32'))
+    f = theano.function([x], x.shape)
    topo = f.maker.fgraph.toposort()
-    assert numpy.all(f(v)==(3,4,5))
-    if theano.config.mode!='FAST_COMPILE':
-        assert len(topo)==4
-        assert isinstance(topo[0].op,T.opt.Shape_i)
-        assert isinstance(topo[1].op,T.opt.Shape_i)
-        assert isinstance(topo[2].op,T.opt.Shape_i)
-        assert isinstance(topo[3].op,T.opt.MakeVector)
+    assert numpy.all(f(v) == (3, 4, 5))
+    if theano.config.mode != 'FAST_COMPILE':
+        assert len(topo) == 4
+        assert isinstance(topo[0].op, T.opt.Shape_i)
+        assert isinstance(topo[1].op, T.opt.Shape_i)
+        assert isinstance(topo[2].op, T.opt.Shape_i)
+        assert isinstance(topo[3].op, T.opt.MakeVector)
+

 def test_softmax_optimizations():
    from theano.tensor.nnet.nnet import softmax, crossentropy_categorical_1hot
@@ -66,16 +69,17 @@ def test_softmax_optimizations():
    assert fgraph.outputs[0].owner.inputs[0].owner.op == cuda.host_from_gpu
    assert fgraph.outputs[0].owner.inputs[0].owner.inputs[0].owner.op == cuda.nnet.gpu_crossentropy_softmax_argmax_1hot_with_bias

+
 def test_may_share_memory_cuda():
    from theano.misc.may_share_memory import may_share_memory
-    a = cuda.CudaNdarray(numpy.zeros((3,4),dtype='float32'))
-    b = cuda.CudaNdarray(numpy.zeros((3,4),dtype='float32'))
-    na = numpy.zeros((3,4))
-    nb = numpy.zeros((3,4))
+    a = cuda.CudaNdarray(numpy.zeros((3, 4), dtype='float32'))
+    b = cuda.CudaNdarray(numpy.zeros((3, 4), dtype='float32'))
+    na = numpy.zeros((3, 4))
+    nb = numpy.zeros((3, 4))
    va = a.view()
    vb = b.view()
-    ra = a.reshape((4,3))
-    rb = b.reshape((4,3))
+    ra = a.reshape((4, 3))
+    rb = b.reshape((4, 3))

    #can't test the transpose as ta._strides = is not implemented
    #manual transpose of a
@@ -84,25 +88,28 @@ def test_may_share_memory_cuda():
    #elem_size=elem_size = numpy.zeros(0,dtype=a.dtype).dtype.itemsize
    #ta.gpudata += ta.size*elem_size

-    for a_,b_,rep in [(a,a,True),(b,b,True),(a,b,False),
-                      (a,na,False),(b,nb,False),(na,b,False),(nb,a,False),
-                      (a,va,True),(b,vb,True),(va,b,False),(a,vb,False),
-                      (a,ra,True),(b,rb,True),(ra,b,False),(a,rb,False),
+    for a_, b_, rep in [(a, a, True), (b, b, True), (a, b, False),
+                        (a, na, False), (b, nb, False),
+                        (na, b, False), (nb, a, False),
+                        (a, va, True), (b, vb, True),
+                        (va, b, False), (a, vb, False),
+                        (a, ra, True), (b, rb, True),
+                        (ra, b, False), (a, rb, False),
                      ]:
-        assert may_share_memory(a_,b_)==rep
-        assert may_share_memory(b_,a_)==rep
+        assert may_share_memory(a_, b_) == rep
+        assert may_share_memory(b_, a_) == rep

    #test that it raise error when needed.
-    for a_,b_,rep in [(a,(0,),False),(a,1,False),(a,None,False)]:
-        assert may_share_memory(a_,b_,False)==rep
-        assert may_share_memory(b_,a_,False)==rep
+    for a_, b_, rep in [(a, (0,), False), (a, 1, False), (a, None, False)]:
+        assert may_share_memory(a_, b_, False) == rep
+        assert may_share_memory(b_, a_, False) == rep
        try:
-            may_share_memory(a_,b_)
+            may_share_memory(a_, b_)
            raise Exception("An error was expected")
        except TypeError:
            pass
        try:
-            may_share_memory(b_,a_)
+            may_share_memory(b_, a_)
            raise Exception("An error was expected")
        except TypeError:
            pass
@@ -127,3 +134,12 @@ def test_deepcopy():
    out = f(a_v)
    assert out is not a_v
    assert numpy.allclose(numpy.asarray(a_v), numpy.asarray(out))
+
+
+def test_get_diagonal_subtensor_view():
+    test_conv3d2d.test_get_diagonal_subtensor_view(wrap=cuda.CudaNdarray)
+
+
+def test_conv3d():
+    test_conv3d2d.test_conv3d(mode=mode_with_gpu,
+                              shared=cuda.shared_constructor)
--- a/theano/tensor/nnet/Conv3D.py
+++ b/theano/tensor/nnet/Conv3D.py
@@ -561,6 +561,11 @@ conv3D = Conv3D()

 :note: The order of dimensions does not correspond to the one in `conv2d`.
       This is for optimization.
+
+:note: The GPU implementation is very slow. You are better off using
+    :func:`conv3d2d <theano.tensor.nnet.conv3d2d.conv3d>`, which is faster
+    on GPU.
+
 """

 def computeH(V,W,b,d):

--- a/theano/tensor/nnet/conv3d2d.py
+++ b/theano/tensor/nnet/conv3d2d.py
+import theano
+from theano.gradient import DisconnectedType
+from theano.gof import Op, Apply
+from theano import tensor
+import theano.sandbox.cuda as cuda
+
+
+def get_diagonal_subtensor_view(x, i0, i1):
+    """Return a strided view of the diagonal stripe of ``x`` over two axes.
+
+    Shared helper of DiagonalSubtensor and IncDiagonalSubtensor.
+
+    :param x: array exposing ``shape``, ``ndim``, ``strides`` and slicing
+    :param i0: axis whose length shrinks by ``x.shape[i1] - 1`` in the result
+    :param i1: axis whose stride is skewed by the stride of axis ``i0``
+    :raises NotImplementedError: when ``x.shape[i0] < x.shape[i1]``
+
+    :note: the result is a view on the memory of ``x``, not a copy.
+    """
+    len0, len1 = x.shape[i0], x.shape[i1]
+    if len0 < len1:
+        raise NotImplementedError('is this allowed?')
+
+    # Skip the first ``len1 - 1`` entries of axis i0; keep all other axes.
+    selector = [slice(None) for _ in range(x.ndim)]
+    selector[i0] = slice(len1 - 1, None, None)
+    view = x[tuple(selector)]
+
+    # Make every step along i1 also step one entry backwards along i0,
+    # so that walking axis i1 follows the diagonal stripe.
+    skewed = list(view.strides)
+    skewed[i1] = skewed[i1] - skewed[i0]
+    view.strides = skewed
+    return view
+
+
+class DiagonalSubtensor(Op):
+    """Extract a diagonal stripe of ``x`` spanning the axes ``i0`` and ``i1``.
+
+    :param x: n-d tensor
+    :param i0: axis index in x
+    :param i1: axis index in x
+    :note: Works on the GPU.
+
+    Only the matrix-shaped slices of ``x`` along the axes ``i0`` (the
+    ``row`` axis) and ``i1`` (the ``column`` axis) matter here; every
+    other axis is carried through unchanged.
+
+    Let ``u`` be such a slice with 6 rows and 4 columns. The result
+    keeps the entries marked ``x`` and discards the upper left and the
+    lower right triangles::
+
+        [ 0 0 0 x ]
+        [ 0 0 x x ]
+        [ 0 x x x ]
+        [ x x x 0 ]
+        [ x x 0 0 ]
+        [ x 0 0 0 ]
+
+    The stripe is returned as a 3x4 matrix ``r`` with
+    ``r[j, k] = u[j + 3 - k, k]``. The output has as many dimensions
+    as ``x``; only the length of axis ``i0`` changes, shrinking by
+    ``shape[i1] - 1``.
+
+    The work is delegated to ``get_diagonal_subtensor_view``, which
+    raises NotImplementedError when ``shape[i0] < shape[i1]``, i.e.
+    when the stripe cannot reach across the matrix.
+
+    With ``inplace=True`` the output is a view on ``x`` (declared
+    through ``view_map``); otherwise it is a copy of that view.
+    """
+    def __init__(self, inplace=False):
+        self.inplace = inplace
+        if inplace:
+            # Output 0 aliases the memory of input 0.
+            self.view_map = {0: [0]}
+
+    def __str__(self):
+        name = self.__class__.__name__
+        if self.inplace:
+            return name + "{inplace}"
+        return name
+
+    def __eq__(self, other):
+        same_type = type(self) == type(other)
+        return same_type and self.inplace == other.inplace
+
+    def __hash__(self):
+        key = (type(self), self.inplace)
+        return hash(key)
+
+    def make_node(self, x, i0, i1):
+        # The axis indices become symbolic inputs; x is used as given.
+        axis0 = tensor.as_tensor_variable(i0)
+        axis1 = tensor.as_tensor_variable(i1)
+        out = x.type()
+        return Apply(self, [x, axis0, axis1], [out])
+
+    def perform(self, node, inputs, output_storage):
+        x, i0, i1 = inputs
+        stripe = get_diagonal_subtensor_view(x, i0, i1)
+        if not self.inplace:
+            stripe = stripe.copy()
+        output_storage[0][0] = stripe
+
+    def grad(self, inputs, g_outputs):
+        # Scatter the output gradient back onto the stripe of a zero tensor
+        # shaped like x; the axis indices receive no gradient.
+        x, i0, i1 = inputs
+        zeros = tensor.zeros_like(x)
+        gx = inc_diagonal_subtensor(zeros, i0, i1, g_outputs[0])
+        return [gx, DisconnectedType()(), DisconnectedType()()]
+
+    def connection_pattern(self, node):
+        # Only x (input 0) is connected to the output.
+        return [[True], [False], [False]]
+
+diagonal_subtensor = DiagonalSubtensor(inplace=False)
+
+
+class IncDiagonalSubtensor(Op):
+    """Add ``amt`` to the diagonal stripe of ``x`` over the axes i0 and i1.
+
+    This is the gradient of DiagonalSubtensor.
+
+    :param x: n-d tensor that receives the increment
+    :param i0: axis index in x
+    :param i1: axis index in x
+    :param amt: values added to the stripe selected by
+        ``get_diagonal_subtensor_view(x, i0, i1)``
+
+    With ``inplace=True`` the input ``x`` itself is modified (declared
+    through ``destroy_map``); otherwise a copy of ``x`` is incremented.
+    """
+    def __init__(self, inplace=False):
+        self.inplace = inplace
+        if inplace:
+            # Output 0 overwrites the memory of input 0.
+            self.destroy_map = {0: [0]}
+
+    def __str__(self):
+        name = self.__class__.__name__
+        if self.inplace:
+            return name + "{inplace}"
+        return name
+
+    def __eq__(self, other):
+        same_type = type(self) == type(other)
+        return same_type and self.inplace == other.inplace
+
+    def __hash__(self):
+        key = (type(self), self.inplace)
+        return hash(key)
+
+    def make_node(self, x, i0, i1, amt):
+        # The axis indices become symbolic inputs; x and amt are used as given.
+        axis0 = tensor.as_tensor_variable(i0)
+        axis1 = tensor.as_tensor_variable(i1)
+        out = x.type()
+        return Apply(self, [x, axis0, axis1, amt], [out])
+
+    def perform(self, node, inputs, output_storage):
+        x, i0, i1, amt = inputs
+        target = x if self.inplace else x.copy()
+        # The stripe is a view, so the in-place add writes into ``target``.
+        stripe = get_diagonal_subtensor_view(target, i0, i1)
+        stripe += amt
+        output_storage[0][0] = target
+
+    def grad(self, inputs, g_outputs):
+        _x, i0, i1, _amt = inputs
+        gy = g_outputs[0]
+        # x gets the output gradient unchanged, amt gets its stripe;
+        # the axis indices receive no gradient.
+        grads = [gy, DisconnectedType()(), DisconnectedType()()]
+        grads.append(diagonal_subtensor(gy, i0, i1))
+        return grads
+
+    def connection_pattern(self, node):
+        # x (input 0) and amt (input 3) are connected to the output.
+        return [[True], [False], [False], [True]]
+inc_diagonal_subtensor = IncDiagonalSubtensor(inplace=False)
+
+
+def conv3d(signals, filters,
+           signals_shape=None, filters_shape=None,
+           border_mode='valid'):
+    """Convolve spatio-temporal filters with a movie.
+
+    The 5d inputs are flattened to 4d by merging the two leading axes,
+    convolved with ``conv2d``, and the result is summed along the
+    (Ts, Tf) diagonal with ``diagonal_subtensor``.
+
+    :param signals: timeseries of images whose pixels have color channels.
+            shape: [Ns, Ts, C, Hs, Ws]
+    :param filters: spatio-temporal filters
+            shape: [Nf, Tf, C, Hf, Wf]
+    :param signals_shape: None or a tuple/list with the shape of signals
+    :param filters_shape: None or a tuple/list with the shape of filters
+    :param border_mode: a string used for the time, height and width
+        axes, or a sequence of three strings (time, height, width).
+        The only one tested is 'valid'.
+
+    :return: symbolic 5d tensor; with 'valid' borders its shape is
+        [Ns, Ts - Tf + 1, Nf, Hs - Hf + 1, Ws - Wf + 1]
+    :raises NotImplementedError: if the height and width border modes
+        differ, if the spatial mode is 'same', or if the time mode is
+        'full' or 'same'
+    :raises ValueError: for any other unknown border mode
+
+    :note: Works on the GPU.
+    """
+
+    if isinstance(border_mode, str):
+        border_mode = (border_mode, border_mode, border_mode)
+
+    _signals_shape_5d = signals.shape if signals_shape is None else signals_shape
+    _filters_shape_5d = filters.shape if filters_shape is None else filters_shape
+
+    # Merge (N, T) into one leading axis so that conv2d can be used.
+    _signals_shape_4d = (
+        _signals_shape_5d[0] * _signals_shape_5d[1],
+        _signals_shape_5d[2],
+        _signals_shape_5d[3],
+        _signals_shape_5d[4],
+        )
+    _filters_shape_4d = (
+        _filters_shape_5d[0] * _filters_shape_5d[1],
+        _filters_shape_5d[2],
+        _filters_shape_5d[3],
+        _filters_shape_5d[4],
+        )
+
+    if border_mode[1] != border_mode[2]:
+        raise NotImplementedError('height and width bordermodes must match')
+    # Only hand static shapes to conv2d when the caller supplied them.
+    conv2d_signal_shape = _signals_shape_4d
+    conv2d_filter_shape = _filters_shape_4d
+    if signals_shape is None:
+        conv2d_signal_shape = None
+    if filters_shape is None:
+        conv2d_filter_shape = None
+
+    out_4d = tensor.nnet.conv2d(
+        signals.reshape(_signals_shape_4d),
+        filters.reshape(_filters_shape_4d),
+        image_shape=conv2d_signal_shape,
+        filter_shape=conv2d_filter_shape,
+        border_mode=border_mode[1])  # ignoring border_mode[2]
+
+    # reshape the output to restore its original size
+    # shape = Ns, Ts, Nf, Tf, H-Hf+1, W-Wf+1
+    if border_mode[1] == 'valid':
+        out_tmp = out_4d.reshape((
+            _signals_shape_5d[0],  # Ns
+            _signals_shape_5d[1],  # Ts
+            _filters_shape_5d[0],  # Nf
+            _filters_shape_5d[1],  # Tf
+            _signals_shape_5d[3] - _filters_shape_5d[3] + 1,
+            _signals_shape_5d[4] - _filters_shape_5d[4] + 1,
+            ))
+    elif border_mode[1] == 'full':
+        out_tmp = out_4d.reshape((
+            _signals_shape_5d[0],  # Ns
+            _signals_shape_5d[1],  # Ts
+            _filters_shape_5d[0],  # Nf
+            _filters_shape_5d[1],  # Tf
+            _signals_shape_5d[3] + _filters_shape_5d[3] - 1,
+            _signals_shape_5d[4] + _filters_shape_5d[4] - 1,
+            ))
+    elif border_mode[1] == 'same':
+        raise NotImplementedError()
+    else:
+        raise ValueError('invalid border mode', border_mode[1])
+
+    # now sum out along the Tf to get the output
+    # but we have to sum on a diagonal through the Tf and Ts submatrix.
+    if border_mode[0] == 'valid':
+        out_5d = diagonal_subtensor(out_tmp, 1, 3).sum(axis=3)
+    elif border_mode[0] in ('full', 'same'):
+        raise NotImplementedError('sequence border mode', border_mode[0])
+    else:
+        # Report the time-axis mode that was actually rejected.
+        raise ValueError('invalid border mode', border_mode[0])
+    return out_5d
+
+
+def make_gpu_optimizer(op, to_gpu):
+    """Create and register a local optimizer that moves ``op`` to the GPU.
+
+    It is meant for an op class that accepts both CPU and GPU inputs.
+    The replacement op is built by calling ``op()`` without arguments,
+    so the class needs good default values.
+
+    :param op: the op class that supports GPU inputs
+    :param to_gpu: a list of input positions of ``op`` that are moved
+        to the GPU.
+
+    """
+    def on_gpu(inputs):
+        # Copy of ``inputs`` with the ``to_gpu`` positions transferred.
+        moved = list(inputs)
+        for pos in to_gpu:
+            moved[pos] = cuda.gpu_from_host(moved[pos])
+        return moved
+
+    def produced_by(var, op_class):
+        return var.owner and isinstance(var.owner.op, op_class)
+
+    @theano.gof.local_optimizer([])
+    def local_to_gpu(node):
+        """
+        op(host_from_gpu()) -> host_from_gpu(op)
+        gpu_from_host(op) -> op(gpu_from_host)
+        """
+        # op(host_from_gpu()) -> host_from_gpu(op)
+        # Triggered as soon as one of the movable inputs comes from the GPU.
+        if isinstance(node.op, op) and any(
+                produced_by(node.inputs[pos], cuda.HostFromGpu)
+                for pos in to_gpu):
+            return [cuda.host_from_gpu(op()(*on_gpu(node.inputs)))]
+        # gpu_from_host(op) -> op(gpu_from_host)
+        if node.op == cuda.gpu_from_host:
+            producer = node.inputs[0].owner
+            if producer and isinstance(producer.op, op):
+                return [op()(*on_gpu(producer.inputs))]
+        return False
+    local_to_gpu.__name__ = "local_to_gpu_" + op.__name__
+    cuda.opt.register_opt()(local_to_gpu)
+
+# Register the GPU optimizations only when CUDA is available.
+# The lists give the input positions moved to the GPU: ``x`` for
+# DiagonalSubtensor, ``x`` and ``amt`` for IncDiagonalSubtensor.
+if cuda.cuda_available:
+    make_gpu_optimizer(DiagonalSubtensor, [0])
+    make_gpu_optimizer(IncDiagonalSubtensor, [0, 3])
--- a/theano/tensor/nnet/tests/test_conv3d2d.py
+++ b/theano/tensor/nnet/tests/test_conv3d2d.py
+import time
+
+import numpy
+from scipy import ndimage
+import theano
+from theano.tensor.nnet.conv3d2d import *
+import theano.tests.unittest_tools as utt
+
+
+# Compilation mode with the 'gpu' optimizations excluded; FAST_RUN is
+# used as the base when the configured mode is FAST_COMPILE.
+mode_without_gpu = (
+    theano.compile.mode.get_mode('FAST_RUN')
+    if theano.config.mode == 'FAST_COMPILE'
+    else theano.compile.mode.get_default_mode()
+).excluding('gpu')
+
+
+def test_get_diagonal_subtensor_view(wrap=lambda a: a):
+    """Check get_diagonal_subtensor_view on a 2d and a 3d array.
+
+    :param wrap: callable applied to the 2d float32 input before the
+        view is taken (identity by default).
+    """
+    # 2d case, the only one that goes through ``wrap``
+    mat = wrap(numpy.arange(20).reshape(5, 4).astype('float32'))
+    stripe = get_diagonal_subtensor_view(mat, 0, 1)
+    assert numpy.all(
+        numpy.asarray(stripe) == [[12, 9, 6, 3], [16, 13, 10, 7]])
+
+    # 3d case: stripes between axis 0 and each of the other axes
+    cube = numpy.arange(24).reshape(4, 3, 2)
+    cases = [
+        (1, [[[12, 13], [8, 9], [4, 5]],
+             [[18, 19], [14, 15], [10, 11]]]),
+        (2, [[[6, 1], [8, 3], [10, 5]],
+             [[12, 7], [14, 9], [16, 11]],
+             [[18, 13], [20, 15], [22, 17]]]),
+    ]
+    for axis, expected in cases:
+        got = get_diagonal_subtensor_view(cube, 0, axis)
+        assert numpy.all(numpy.asarray(got) == expected)
+
+    # diagonal views of each leading matrix is the same
+    # as the slices out of the diagonal view of the entire 3d tensor
+    xv12 = get_diagonal_subtensor_view(cube, 1, 2)
+    for xi, xvi in zip(cube, xv12):
+        assert numpy.all(xvi == get_diagonal_subtensor_view(xi, 0, 1))
+
+
+def pyconv3d(signals, filters):
+    """Reference 'valid' 3d convolution built on ``ndimage.convolve``.
+
+    :param signals: array of shape [Ns, Ts, C, Hs, Ws]
+    :param filters: array of shape [Nf, Tf, C, Hf, Wf]; Tf, Hf and Wf
+        must be odd
+    :return: array of shape [Ns, Ts-Tf+1, Nf, Hs-Hf+1, Ws-Wf+1], summed
+        over the C channels
+    :raises ValueError: if a filter size along T, H or W is even; the
+        symmetric crop used here would then have the wrong length.
+    """
+    Ns, Ts, C, Hs, Ws = signals.shape
+    Nf, Tf, C, Hf, Wf = filters.shape
+
+    if not (Tf % 2 and Hf % 2 and Wf % 2):
+        raise ValueError('pyconv3d only supports odd filter sizes',
+                         (Tf, Hf, Wf))
+
+    Tf2 = Tf // 2
+    Hf2 = Hf // 2
+    Wf2 = Wf // 2
+    # Explicit stops instead of ``h:-h``: the latter is empty when a
+    # filter has size 1 along an axis (h == 0).
+    valid = (slice(Tf2, Ts - Tf2),
+             slice(Hf2, Hs - Hf2),
+             slice(Wf2, Ws - Wf2))
+
+    rval = numpy.zeros((Ns, Ts - Tf + 1, Nf, Hs - Hf + 1, Ws - Wf + 1))
+    for ns in range(Ns):
+        for nf in range(Nf):
+            for c in range(C):
+                s_i = signals[ns, :, c, :, :]
+                f_i = filters[nf, :, c, :, :]
+                # r_i is a view: the in-place add accumulates into rval.
+                r_i = rval[ns, :, nf, :, :]
+                # The padded border (cval) is cropped away just below.
+                o_i = ndimage.convolve(s_i, f_i, mode='constant', cval=1)
+                r_i += o_i[valid]
+    return rval
+
+
+def test_conv3d(mode=mode_without_gpu, shared=theano.tensor._shared):
+    """Compare conv3d with pyconv3d, time it, and check its gradient.
+
+    :param mode: compilation mode used for the compiled functions
+    :param shared: constructor used to build the shared variables
+    """
+
+    Ns, Ts, C, Hs, Ws = 3, 10, 3, 32, 32
+    Nf, Tf, C, Hf, Wf = 32, 5 , 3, 5 , 5
+
+    signals = numpy.arange(Ns*Ts*C*Hs*Ws).reshape(Ns, Ts, C, Hs, Ws).astype('float32')
+    filters = numpy.arange(Nf*Tf*C*Hf*Wf).reshape(Nf, Tf, C, Hf, Wf).astype('float32')
+
+    # Reference result; its running time is printed.
+    t0 = time.time()
+    pyres = pyconv3d(signals, filters)
+    print time.time() - t0
+
+    s_signals = shared(signals)
+    s_filters = shared(filters)
+    # Receives the convolution result through ``updates`` below.
+    s_output = shared(signals*0)
+
+    out = conv3d(s_signals, s_filters,
+                 signals_shape=signals.shape,
+                 filters_shape=filters.shape)
+
+    newconv3d = theano.function([], [],
+                                updates={s_output: out},
+                                mode=mode)
+
+    t0 = time.time()
+    newconv3d()
+    print time.time() - t0
+    utt.assert_allclose(pyres, s_output.get_value(borrow=True))
+    # The gradient function is only run and timed; its values are not
+    # checked here. It overwrites s_filters and s_signals.
+    gsignals, gfilters = theano.grad(out.sum(), [s_signals, s_filters])
+    gnewconv3d = theano.function([], [],
+                                 updates=[(s_filters, gfilters),
+                                          (s_signals, gsignals)],
+                                 mode=mode,
+                                 name='grad')
+
+    t0 = time.time()
+    gnewconv3d()
+    print 'grad', time.time() - t0
+
+    # Smaller random problem for the gradient check with verify_grad.
+    Ns, Ts, C, Hs, Ws = 3, 3, 3, 5, 5
+    Nf, Tf, C, Hf, Wf = 4, 2, 3, 2, 2
+
+    signals = numpy.random.rand(Ns, Ts, C, Hs, Ws).astype('float32')
+    filters = numpy.random.rand(Nf, Tf, C, Hf, Wf).astype('float32')
+    utt.verify_grad(conv3d, [signals, filters])