Commit 36694a6d authored by Pascal Lamblin

Merge pull request #1454 from nouiz/conv3d2d

[MRG]Conv3d2d
......@@ -28,6 +28,7 @@ env:
- PART="-e test_basic.py theano/tensor/tests"
script:
- "if [ `expr \"$PART\" : '.*sparse'` -gt \"0\" ]; then pip install scipy==0.8 --use-mirrors; fi"
- "if [ `expr \"$PART\" : '.*nnet'` -gt \"0\" ]; then pip install scipy==0.8 --use-mirrors; fi"
- export THEANO_FLAGS=warn.ignore_bug_before=all,on_opt_error=raise,on_shape_error=raise
- python --version
- uname -a
......
......@@ -16,9 +16,6 @@
present in convolutional neural networks (where filters are 3D and pool
over several input channels).
The project `TheanoConv3d2d <https://github.com/jaberg/TheanoConv3d2d>`_
is probably faster than the Conv3d documented here.
.. module:: conv
:platform: Unix, Windows
:synopsis: ops for signal processing
......@@ -31,6 +28,21 @@ TODO: Give examples for how to use these things! They are pretty complicated.
- :func:`signal.conv2d <theano.tensor.signal.conv.conv2d>`.
- :func:`nnet.conv2d <theano.tensor.nnet.conv.conv2d>`.
- :func:`conv3D <theano.tensor.nnet.Conv3D.conv3D>`.
- :func:`conv3d2d <theano.tensor.nnet.conv3d2d.conv3d>`
  Another conv3d implementation that uses conv2d with data reshaping.
  It is faster in some cases than conv3d, specifically on the GPU
  (a usage sketch follows the function list below).
- `Faster conv2d <http://deeplearning.net/software/pylearn2/library/alex.html>`_
  This is in Pylearn2; it is not well documented and uses a different
  memory layout for the input. It is important to keep the input in
  the native memory layout and not to use dimshuffle on the inputs,
  otherwise you lose much of the speed-up, so this is not a drop-in
  replacement for conv2d.
  Normally those are called from the `linear transform
  <http://deeplearning.net/software/pylearn2/library/linear.html>`_
  implementation.
.. autofunction:: theano.tensor.nnet.conv.conv2d
.. autofunction:: theano.tensor.nnet.Conv3D.conv3D
.. autofunction:: theano.tensor.nnet.conv3d2d.conv3d
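Pending the TODO above, here is a minimal usage sketch for
``nnet.conv2d`` and ``conv3d2d.conv3d``. The shapes are illustrative
assumptions taken from the tests in this pull request, not requirements
of the ops::

    import theano
    import theano.tensor as T
    from theano.tensor.nnet.conv import conv2d
    from theano.tensor.nnet.conv3d2d import conv3d

    # 2d: images are (batch, channels, rows, cols),
    # filters are (nfilters, channels, frows, fcols)
    images = T.tensor4('images')
    filters2d = T.tensor4('filters2d')
    out2d = conv2d(images, filters2d)

    # 3d: signals are (batch, time, channels, rows, cols),
    # filters are (nfilters, ftime, channels, frows, fcols)
    ftensor5 = T.TensorType('float32', (False,) * 5)
    signals = ftensor5('signals')
    filters3d = ftensor5('filters3d')
    out3d = conv3d(signals, filters3d,
                   signals_shape=(3, 10, 3, 32, 32),
                   filters_shape=(32, 5, 3, 5, 5))

    f = theano.function([signals, filters3d], out3d)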
......@@ -12,11 +12,12 @@ import theano.tensor as T
# Skip test if cuda_ndarray is not available.
import theano.sandbox.cuda as cuda
from theano.tensor.nnet.tests import test_conv3d2d
if cuda.cuda_available == False:
raise SkipTest('Optional package cuda disabled')
if theano.config.mode=='FAST_COMPILE':
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
else:
......@@ -26,26 +27,28 @@ else:
def test_shape_i():
x = cuda.ftensor3()
v = cuda.CudaNdarray(numpy.zeros((3,4,5),dtype='float32'))
f = theano.function([x],x.shape[1])
v = cuda.CudaNdarray(numpy.zeros((3, 4, 5), dtype='float32'))
f = theano.function([x], x.shape[1])
topo = f.maker.fgraph.toposort()
assert f(v)==4
if theano.config.mode!='FAST_COMPILE':
assert len(topo)==1
assert isinstance(topo[0].op,T.opt.Shape_i)
assert f(v) == 4
if theano.config.mode != 'FAST_COMPILE':
assert len(topo) == 1
assert isinstance(topo[0].op, T.opt.Shape_i)
def test_shape():
x = cuda.ftensor3()
v = cuda.CudaNdarray(numpy.zeros((3,4,5),dtype='float32'))
f = theano.function([x],x.shape)
v = cuda.CudaNdarray(numpy.zeros((3, 4, 5), dtype='float32'))
f = theano.function([x], x.shape)
topo = f.maker.fgraph.toposort()
assert numpy.all(f(v)==(3,4,5))
if theano.config.mode!='FAST_COMPILE':
assert len(topo)==4
assert isinstance(topo[0].op,T.opt.Shape_i)
assert isinstance(topo[1].op,T.opt.Shape_i)
assert isinstance(topo[2].op,T.opt.Shape_i)
assert isinstance(topo[3].op,T.opt.MakeVector)
assert numpy.all(f(v) == (3, 4, 5))
if theano.config.mode != 'FAST_COMPILE':
assert len(topo) == 4
assert isinstance(topo[0].op, T.opt.Shape_i)
assert isinstance(topo[1].op, T.opt.Shape_i)
assert isinstance(topo[2].op, T.opt.Shape_i)
assert isinstance(topo[3].op, T.opt.MakeVector)
def test_softmax_optimizations():
from theano.tensor.nnet.nnet import softmax, crossentropy_categorical_1hot
......@@ -66,16 +69,17 @@ def test_softmax_optimizations():
assert fgraph.outputs[0].owner.inputs[0].owner.op == cuda.host_from_gpu
assert fgraph.outputs[0].owner.inputs[0].owner.inputs[0].owner.op == cuda.nnet.gpu_crossentropy_softmax_argmax_1hot_with_bias
def test_may_share_memory_cuda():
from theano.misc.may_share_memory import may_share_memory
a = cuda.CudaNdarray(numpy.zeros((3,4),dtype='float32'))
b = cuda.CudaNdarray(numpy.zeros((3,4),dtype='float32'))
na = numpy.zeros((3,4))
nb = numpy.zeros((3,4))
a = cuda.CudaNdarray(numpy.zeros((3, 4), dtype='float32'))
b = cuda.CudaNdarray(numpy.zeros((3, 4), dtype='float32'))
na = numpy.zeros((3, 4))
nb = numpy.zeros((3, 4))
va = a.view()
vb = b.view()
ra = a.reshape((4,3))
rb = b.reshape((4,3))
ra = a.reshape((4, 3))
rb = b.reshape((4, 3))
    # can't test the transpose, as assigning ta._strides is not implemented
    # manual transpose of a
......@@ -84,25 +88,28 @@ def test_may_share_memory_cuda():
    #elem_size = numpy.zeros(0, dtype=a.dtype).dtype.itemsize
#ta.gpudata += ta.size*elem_size
for a_,b_,rep in [(a,a,True),(b,b,True),(a,b,False),
(a,na,False),(b,nb,False),(na,b,False),(nb,a,False),
(a,va,True),(b,vb,True),(va,b,False),(a,vb,False),
(a,ra,True),(b,rb,True),(ra,b,False),(a,rb,False),
for a_, b_, rep in [(a, a, True), (b, b, True), (a, b, False),
(a, na, False), (b, nb, False),
(na, b, False), (nb, a, False),
(a, va, True), (b, vb, True),
(va, b, False), (a, vb, False),
(a, ra, True), (b, rb, True),
(ra, b, False), (a, rb, False),
]:
assert may_share_memory(a_,b_)==rep
assert may_share_memory(b_,a_)==rep
assert may_share_memory(a_, b_) == rep
assert may_share_memory(b_, a_) == rep
    # test that it raises an error when needed
for a_,b_,rep in [(a,(0,),False),(a,1,False),(a,None,False)]:
assert may_share_memory(a_,b_,False)==rep
assert may_share_memory(b_,a_,False)==rep
for a_, b_, rep in [(a, (0,), False), (a, 1, False), (a, None, False)]:
assert may_share_memory(a_, b_, False) == rep
assert may_share_memory(b_, a_, False) == rep
try:
may_share_memory(a_,b_)
may_share_memory(a_, b_)
raise Exception("An error was expected")
except TypeError:
pass
try:
may_share_memory(b_,a_)
may_share_memory(b_, a_)
raise Exception("An error was expected")
except TypeError:
pass
......@@ -127,3 +134,12 @@ def test_deepcopy():
out = f(a_v)
assert out is not a_v
assert numpy.allclose(numpy.asarray(a_v), numpy.asarray(out))
def test_get_diagonal_subtensor_view():
test_conv3d2d.test_get_diagonal_subtensor_view(wrap=cuda.CudaNdarray)
def test_conv3d():
test_conv3d2d.test_conv3d(mode=mode_with_gpu,
shared=cuda.shared_constructor)
......@@ -561,6 +561,11 @@ conv3D = Conv3D()
:note: The order of dimensions does not correspond to the one in `conv2d`.
This is for optimization.
    :note: The GPU implementation is very slow. You are better off using
        :func:`conv3d2d <theano.tensor.nnet.conv3d2d.conv3d>`, which is
        faster on the GPU.
"""
def computeH(V,W,b,d):
......
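The note above recommends conv3d2d.conv3d over conv3D on the GPU. For
comparison, here is a minimal sketch of calling conv3D itself; the
(batch, row, column, time, in channel) dimension layout and the meaning
of b and d are assumptions drawn from the op's docstring, not something
this diff guarantees:

    import theano
    import theano.tensor as T
    from theano.tensor.nnet.Conv3D import conv3D

    ftensor5 = T.TensorType('float32', (False,) * 5)
    V = ftensor5('V')   # assumed layout: (batch, row, column, time, in channel)
    W = ftensor5('W')   # assumed layout: (filters, row, column, time, in channel)
    b = T.fvector('b')  # assumed: one bias per output filter
    d = T.ivector('d')  # assumed: filter strides over the three video axes

    out = conv3D(V, W, b, d)
    f = theano.function([V, W, b, d], out)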
Diff is collapsed.
import time
import numpy
from scipy import ndimage
import theano
from theano.tensor.nnet.conv3d2d import *
import theano.tests.unittest_tools as utt
if theano.config.mode == 'FAST_COMPILE':
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
else:
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
def test_get_diagonal_subtensor_view(wrap=lambda a: a):
x = numpy.arange(20).reshape(5, 4).astype('float32')
x = wrap(x)
xv01 = get_diagonal_subtensor_view(x, 0, 1)
# test that it works in 2d
assert numpy.all(numpy.asarray(xv01) == [[12, 9, 6, 3], [16, 13, 10, 7]])
x = numpy.arange(24).reshape(4, 3, 2)
xv01 = get_diagonal_subtensor_view(x, 0, 1)
xv02 = get_diagonal_subtensor_view(x, 0, 2)
xv12 = get_diagonal_subtensor_view(x, 1, 2)
#print 'x', x
#print 'xv01', xv01
#print 'xv02', xv02
assert numpy.all(numpy.asarray(xv01) == [
[[12, 13], [8, 9], [4, 5]],
[[18, 19], [14, 15], [10, 11]]])
assert numpy.all(numpy.asarray(xv02) == [
[[6, 1], [8, 3], [10, 5]],
[[12, 7], [14, 9], [16, 11]],
[[18, 13], [20, 15], [22, 17]],
])
    # the diagonal view of each leading matrix is the same
    # as the corresponding slice of the diagonal view of the entire 3d tensor
for xi, xvi in zip(x, xv12):
assert numpy.all(xvi == get_diagonal_subtensor_view(xi, 0, 1))
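# A hedged pure-NumPy reference (not part of the original PR) for what
# get_diagonal_subtensor_view(x, 0, 1) computes, inferred from the
# assertions above: out[k, j] = x[k + n1 - 1 - j, j], i.e. anti-diagonal
# bands along the first two axes, with axis 1 reversed.  Unlike the real
# op, it builds a copy rather than a view.
def np_diagonal_subtensor_01(x):
    n1 = x.shape[1]
    out = numpy.empty((x.shape[0] - n1 + 1,) + x.shape[1:], dtype=x.dtype)
    for k in range(out.shape[0]):
        for j in range(n1):
            out[k, j] = x[k + n1 - 1 - j, j]
    return out


def test_np_diagonal_subtensor_01():
    # same expected values as the 2d assertion above
    x = numpy.arange(20).reshape(5, 4).astype('float32')
    assert numpy.all(np_diagonal_subtensor_01(x) ==
                     [[12, 9, 6, 3], [16, 13, 10, 7]])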
def pyconv3d(signals, filters):
Ns, Ts, C, Hs, Ws = signals.shape
Nf, Tf, C, Hf, Wf = filters.shape
Tf2 = Tf//2
Hf2 = Hf//2
Wf2 = Wf//2
rval = numpy.zeros((Ns, Ts-Tf+1, Nf, Hs-Hf+1, Ws-Wf+1))
for ns in xrange(Ns):
for nf in xrange(Nf):
for c in xrange(C):
                s_i = signals[ns, :, c, :, :]
                f_i = filters[nf, :, c, :, :]
                r_i = rval[ns, :, nf, :, :]
                # full convolution over the frame; the border region,
                # where the constant padding leaks in, is cropped off
                # below so only the 'valid' part is accumulated
                o_i = ndimage.convolve(s_i, f_i, mode='constant', cval=1)
                #print s_i.shape, f_i.shape, r_i.shape, o_i.shape
                r_i += o_i[Tf2:-Tf2, Hf2:-Hf2, Wf2:-Wf2]
return rval
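# Hypothetical smoke test (not in the original PR) for the pure-SciPy
# reference above: with all-ones inputs and a single 3x3x3 one-channel
# filter, every valid output position sums a 3*3*3 window of ones, so
# each entry equals 27.  Odd filter extents keep the border crop
# symmetric (even extents would make the slicing above mismatch).
def test_pyconv3d_smoke():
    sig = numpy.ones((1, 5, 1, 5, 5), dtype='float64')
    fil = numpy.ones((1, 3, 1, 3, 3), dtype='float64')
    out = pyconv3d(sig, fil)
    assert out.shape == (1, 3, 1, 3, 3)
    assert numpy.all(out == 27.0)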
def test_conv3d(mode=mode_without_gpu, shared=theano.tensor._shared):
Ns, Ts, C, Hs, Ws = 3, 10, 3, 32, 32
    Nf, Tf, C, Hf, Wf = 32, 5, 3, 5, 5
signals = numpy.arange(Ns*Ts*C*Hs*Ws).reshape(Ns, Ts, C, Hs, Ws).astype('float32')
filters = numpy.arange(Nf*Tf*C*Hf*Wf).reshape(Nf, Tf, C, Hf, Wf).astype('float32')
t0 = time.time()
pyres = pyconv3d(signals, filters)
print time.time() - t0
s_signals = shared(signals)
s_filters = shared(filters)
s_output = shared(signals*0)
out = conv3d(s_signals, s_filters,
signals_shape=signals.shape,
filters_shape=filters.shape)
newconv3d = theano.function([], [],
updates={s_output: out},
mode=mode)
t0 = time.time()
newconv3d()
print time.time() - t0
utt.assert_allclose(pyres, s_output.get_value(borrow=True))
gsignals, gfilters = theano.grad(out.sum(), [s_signals, s_filters])
gnewconv3d = theano.function([], [],
updates=[(s_filters, gfilters),
(s_signals, gsignals)],
mode=mode,
name='grad')
t0 = time.time()
gnewconv3d()
print 'grad', time.time() - t0
Ns, Ts, C, Hs, Ws = 3, 3, 3, 5, 5
Nf, Tf, C, Hf, Wf = 4, 2, 3, 2, 2
signals = numpy.random.rand(Ns, Ts, C, Hs, Ws).astype('float32')
filters = numpy.random.rand(Nf, Tf, C, Hf, Wf).astype('float32')
utt.verify_grad(conv3d, [signals, filters])