提交 d844e6c1 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #6252 from Faruk-Ahmed/conv3d

phase out outdated conv3d
...@@ -125,9 +125,6 @@ TODO: Give examples on how to use these things! They are pretty complicated. ...@@ -125,9 +125,6 @@ TODO: Give examples on how to use these things! They are pretty complicated.
``THEANO_FLAGS=optimizer_excluding=conv_dnn`` in your environment. ``THEANO_FLAGS=optimizer_excluding=conv_dnn`` in your environment.
As dnn_conv has a gradient defined, you can also use it manually. As dnn_conv has a gradient defined, you can also use it manually.
- Implemented operators for neural network 3D / video convolution: - Implemented operators for neural network 3D / video convolution:
- :func:`conv3D <theano.tensor.nnet.Conv3D.conv3D>`
3D Convolution applying multi-channel 3D filters to batches of
multi-channel 3D images. It does not flip the kernel.
- :func:`GpuCorr3dMM <theano.gpuarray.blas.GpuCorr3dMM>` - :func:`GpuCorr3dMM <theano.gpuarray.blas.GpuCorr3dMM>`
This is a GPU-only 3d correlation relying on a Toeplitz matrix This is a GPU-only 3d correlation relying on a Toeplitz matrix
and gemm implementation (see :func:`GpuCorrMM <theano.sandbox.cuda.blas.GpuCorrMM>`) and gemm implementation (see :func:`GpuCorrMM <theano.sandbox.cuda.blas.GpuCorrMM>`)
...@@ -168,7 +165,6 @@ TODO: Give examples on how to use these things! They are pretty complicated. ...@@ -168,7 +165,6 @@ TODO: Give examples on how to use these things! They are pretty complicated.
.. autofunction:: theano.tensor.nnet.conv2d .. autofunction:: theano.tensor.nnet.conv2d
.. autofunction:: theano.tensor.nnet.conv2d_transpose .. autofunction:: theano.tensor.nnet.conv2d_transpose
.. autofunction:: theano.tensor.nnet.conv3d .. autofunction:: theano.tensor.nnet.conv3d
.. autofunction:: theano.tensor.nnet.Conv3D.conv3D
.. autofunction:: theano.tensor.nnet.conv3d2d.conv3d .. autofunction:: theano.tensor.nnet.conv3d2d.conv3d
.. autofunction:: theano.tensor.nnet.conv.conv2d .. autofunction:: theano.tensor.nnet.conv.conv2d
......
from __future__ import absolute_import, print_function, division
import numpy as np
from six.moves import xrange
import theano
from theano.tensor import basic as T
# from util import strutil
from theano.tensor.blas_headers import blas_header_text, blas_header_version
from theano.tensor.blas import ldflags
from theano.misc import strutil
from theano.gradient import grad_undefined
# Note: not a true convolution because we don't bother with flipping the kernel
# An op that takes a weight tensor W. a bias vector b, and a visible tensor V, produces a hidden unit tensor H
# Also parameterized by integer strides dr,dc,dt
# H[i,r,c,t,j] = video i within the minibatch, feature map j, location and time within feature map (r,c,t)
# W[j,k,l,m,z] = weights connecting H[i,r,c,t,j] to V[i,dr*r+k,dc*c+l,dt*t+m,z]
# b[j] = bias of feature map j
# V[i,r,c,t,j] = pixel at (r,c,t) within video featuremap j of video i within the minibatch
# i.e., H[i,j,r,c,t] = b_j + sum_k sum_l sum_m sum_z W[j,k,l,m,z] V[i,z, dr*r+k,dc*c+l,dt*t+m]
# The layouts of these variables are chosen to improve locality of reference.
# numpy seems to put the largest stride on axis 0 and decrease the stride from there. If we do convolution
# one filter at a time, one example at a time, then we want the largest strides to
# be over the examples. We want the smallest stride to be over the input channel because as we change
# the channel we re-visit the same location in the input.
# The smallest stride being over the input channel means that the weights need to be formatted with the input
# channel as the last index
# partial C / partial b_j = sum_i sum_k sum_r sum_c sum_t (partial C / partial H[i,r,c,t,k] ) * ( partial H[i,r,c,t,k] / partial b_j )
# = sum_i sum_k sum_r sum_c sum_t (partial C / partial H[i,r,c,t,k] ) * delta(k = j)
# = sum_i sum_r sum_c sum_t (partial C / partial H[i,r,c,t,j] )
# partial C / partial W[j,k,l,m,z] = sum_i sum_n sum_p sum_q sum_r (partial C /partial H[i,p,q,r,n] ) * (partial H[i,p,q,r,n] / partial W[j,k,l,m,z])
# = partial C / partial W[j,k,l,m,z] = sum_i sum_n sum_p sum_q sum_r (partial C /partial H[i,p,q,r,n] ) *
# (partial sum_s sum_u sum_v sum_a W[n,a, s,u,v] V[i, dr*p+s,dc*q+u,dt*r+v, a] ) / partial W[j,k,l,m,z])
# = partial C / partial W[j,k,l,m,z] = sum_i sum_p sum_q sum_r (partial C /partial H[i,p,q,r,j] ) *
# (partial sum_s sum_u sum_v sum_a W[j,a, s,u,v] V[i,dr*p+s,dc*q+u,dt*r+v,a] ) / partial W[j,k,l,m,z])
# = partial C / partial W[j,k,l,m,z] = sum_i sum_p sum_q sum_r (partial C /partial H[i,p,q,r,j] ) * V[i,dr*p+k,dc*q+l,dt*r+m,z]
# derivatives wrt V unimplemented for now. derivatives wrt dr, dc, dt are undefined since
# the output function is only defined when dr, dc, dt are natural numbers.
class Conv3D(theano.Op):
    """
    3D `convolution` of multiple filters on a minibatch.

    Layout is channels-last:
    V (batch, row, column, time, in channel),
    W (out channel, row, column, time, in channel),
    output H (batch, row, column, time, out channel).

    Notes
    -----
    Does not flip the kernel, moves kernel with a user specified stride.
    """

    # Stateless Op: no parameters participate in equality/hashing.
    __props__ = ()

    def c_code_cache_version(self):
        # Include the BLAS header version so cached compiled code is
        # invalidated whenever the BLAS headers change.
        return (3, blas_header_version())

    def make_node(self, V, W, b, d):
        """
        Parameters
        ----------
        V
            Visible unit, input(batch,row,column,time,in channel)
        W
            Weights, filter(out channel,row,column,time,in channel)
        b
            bias, shape == (W.shape[0],)
        d
            strides when moving the filter over the input(dx,dy,dt)
        """
        V_ = T.as_tensor_variable(V)
        W_ = T.as_tensor_variable(W)
        b_ = T.as_tensor_variable(b)
        d_ = T.as_tensor_variable(d)
        # Output is broadcastable only on the batch axis (from V) and the
        # out-channel axis (from W); spatial/time axes never are.
        bcast = (V_.broadcastable[0], False, False, False, W_.broadcastable[0])
        node = theano.Apply(self, inputs=[V_, W_, b_, d_],
                            outputs=[T.TensorType(V_.dtype, bcast)()])
        return node

    def grad(self, inputs, output_gradients):
        # Given dCdH = dC/d(output), build symbolic gradients w.r.t.
        # V, W and b; the stride d has an undefined gradient.
        V, W, b, d = inputs
        dCdH, = output_gradients
        # TODO: make all of these ops support broadcasting of scalar b to
        # vector b and replace the zeros_like in all their grads
        # Make sure the broadcasting pattern of the gradient is the same
        # as the initial variable
        dCdV = theano.tensor.nnet.convTransp3D(
            W, T.zeros_like(V[0, 0, 0, 0, :]), d, dCdH, V.shape[1:4])
        dCdV = T.patternbroadcast(dCdV, V.broadcastable)
        WShape = W.shape
        dCdW = theano.tensor.nnet.convGrad3D(V, d, WShape, dCdH)
        dCdW = T.patternbroadcast(dCdW, W.broadcastable)
        # Bias gradient: sum dCdH over every axis except the out-channel one.
        dCdb = T.sum(dCdH, axis=(0, 1, 2, 3))
        dCdb = T.patternbroadcast(dCdb, b.broadcastable)
        dCdd = grad_undefined(
            self, 3, inputs[3],
            "The gradient of Conv3D with respect to the convolution"
            " stride is undefined because Conv3D is only defined for"
            " integer strides.")
        # Name the gradient variables after their sources to ease debugging
        # of the resulting graph.
        if 'name' in dir(dCdH) and dCdH.name is not None:
            dCdH_name = dCdH.name
        else:
            dCdH_name = 'anon_dCdH'
        if 'name' in dir(V) and V.name is not None:
            V_name = V.name
        else:
            V_name = 'anon_V'
        if 'name' in dir(W) and W.name is not None:
            W_name = W.name
        else:
            W_name = 'anon_W'
        if 'name' in dir(b) and b.name is not None:
            b_name = b.name
        else:
            b_name = 'anon_b'
        dCdV.name = 'Conv3D_dCdV(dCdH=' + dCdH_name + ',V=' + V_name + ')'
        dCdW.name = ('Conv3D_dCdW(dCdH=' + dCdH_name + ',V=' + V_name +
                     ',W=' + W_name + ')')
        dCdb.name = ('Conv3D_dCdb(dCdH=' + dCdH_name + ',V=' + V_name +
                     ',W=' + W_name + ',b=' + b_name + ')')
        return [dCdV, dCdW, dCdb, dCdd]

    def perform(self, node, inputs, output_storage):
        # Pure-Python fallback: delegate to the NumPy reference
        # implementation computeH() defined below.
        V, W, b, d = inputs
        output_storage[0][0] = computeH(V, W, b, d)

    def infer_shape(self, node, input_shapes):
        # NOTE: strides are read from the symbolic d input itself
        # (node.inputs[3]), not from its shape.
        V, W, b, d = node.inputs
        V_shape, W_shape, b_shape, d_shape = input_shapes
        dr = d[0]
        dc = d[1]
        dt = d[2]
        batch_size = V_shape[0]
        output_channels = W_shape[0]
        vidHeight = V_shape[1]
        filterHeight = W_shape[1]
        vidWidth = V_shape[2]
        filterWidth = W_shape[2]
        vidDur = V_shape[3]
        filterDur = W_shape[3]
        # Standard "valid" convolution output size for each spatial axis.
        output_height = ((vidHeight - filterHeight) // dr) + 1
        output_width = ((vidWidth - filterWidth) // dc) + 1
        output_dur = ((vidDur - filterDur) // dt) + 1
        rval = (batch_size, output_height, output_width, output_dur, output_channels)
        return [rval]

    def c_support_code(self):
        # BLAS prototypes needed by the gemv fast path below.
        return blas_header_text()

    def c_libraries(self):
        return ldflags()

    def c_compile_args(self):
        flags = ldflags(libs=False, flags=True)
        return flags

    def c_lib_dirs(self):
        return ldflags(libs=False, libs_dir=True)

    def c_header_dirs(self):
        return ldflags(libs=False, include_dir=True)

    def c_code(self, node, nodename, inputs, outputs, sub):
        # Generates the C implementation of the forward pass.  Two paths:
        # a BLAS gemv fast path (appended only when BLAS is available and
        # all dtypes match) used when there are many channels, and a fully
        # general nested-loop path.  The template is expanded by
        # strutil.render_string with this function's locals().
        V, W, b, d = inputs
        fail = sub['fail']
        H = outputs[0]
        codeSource = """
///////////// < code generated by Conv3D >
//printf("\t\t\t\tConv3D c code\\n");
//Check dimensionality of inputs
if (PyArray_NDIM(%(W)s) != 5)
{
PyErr_Format(PyExc_ValueError, "Conv3D: W must be a 5 dimensional tensor");
%(fail)s
}
if (PyArray_NDIM(%(V)s) != 5)
{
PyErr_Format(PyExc_ValueError, "Conv3D: V must be a 5 dimensional tensor");
%(fail)s
}
if (PyArray_NDIM(%(b)s) != 1)
{
PyErr_Format(PyExc_ValueError,"Conv3D: b must be a vector.");
%(fail)s
}
if (PyArray_NDIM(%(d)s) != 1)
{
PyErr_Format(PyExc_ValueError,"Conv3D: d must be a vector.");
%(fail)s
}
if (PyArray_DIMS(%(d)s)[0] != 3)
{
PyErr_Format(PyExc_ValueError,"Conv3D: 3 stride length arguments expected (row, col, time) but %%li were given", (long)PyArray_DIMS(%(d)s)[0]);
%(fail)s
}
//Read and check sizes of inputs
{ // exta scope so error handler jumps don't cause errors
const int batchSize = PyArray_DIMS(%(V)s)[0];
const int outputChannels = PyArray_DIMS(%(W)s)[0];
const int inputChannels = PyArray_DIMS(%(V)s)[4];
if (PyArray_DIMS(%(W)s)[4] != inputChannels)
{
PyErr_Format(PyExc_ValueError, "Conv3D: W operates on a %%ld channel image but the image has %%d channels. Overall shape of input: (%%ld,%%ld,%%ld,%%ld,%%ld)", (long)PyArray_DIMS(%(W)s)[4], inputChannels, (long)PyArray_DIMS(%(V)s)[0], (long)PyArray_DIMS(%(V)s)[1], (long)PyArray_DIMS(%(V)s)[2], (long)PyArray_DIMS(%(V)s)[3], (long)PyArray_DIMS(%(V)s)[4]);
%(fail)s
}
if (PyArray_DIMS(%(b)s)[0] != outputChannels)
{
PyErr_Format(PyExc_ValueError, "Conv3D: b adds to a(n) %%ld channel output image but the output has %%d channels", (long)PyArray_DIMS(%(b)s)[0], outputChannels);
%(fail)s
}
{ //extra scope so error handler jumps don't cause errors
const int filterHeight = PyArray_DIMS(%(W)s)[1];
const int filterWidth = PyArray_DIMS(%(W)s)[2];
const int filterDur = PyArray_DIMS(%(W)s)[3];
const int vidHeight = PyArray_DIMS(%(V)s)[1];
const int vidWidth = PyArray_DIMS(%(V)s)[2];
const int vidDur = PyArray_DIMS(%(V)s)[3];\
if (vidHeight < filterHeight)
{
PyErr_Format(PyExc_ValueError, "W has a height of %%i but V is only %%i pixels tall",filterHeight,vidHeight);
%(fail)s
}
{ // extra scope so fail works
if (vidWidth < filterWidth)
{
PyErr_Format(PyExc_ValueError, "W has a width of %%i but V is only %%i pixels wide",filterWidth,vidWidth);
%(fail)s
}
{ // extra scope so fail works
if (vidDur < filterDur)
{
PyErr_Format(PyExc_ValueError, "W has a duration of %%i but V is only %%i pixels long",filterDur,vidDur);
%(fail)s
}
{ // extra scope so fail works
//Read and check stride arguments
const int dr = *(dtype_%(d)s*) PyArray_GETPTR1(%(d)s,0);
const int dc = *(dtype_%(d)s*) PyArray_GETPTR1(%(d)s,1);
const int dt = *(dtype_%(d)s*) PyArray_GETPTR1(%(d)s,2);
if (dr <= 0 || dc <= 0 || dt <= 0)
{
PyErr_Format(PyExc_ValueError,"Conv3D: Strides must all be positive but are %%i, %%i, %%i",dr,dc,dt);
%(fail)s
}
{ // extra scope so fail works
//Make correctly sized output
const long long outputHeight = int( (vidHeight - filterHeight) / dr )+1;
const long long outputWidth = int( (vidWidth - filterWidth) / dc )+1;
const long long outputDur = int( (vidDur - filterDur) / dt ) +1;
npy_intp dims[5];
dims[0] = batchSize;
dims[4] = outputChannels;
dims[1] = outputHeight;
dims[2] = outputWidth;
dims[3] = outputDur;
if(!(%(H)s) || PyArray_DIMS(%(H)s)[0]!=dims[0] ||
PyArray_DIMS(%(H)s)[1]!=dims[1] ||
PyArray_DIMS(%(H)s)[2]!=dims[2] ||
PyArray_DIMS(%(H)s)[3]!=dims[3] ||
PyArray_DIMS(%(H)s)[4]!=dims[4]){
Py_XDECREF(%(H)s);
%(H)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, PyArray_DESCR(%(V)s)->type_num);
if (!(%(H)s)) {
PyErr_Format(PyExc_MemoryError,"Conv3D: Could not allocate output.");
%(fail)s
}
}
{ // extra scope so fail works
#define ELEM_AT(x, i) * ( dtype_ ## x *) ( PyArray_BYTES(x) + (i) )
const int ws0 = PyArray_STRIDES(%(W)s)[0];
const int ws1 = PyArray_STRIDES(%(W)s)[1];
const int ws2 = PyArray_STRIDES(%(W)s)[2];
const int vs1 = PyArray_STRIDES(%(V)s)[1];
const int ws4 = PyArray_STRIDES(%(W)s)[4];
const int vs4 = PyArray_STRIDES(%(V)s)[4];
const int ws3 = PyArray_STRIDES(%(W)s)[3];
const int vs3 = PyArray_STRIDES(%(V)s)[3];
const int vs2 = PyArray_STRIDES(%(V)s)[2];
const int bs = PyArray_STRIDES(%(b)s)[0];
const int hs4 = PyArray_STRIDES(%(H)s)[4];
// Compute H
//H[i,j,x,y,t] = b_j + sum_k sum_l sum_m sum_z W[j,z,k,l,m] V[i,z, dr*r+k,dc*c+l,dt*t+m]
//TODO: add special cases
// ex: filterDur == 1 && batchSize == 1 && dt = 1 (for SFA)
// ex: inputChannels == 1 """
        # If the data types are not mixed, we can insert special case
        # optimizations based on BLAS.  The gemv fast path is only
        # appended here, so %(gemv)s is defined exactly when it is used.
        VV, WV, bv, dv = node.inputs
        HV = node.outputs[0]
        if (theano.config.blas.ldflags and
                VV.dtype == WV.dtype and HV.dtype == VV.dtype):
            if VV.dtype == 'float64':
                gemv = 'dgemv_'
            elif VV.dtype == 'float32':
                gemv = 'sgemv_'
            else:
                raise Exception('Unrecognized dtype for convolution ' + V.value.dtype)
            codeSource += """
if (inputChannels > 20 && outputChannels > 20 && ws4 == sizeof(ELEM_AT(%(W)s,0)))
{
//std::cout << "lots of channels special case code" << std::endl;
#define blas_type dtype_ ## %(V)s
const blas_type constant_one = 1.0;
char N = 'T';
int ws0e = ws0 / sizeof(ELEM_AT(%(W)s,0));
int vs4e = vs4 / sizeof(ELEM_AT(%(V)s,4));
int hs4e = hs4 / sizeof(ELEM_AT(%(H)s,4));
//special case code for the "lots of channels" case
//uses a BLAS matrix vector multiply to compute the contribute for
//all channels of an input pixel to all channels of an output pixel
//simultaneously
long long Hpos = 0;
long long Vpos = 0;
for (int i = 0; i < batchSize; i++) {
long long Hposi = Hpos;
long long Vposi = Vpos;
for (int r = 0; r < outputHeight; r++) {
long long Hposr = Hpos;
long long Vposr = Vpos;
for (int c = 0; c < outputWidth; c++) {
long long Hposc = Hpos;
long long Vposc = Vpos;
for (int t = 0; t < outputDur; t++) {
long long Hpost = Hpos;
long long Vpost = Vpos;
//of the loops so far, j should be the innermost, because
//each loop through j visits the same elements of V
//this implies that the last index of H should be the j index
//since V and H should have the same format, this means
//z should be the last index in v, and therefore the innermost
//of the next set of for loops
int Wpos = 0;
int bPos = 0;
long long Hposj = Hpos;
for (int j = 0; j < outputChannels; j++) {
// H[i,r,c,t,j] = b[j]
ELEM_AT(%(H)s,Hposj) = ELEM_AT(%(b)s,bPos);
Hposj += hs4;
bPos += bs;
}
dtype_%(H)s * writePos = & ELEM_AT(%(H)s,Hpos);
for (int k =0; k < filterHeight; k++) {
int Wposk = Wpos;
long long Vposk = Vpos;
for (int l = 0; l < filterWidth; l++) {
int Wposl = Wpos;
long long Vposl = Vpos;
for (int m = 0; m < filterDur; m++) {
//H[i,r,c,t,:] += np.dot(W[:,k,l,m,:],V[i,dr*r+k,dc*c+l,dt*t+m,:])
//note: changing the weights so that outputChannels and inputChannels were the last two rather than
//the first and last elements did not speed this up, even for extremely large input sizes
%(gemv)s(&N, & inputChannels, & outputChannels,
&constant_one, & ELEM_AT( %(W)s , Wpos),& ws0e,
& ELEM_AT(%(V)s, Vpos),& vs4e, &constant_one,
writePos,& hs4e);
Wpos += ws3;
Vpos += vs3;
} // close m
Wpos = Wposl + ws2;
Vpos = Vposl + vs2;
} //close l
Wpos = Wposk + PyArray_STRIDES(%(W)s)[1];
Vpos = Vposk + PyArray_STRIDES(%(V)s)[1];
} //close k
Hpos = Hpost + PyArray_STRIDES(%(H)s)[3];
Vpos = Vpost + vs3 * dt;
} //close t
Hpos = Hposc + PyArray_STRIDES(%(H)s)[2];
Vpos = Vposc + vs2 * dc;
} //close c
Hpos = Hposr + PyArray_STRIDES(%(H)s)[1];
Vpos = Vposr + PyArray_STRIDES(%(V)s)[1] * dr;
} //closes r
Hpos = Hposi + PyArray_STRIDES(%(H)s)[0];
Vpos = Vposi + PyArray_STRIDES(%(V)s)[0];
} //closes i
} //closes "lots of channels" special case code
else
"""
        # The general case is always appended; when the BLAS fast path was
        # emitted above it becomes the else-branch of that if.
        codeSource += """
{
//General case code
//std::cout << "general case code" << std::endl;
long long Hpos = 0;
long long Vpos = 0;
for (int i = 0; i < batchSize; i++) {
long long Hposi = Hpos;
long long Vposi = Vpos;
for (int r = 0; r < outputHeight; r++) {
long long Hposr = Hpos;
long long Vposr = Vpos;
for (int c = 0; c < outputWidth; c++) {
long long Hposc = Hpos;
long long Vposc = Vpos;
for (int t = 0; t < outputDur; t++) {
long long Hpost = Hpos;
long long Vpost = Vpos;
//of the loops so far, j should be the innermost, because
//each loop through j visits the same elements of V
//this implies that the last index of H should be the j index
//since V and H should have the same format, this means
//z should be the last index in v, and therefore the innermost
//of the next set of for loops
int Wpos = 0;
int bPos = 0;
for (int j = 0; j < outputChannels; j++) {
long long Hposj = Hpos;
long long Vposj = Vpos;
int Wposj = Wpos;
// H[i,r,c,t,j] = b[j]
dtype_%(H)s & writePos = ELEM_AT(%(H)s,Hpos);
writePos = ELEM_AT(%(b)s,bPos);
for (int k =0; k < filterHeight; k++) {
int Wposk = Wpos;
long long Vposk = Vpos;
for (int l = 0; l < filterWidth; l++) {
int Wposl = Wpos;
long long Vposl = Vpos;
for (int m = 0; m < filterDur; m++) {
int Wposm = Wpos;
long long Vposm = Vpos;
for (int z = 0; z < inputChannels; z++) {
//H[i,r,c,t,j] += W[j,z,k,l,m] * V[i,dr*r+k, dc*c+l, dt*t+m,z]
writePos += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(V)s,Vpos);
Wpos += ws4;
Vpos += vs4;
} // close z
Wpos = Wposm + ws3;
Vpos = Vposm + vs3;
} // close m
Wpos = Wposl + ws2;
Vpos = Vposl + vs2;
} //close l
Wpos = Wposk + PyArray_STRIDES(%(W)s)[1];
Vpos = Vposk + PyArray_STRIDES(%(V)s)[1];
} //close k
bPos += bs;
Wpos = Wposj + ws0;
Hpos = Hposj + hs4;
Vpos = Vposj;
//std::cout << "incremented Wpos by " << ws0 << std::endl;
//std::cout << "incremented Hpos by " << hs4 << std::endl;
} //close j
Hpos = Hpost + PyArray_STRIDES(%(H)s)[3];
Vpos = Vpost + vs3 * dt;
} //close t
Hpos = Hposc + PyArray_STRIDES(%(H)s)[2];
Vpos = Vposc + vs2 * dc;
} //close c
Hpos = Hposr + PyArray_STRIDES(%(H)s)[1];
Vpos = Vposr + PyArray_STRIDES(%(V)s)[1] * dr;
} //closes r
Hpos = Hposi + PyArray_STRIDES(%(H)s)[0];
Vpos = Vposi + PyArray_STRIDES(%(V)s)[0];
} //closes i
} //closes general case code
}}}}}}} //extra scope so error handler jumps don't cross declarations
///////////// < /code generated by Conv3D >
"""
        return strutil.render_string(codeSource, locals())
# Module-level singleton; the Op is stateless (__props__ == ()), so one
# shared instance suffices for every call site.
_conv3D = Conv3D()
def conv3D(V, W, b, d):
    """
    Apply a 3D "convolution" of multiple filters to a minibatch.

    The kernel is not flipped, and it is moved over the input with a
    user-specified stride.

    Parameters
    ----------
    V
        Input (visible) tensor with dimensions
        (batch, row, column, time, in channel).
    W
        Filter (weight) tensor with dimensions
        (out channel, row, column, time, in channel).
    b
        Bias vector; shape == (W.shape[0],).
    d
        Strides (dx, dy, dt) used when moving the filter over the input.

    Notes
    -----
    The dimension ordering differs from the one used by `conv2d`; this
    layout was chosen for optimization reasons.
    Please use nnet.conv3d instead of this for a faster GPU implementation.

    See Also
    --------
    Someone made a script that shows how to swap the axes
    between both 3d convolution implementations in Theano. See
    the last `attachment <https://groups.google.com/d/msg/theano-users/1S9_bZgHxVw/0cQR9a4riFUJ>`_

    """
    # Delegate to the shared Conv3D Op instance.
    return _conv3D(V, W, b, d)
def computeH(V, W, b, d):
    """
    Pure-NumPy reference implementation of the Conv3D forward pass.

    Parameters
    ----------
    V : ndarray
        Input videos, shape (batch, row, column, time, in channel).
    W : ndarray
        Filters, shape (out channel, row, column, time, in channel).
    b : ndarray
        Bias vector, shape (W.shape[0],).
    d : sequence of 3 ints
        Positive strides (dx, dy, dt) of the filter over the input.

    Returns
    -------
    ndarray
        H with shape (batch, out row, out col, out time, out channel),
        where H[i, x, y, t, j] = b[j] +
        sum_{k,l,m,z} W[j, k, l, m, z] * V[i, dx*x+k, dy*y+l, dt*t+m, z]
        (the kernel is not flipped).
    """
    assert W.ndim == 5
    assert V.ndim == 5
    if b.ndim != 1:
        print(b.shape)
        assert False
    assert len(d) == 3

    n_batch = V.shape[0]
    n_out_chan = W.shape[0]
    inputChannels = V.shape[4]
    if W.shape[4] != inputChannels:
        raise Exception("W.shape[4] = " + str(W.shape[4]) + " but inputChannels = " + str(inputChannels))

    filt_h, filt_w, filt_d = W.shape[1], W.shape[2], W.shape[3]
    vid_h, vid_w, vid_d = V.shape[1], V.shape[2], V.shape[3]
    assert vid_h >= filt_h
    assert vid_w >= filt_w
    assert vid_d >= filt_d

    dx, dy, dt = d
    assert dx > 0
    assert dy > 0
    assert dt > 0

    # "valid" output extents for each spatial/time axis
    out_h = (vid_h - filt_h) // dx + 1
    out_w = (vid_w - filt_w) // dy + 1
    out_d = (vid_d - filt_d) // dt + 1

    H = np.zeros((n_batch, out_h, out_w, out_d, n_out_chan), dtype=V.dtype)
    # Accumulate directly into H so rounding matches H's dtype.
    for i in range(n_batch):
        for j in range(n_out_chan):
            for x in range(out_h):
                for y in range(out_w):
                    for t in range(out_d):
                        H[i, x, y, t, j] = b[j]
                        for k in range(filt_h):
                            for l in range(filt_w):
                                for m in range(filt_d):
                                    for z in range(inputChannels):
                                        w = W[j, k, l, m, z]
                                        v = V[i, d[0] * x + k, d[1] * y + l, d[2] * t + m, z]
                                        H[i, x, y, t, j] += w * v
    return H
from __future__ import absolute_import, print_function, division
from six.moves import xrange
import numpy as np
import theano
from theano.tensor import basic as T
from theano.misc import strutil
from theano.gradient import grad_undefined
from theano.gradient import DisconnectedType
# TODO: speed up by reordering loops. Should pass through the videos once, incrementing all weight gradients, rather
# than visiting each weight gradient element once and passing through whole video
class ConvGrad3D(theano.Op):
    """
    Gradient of Conv3D with respect to W.
    """

    # Stateless Op.
    __props__ = ()

    def c_code_cache_version(self):
        return (1,)

    def make_node(self, V, d, WShape, dCdH):
        # V: input videos; d: strides; WShape: symbolic shape of the
        # weight gradient to produce; dCdH: gradient of the cost w.r.t.
        # Conv3D's output.
        V_ = T.as_tensor_variable(V)
        d_ = T.as_tensor_variable(d)
        WShape_ = T.as_tensor_variable(WShape)
        dCdH_ = T.as_tensor_variable(dCdH)
        return theano.Apply(self,
                            inputs=[V_, d_, WShape_, dCdH_],
                            outputs=[T.TensorType(
                                V_.dtype,
                                (False, False, False, False, False))()])

    def infer_shape(self, node, input_shapes):
        # The output shape is exactly the shape requested via the
        # symbolic WShape input.
        V, d, W_shape, dCdH = node.inputs
        return [(W_shape[0], W_shape[1], W_shape[2], W_shape[3], W_shape[4])]

    def connection_pattern(self, node):
        # WShape (input 2) only determines the output's shape, so it is
        # disconnected from the output's values.
        return [[True], [True], [False], [True]]

    def grad(self, inputs, output_gradients):
        C, d, WShape, B = inputs
        dLdA, = output_gradients
        z = T.zeros_like(C[0, 0, 0, 0, :])
        dLdC = theano.tensor.nnet.convTransp3D(dLdA, z, d, B, C.shape[1:4])
        # d actually does affect the outputs, so it's not disconnected
        dLdd = grad_undefined(self, 1, d)
        # The shape of the weights doesn't affect the output elements
        dLdWShape = DisconnectedType()()
        dLdB = theano.tensor.nnet.conv3D(C, dLdA, T.zeros_like(B[0, 0, 0, 0, :]), d)
        return [dLdC, dLdd, dLdWShape, dLdB]

    def perform(self, node, inputs, output_storage):
        # Pure-Python fallback computing
        # dCdW[j,k,l,m,z] =
        #     sum_i sum_p sum_q sum_r dCdH[i,p,q,r,j] * V[i,dr*p+k,dc*q+l,dt*r+m,z]
        V, d, WShape, dCdH = inputs
        batchSize = dCdH.shape[0]
        outputHeight = dCdH.shape[1]
        outputWidth = dCdH.shape[2]
        outputDur = dCdH.shape[3]
        assert V.shape[0] == batchSize
        dr, dc, dt = d
        dCdW = np.zeros(WShape, dtype=V.dtype)
        for k in xrange(0, WShape[1]):
            for l in xrange(0, WShape[2]):
                for m in xrange(0, WShape[3]):
                    for i in xrange(0, batchSize):
                        for p in xrange(0, outputHeight):
                            for q in xrange(0, outputWidth):
                                for r in xrange(0, outputDur):
                                    for j in xrange(0, WShape[0]):
                                        for z in xrange(0, WShape[4]):
                                            dCdW[j, k, l, m, z] += (
                                                dCdH[i, p, q, r, j] *
                                                V[i, dr * p + k, dc * q + l,
                                                  dt * r + m, z])
        output_storage[0][0] = dCdW

    def c_code(self, node, nodename, inputs, outputs, sub):
        # Generates the C implementation of the weight-gradient
        # computation; the template is expanded by strutil.render_string
        # with this function's locals().
        V, d, WShape, dCdH = inputs
        fail = sub['fail']
        dCdW = outputs[0]
        codeSource = """
///////////// < code generated by ConvGradW3D >
//printf("\t\t\t\tConvGradW3D c code\\n");
//Check dimensionality of inputs
if (PyArray_NDIM(%(dCdH)s) != 5)
{
PyErr_Format(PyExc_ValueError, "ConvGrad3D: dCdH must be a 5 dimensional tensor");
%(fail)s
}
if (PyArray_NDIM(%(V)s) != 5)
{
PyErr_Format(PyExc_ValueError, "ConvGrad3D: V must be a 5 dimensional tensor");
%(fail)s
}
if (PyArray_NDIM(%(WShape)s) != 1)
{
PyErr_Format(PyExc_ValueError,"ConvGrad3D: WShape must be a vector.");
%(fail)s
}
if (PyArray_NDIM(%(d)s) != 1)
{
PyErr_Format(PyExc_ValueError,"ConvGrad3D: d must be a vector.");
%(fail)s
}
if (PyArray_DIMS(%(d)s)[0] != 3)
{
PyErr_Format(PyExc_ValueError,"ConvGrad3D: 3 stride length arguments expected (row, col, time) but %%li were given", (long)PyArray_DIMS(%(d)s)[0]);
%(fail)s
}
{ //extra scope so that fail will not jump over declarations
//Read and check sizes of inputs
const int batchSize = PyArray_DIMS(%(V)s)[0];
if (PyArray_DIMS(%(WShape)s)[0] != 5)
{
PyErr_Format(PyExc_ValueError,"ConvGrad3D: WShape must specify a 5D shape");
%(fail)s
}
if (!PyArray_ISCONTIGUOUS(%(WShape)s))
{
PyErr_Format(PyExc_ValueError,"ConvGrad3D: WShape must be contiguous");
%(fail)s
}
{ //extra scope so that fail will not jump over declarations
dtype_%(WShape)s * WShape = (dtype_%(WShape)s *) PyArray_DATA(%(WShape)s);
const int outputChannels = WShape[0];
const int inputChannels = PyArray_DIMS(%(V)s)[4];
if (WShape[4] != inputChannels)
{
PyErr_Format(PyExc_ValueError, "ConvGrad3D: W operates on a %%i channel image but the image has %%i channels",(int) WShape[1],inputChannels);
%(fail)s
}
{ //extra scope so fail works
const int filterHeight = WShape[1];
const int filterWidth = WShape[2];
const int filterDur = WShape[3];
const int vidHeight = PyArray_DIMS(%(V)s)[1];
const int vidWidth = PyArray_DIMS(%(V)s)[2];
const int vidDur = PyArray_DIMS(%(V)s)[3];
if (vidHeight < filterHeight)
{
PyErr_Format(PyExc_ValueError, "ConvGrad3D: W has a height of %%i but V is only %%i pixels tall", filterHeight, vidHeight);
%(fail)s
}
if (vidWidth < filterWidth)
{
PyErr_Format(PyExc_ValueError,"ConvGrad3D: W has a width of %%i but V is only %%i pixels tall",filterWidth,vidWidth);
%(fail)s
}
if (vidDur < filterDur)
{
PyErr_Format(PyExc_ValueError,"ConvGrad3D: W has a duration of %%i but V is only %%i pixels long",filterDur,vidDur);
%(fail)s
}
{ // extra scope so fail works
//Read and check stride arguments
const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
if (dr <= 0 || dc <= 0 || dt <= 0)
{
PyErr_Format(PyExc_ValueError,"ConvGrad3D: Strides should all be positive but they are %%i, %%i, %%i",dr,dc,dt);
%(fail)s
}
{ // extra scope so fail works
//Compute correct sized of output
const int outputHeight = int( (vidHeight - filterHeight) / dr )+1;
const int outputWidth = int( (vidWidth - filterWidth) / dc )+1;
const int outputDur = int( (vidDur - filterDur) / dt ) +1;
if (PyArray_DIMS(%(dCdH)s)[0] != batchSize ||
PyArray_DIMS(%(dCdH)s)[4] != outputChannels ||
PyArray_DIMS(%(dCdH)s)[1] != outputHeight ||
PyArray_DIMS(%(dCdH)s)[2] != outputWidth ||
PyArray_DIMS(%(dCdH)s)[3] != outputDur)
{
PyErr_Format(PyExc_ValueError, "dCdH is the wrong size, expected (%%i,%%i,%%i,%%i,%%i), got (%%li,%%li,%%li,%%li,%%li)", batchSize, outputHeight, outputWidth, outputDur, outputChannels, (long)PyArray_DIMS(%(dCdH)s)[0], (long)PyArray_DIMS(%(dCdH)s)[1], (long)PyArray_DIMS(%(dCdH)s)[2], (long)PyArray_DIMS(%(dCdH)s)[3], (long)PyArray_DIMS(%(dCdH)s)[4]);
%(fail)s
}
{ // extra scope for fail
npy_intp dims[5];
dims[0] = outputChannels;
dims[4] = inputChannels;
dims[1] = filterHeight;
dims[2] = filterWidth;
dims[3] = filterDur;
if(!(%(dCdW)s) || PyArray_DIMS(%(dCdW)s)[0]!=dims[0] ||
PyArray_DIMS(%(dCdW)s)[1]!=dims[1] ||
PyArray_DIMS(%(dCdW)s)[2]!=dims[2] ||
PyArray_DIMS(%(dCdW)s)[3]!=dims[3] ||
PyArray_DIMS(%(dCdW)s)[4]!=dims[4] ){
Py_XDECREF(%(dCdW)s);
%(dCdW)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, PyArray_DESCR(%(V)s)->type_num);
if (!(%(dCdW)s)) {
PyErr_Format(PyExc_MemoryError,"ConvGrad3D: Could not allocate dCdW");
%(fail)s
}
}
{ //extra scope so fail works
#define ELEM5(x, i,j,k,l,m) * ( dtype_ ## x *) ( PyArray_BYTES(x) + (i)*PyArray_STRIDES(x)[0]+(j)*PyArray_STRIDES(x)[1]+(k)*PyArray_STRIDES(x)[2]+(l)*PyArray_STRIDES(x)[3]+(m)*PyArray_STRIDES(x)[4] )
#define ELEM_AT(x, i) * ( dtype_ ## x *) ( PyArray_BYTES(x) + (i) )
const int dhs3 = PyArray_STRIDES(%(dCdH)s)[3];
const int dtvs3 = dt * PyArray_STRIDES(%(V)s)[3];
// Compute dCdW
//TODO-- see if this can be made faster by using ELEM_AT instead of ELEM5
// dCdW[j,k,l,m,z] = sum_i sum_p sum_q sum_r dCdH[i,p,q,r,j] * V[i,dr*p+k,dc*q+l,dt*r+m,z]
for (int j = 0; j < outputChannels; j++) {
for (int z = 0; z < inputChannels; z++) {
for (int k = 0; k < filterHeight; k++) {
for (int l = 0; l < filterWidth; l++) {
for (int m = 0; m < filterDur; m++) {
//printf("writePos %%i %%i %%i %%i %%i \\n",j,k,l,m,z);
dtype_%(dCdW)s & writePos = ELEM5(%(dCdW)s, j,k,l,m,z);
writePos = 0;
for (int i = 0; i < batchSize; i++) {
for (int p = 0; p < outputHeight; p++) {
for (int q = 0; q < outputWidth; q++) {
int Hpos = i * PyArray_STRIDES(%(dCdH)s)[0] + j * PyArray_STRIDES(%(dCdH)s)[4] + p * PyArray_STRIDES(%(dCdH)s)[1] + q * PyArray_STRIDES(%(dCdH)s)[2] ;
int Vpos = i * PyArray_STRIDES(%(V)s)[0] + z * PyArray_STRIDES(%(V)s)[4] + (dr * p+k) * PyArray_STRIDES(%(V)s)[1] + (dc*q+l) * PyArray_STRIDES(%(V)s)[2] + m * PyArray_STRIDES(%(V)s)[3];
for (int r = 0; r < outputDur; r++) {
writePos += ELEM5(%(dCdH)s,i,p,q,r,j) * ELEM5(%(V)s,i,dr*p+k,dc*q+l,dt*r+m,z);
//writePos += ELEM_AT(%(dCdH)s,Hpos) * ELEM_AT(%(V)s,Vpos);
Hpos += dhs3;
Vpos += dtvs3;
}
}
}
}
}
}
}
}
}
}}}}}}} // extra scope for fail
///////////// < /code generated by ConvGradW3D >
"""
        return strutil.render_string(codeSource, locals())
# Module-level singleton; the Op is stateless (__props__ == ()), so one
# shared instance suffices for every call site.
convGrad3D = ConvGrad3D()
from __future__ import absolute_import, print_function, division
import numpy as np
from six.moves import xrange
import theano
from theano.tensor import basic as T
from theano.misc import strutil
from theano.gradient import grad_undefined
from theano.gradient import DisconnectedType
class ConvTransp3D(theano.Op):
"""
"Transpose" of Conv3D (Conv3D implements multiplication by an implicitly
defined matrix W. This implements multiplication by its transpose).
"""
__props__ = ()
def c_code_cache_version(self):
return (3,)
def make_node(self, W, b, d, H, RShape=None):
"""
Parameters
----------
W
Weights, filter
b
Bias, shape == (W.shape[0],).
d
Strides when moving the filter over the input.
H
The output of Conv3D.
"""
W_ = T.as_tensor_variable(W)
b_ = T.as_tensor_variable(b)
d_ = T.as_tensor_variable(d)
H_ = T.as_tensor_variable(H)
if RShape:
RShape_ = T.as_tensor_variable(RShape)
else:
RShape_ = T.as_tensor_variable([-1, -1, -1])
return theano.Apply(self,
inputs=[W_, b_, d_, H_, RShape_],
outputs=[T.TensorType(H_.dtype,
(False, False, False, False, False))()])
def infer_shape(self, node, input_shapes):
W, b, d, H, RShape = node.inputs
W_shape, b_shape, d_shape, H_shape, RShape_shape = input_shapes
return [(H_shape[0], RShape[0], RShape[1], RShape[2], W_shape[4])]
def connection_pattern(self, node):
return [[True], [True], [True], [True], [False]]
def grad(self, inputs, output_gradients):
W, b, d, H, RShape = inputs
dCdR, = output_gradients
dCdH = theano.tensor.nnet.conv3D(dCdR, W, T.zeros_like(H[0, 0, 0, 0, :]), d)
WShape = W.shape
dCdW = theano.tensor.nnet.convGrad3D(dCdR, d, WShape, H)
dCdb = T.sum(dCdR, axis=(0, 1, 2, 3))
# not differentiable, since d affects the output elements
dCdd = grad_undefined(self, 2, d)
# disconnected, since RShape just determines the output shape
dCdRShape = DisconnectedType()()
if 'name' in dir(dCdR) and dCdR.name is not None:
dCdR_name = dCdR.name
else:
dCdR_name = 'anon_dCdR'
if 'name' in dir(H) and H.name is not None:
H_name = H.name
else:
H_name = 'anon_H'
if 'name' in dir(W) and W.name is not None:
W_name = W.name
else:
W_name = 'anon_W'
if 'name' in dir(b) and b.name is not None:
b_name = b.name
else:
b_name = 'anon_b'
dCdW.name = ('ConvTransp3D_dCdW.H=' + H_name + ',dCdR=' + dCdR_name +
',W=' + W_name)
dCdb.name = ('ConvTransp3D_dCdb.H=' + H_name + ',dCdR=' + dCdR_name +
',W=' + W_name + ',b=' + b_name)
dCdH.name = 'ConvTransp3D_dCdH.H=' + H_name + ',dCdR=' + dCdR_name
return [dCdW, dCdb, dCdd, dCdH, dCdRShape]
def perform(self, node, inputs, output_storage):
    """Python fallback: compute the reconstruction with computeR."""
    W, b, d, H, RShape = inputs
    result = computeR(W, b, d, H, RShape)
    output_storage[0][0] = result
def c_code(self, node, nodename, inputs, outputs, sub):
    """Return C source implementing the transposed 3D convolution.

    Parameters
    ----------
    node
        The Apply node being compiled (unused directly; inputs/outputs
        carry the C variable names).
    nodename
        Unique name of the node (unused here).
    inputs
        C variable names for (W, b, d, H, RShape).
    outputs
        One-element list with the C variable name of the output R.
    sub
        Substitution dict; only sub['fail'] (error-return snippet) is used.

    Returns
    -------
    str
        The C code, with ``%(...)s`` placeholders filled in from the
        local variables via strutil.render_string.
    """
    W, b, d, H, RShape = inputs
    fail = sub['fail']
    R = outputs[0]
    # The "// for fail N" braces open nested C scopes so that variables
    # declared after each validation step are only in scope once the
    # corresponding check has passed.
    codeSource = """
///////////// < code generated by ConvTransp3D >
//printf("\t\t\t\tConvTransp3D c code\\n");
//Check dimensionality of inputs
if (PyArray_NDIM(%(H)s) != 5)
{
PyErr_Format(PyExc_ValueError,
"H must be a 5-D tensor but it is %%i-D",
PyArray_NDIM(%(H)s));
%(fail)s
}
if (PyArray_NDIM(%(W)s) != 5)
{
PyErr_Format(PyExc_ValueError, "ConvTransp3D: W must be a 5-D tensor");
%(fail)s
}
if (PyArray_NDIM(%(b)s) != 1)
{
PyErr_Format(PyExc_ValueError, "ConvTransp3D: b must be a vector");
%(fail)s
}
if (PyArray_NDIM(%(d)s) != 1)
{
PyErr_Format(PyExc_ValueError, "ConvTransp3D: d must be a vector");
%(fail)s
}
//Read and check stride arguments
if (PyArray_DIMS(%(d)s)[0] != 3)
{
PyErr_Format(PyExc_ValueError, "ConvTransp3D: 3 stride length arguments expected (for row, col, and time) but %%li were given", (long)PyArray_DIMS(%(d)s)[0] );
%(fail)s
}
{ // for fail 1
int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
if (dr <= 0 || dc <= 0 || dt <= 0)
{
PyErr_Format(PyExc_ValueError, "ConvTransp3D: Strides must all be positive but are %%i, %%i, %%i",dr,dc,dt);
%(fail)s
}
//Read and check sizes of inputs
{ // for fail 2
const int batchSize = PyArray_DIMS(%(H)s)[0];
const int outputChannels = PyArray_DIMS(%(W)s)[0];
if (PyArray_DIMS(%(H)s)[4] != outputChannels)
{
PyErr_Format(PyExc_ValueError, "W produces a %%i channel image but the image has %%li channels. W.shape: (%%li, %%li, %%li, %%li, %%li) H.shape: (%%li, %%li, %%li, %%li, %%li)", outputChannels, (long)PyArray_DIMS(%(H)s)[4], (long)PyArray_DIMS(%(W)s)[0], (long)PyArray_DIMS(%(W)s)[1], (long)PyArray_DIMS(%(W)s)[2], (long)PyArray_DIMS(%(W)s)[3], (long)PyArray_DIMS(%(W)s)[4], (long)PyArray_DIMS(%(H)s)[0], (long)PyArray_DIMS(%(H)s)[1], (long)PyArray_DIMS(%(H)s)[2], (long)PyArray_DIMS(%(H)s)[3], (long)PyArray_DIMS(%(H)s)[4]);
%(fail)s
}
{ // for fail 3
const int inputChannels = PyArray_DIMS(%(W)s)[4];
if (PyArray_DIMS(%(b)s)[0] != inputChannels)
{
PyErr_Format(PyExc_ValueError, "ConvTransp3D: b operates on a %%li channel image but the image has %%i channels", (long)PyArray_DIMS(%(b)s)[0], inputChannels );
%(fail)s
}
{ // for fail 4
const int filterHeight = PyArray_DIMS(%(W)s)[1];
const int filterWidth = PyArray_DIMS(%(W)s)[2];
const int filterDur = PyArray_DIMS(%(W)s)[3];
const int outputHeight = PyArray_DIMS(%(H)s)[1];
const int outputWidth = PyArray_DIMS(%(H)s)[2];
const int outputDur = PyArray_DIMS(%(H)s)[3];
int videoHeight = (outputHeight-1) * dr + filterHeight;
int videoWidth = (outputWidth-1) * dc + filterWidth;
int videoDur = (outputDur-1) * dt + filterDur;
if (%(RShape)s)
{
if (PyArray_NDIM(%(RShape)s) != 1)
{
PyErr_Format(PyExc_ValueError, "ConvTransp3D: RShape must be a vector");
%(fail)s
}
if (PyArray_DIMS(%(RShape)s)[0] != 3)
{
PyErr_Format(PyExc_ValueError, "RShape must specify a 3D shape ( [height,width,duration] )");
%(fail)s
}
dtype_%(RShape)s RShape0 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,0);
dtype_%(RShape)s RShape1 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,1);
dtype_%(RShape)s RShape2 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,2);
if (RShape0 != -1)
{
if (RShape0 < videoHeight || RShape1 < videoWidth || RShape2 < videoDur)
{
PyErr_Format(PyExc_ValueError, "Reconstruction must have physical shape of at least [%%i,%%i,%%i] but RShape argument requests that it be [%%i,%%i,%%i]\\n",videoHeight,videoWidth,videoDur,(int) RShape0,(int) RShape1,(int) RShape2);
%(fail)s
}
videoHeight = RShape0;
videoWidth = RShape1;
videoDur = RShape2;
}
} //closes if RShape
{ // for fail 5
//Allocate the reconstruction
npy_intp dims[5];
dims[0] = batchSize;
dims[4] = inputChannels;
dims[1] = videoHeight;
dims[2] = videoWidth;
dims[3] = videoDur;
if(!(%(R)s) || PyArray_DIMS(%(R)s)[0]!=dims[0] ||
PyArray_DIMS(%(R)s)[1]!=dims[1] ||
PyArray_DIMS(%(R)s)[2]!=dims[2] ||
PyArray_DIMS(%(R)s)[3]!=dims[3] ||
PyArray_DIMS(%(R)s)[4]!=dims[4])
{
Py_XDECREF(%(R)s);
%(R)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, PyArray_DESCR(%(H)s)->type_num);
if (!(%(R)s)) {
PyErr_Format(PyExc_MemoryError, "ConvTransp3D: could not allocate R");
%(fail)s
}
}
{ // for fail 6
#define ELEM5(x, i,j,k,l,m) * ( dtype_ ## x *) ( PyArray_BYTES(x) + (i)*PyArray_STRIDES(x)[0]+(j)*PyArray_STRIDES(x)[1]+(k)*PyArray_STRIDES(x)[2]+(l)*PyArray_STRIDES(x)[3]+(m)*PyArray_STRIDES(x)[4] )
#define ELEM_AT(x, i) * ( dtype_ ## x *) ( PyArray_BYTES(x) + (i) )
dtype_%(b)s * b = (dtype_%(b)s *) PyArray_DATA(%(b)s);
int rs4 = PyArray_STRIDES(%(R)s)[4];
int ws0 = PyArray_STRIDES(%(W)s)[0];
int ws4 = PyArray_STRIDES(%(W)s)[4];
int hs4 = PyArray_STRIDES(%(H)s)[4];
// Compute R
// R[i,r,c,t,j] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, rk, ck, tk,j] * H[i,rc,cc,tc,k]
for (int i = 0; i < batchSize; i++) {
for (int r = 0; r < videoHeight; r++) {
const int frc = (int)std::max(0.0f, ceilf(float(r-filterHeight+1)/float(dr)));
for (int c = 0; c < videoWidth; c++) {
const int fcc = (int)std::max(0.0f, ceilf(float(c-filterWidth +1)/float(dc)));
for (int t = 0; t < videoDur; t++) {
const int ftc = (int)std::max(0.0f, ceilf(float(t-filterDur +1) /float(dt)));
long long Rpost = i * PyArray_STRIDES(%(R)s)[0] + r * PyArray_STRIDES(%(R)s)[1] + c * PyArray_STRIDES(%(R)s)[2] + t * PyArray_STRIDES(%(R)s)[3];
long long Rpos = Rpost;
for (int j = 0; j < inputChannels; j++)
{
//ELEM5(%(R)s, i,r,c,t,j) = b[j];
ELEM_AT(%(R)s,Rpos) = b[j];
Rpos += rs4;
}
for (int rc = frc; rc < outputHeight; rc++) {
const int rk = r - rc * dr;
if (rk < 0) break;
for (int cc = fcc; cc < outputWidth; cc++) {
const int ck = c - cc * dc;
if (ck < 0) break;
for (int tc = ftc; tc < outputDur; tc++)
{
const int tk = t - tc * dt;
if (tk < 0) break;
int Wpos = rk * PyArray_STRIDES(%(W)s)[1] + ck * PyArray_STRIDES(%(W)s)[2] + tk * PyArray_STRIDES(%(W)s)[3];
int Hpostc = i * PyArray_STRIDES(%(H)s)[0] + rc * PyArray_STRIDES(%(H)s)[1] + cc * PyArray_STRIDES(%(H)s)[2] + tc * PyArray_STRIDES(%(H)s)[3];
Rpos = Rpost;
for (int j = 0; j < inputChannels; j++)
{
int Wposj = Wpos;
dtype_%(R)s & writePos = ELEM_AT(%(R)s,Rpos);
int Hpos = Hpostc;
for (int k = 0; k < outputChannels; k++) {
//TODO-- it's probably bad in terms of cache that our inner loop is over the largest stride of W.... maybe OK since it's the smallest stride of H
//writePos += ELEM5(%(W)s,k,rk,ck,tk,j) * ELEM5(%(H)s,i,rc,cc,tc,k);
//writePos += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(H)s,Hpos);
writePos += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(H)s,Hpos);
Wpos += ws0;
Hpos += hs4;
} //close the k loop
Rpos += rs4;
Wpos = Wposj + ws4;
} //close the j loop
} // close the tc loop
} //cc
} //rc
} //t
} //c
} //r
} //i
} //for fail 6
} //for fail 5
} //for fail 4
} //for fail 3
} //for fail 2
} // for fail 1
///////////// < /code generated by ConvTransp3D >
"""
    # Substitute %(W)s, %(fail)s, etc. from the locals defined above.
    return strutil.render_string(codeSource, locals())
# Module-level singleton instance, following Theano's convention of
# exposing a lower-case callable for each Op class.
convTransp3D = ConvTransp3D()
# If the input size wasn't a multiple of D we may need to cause some automatic padding to get the right size of reconstruction
def computeR(W, b, d, H, Rshape=None):
    """Pure-numpy reference implementation of the transposed 3D convolution.

    Parameters
    ----------
    W : ndarray, shape (outputChannels, fH, fW, fD, inputChannels)
        Filters.
    b : ndarray, shape (inputChannels,)
        Bias added to every output position.
    d : sequence of 3 ints
        Strides (dr, dc, dt); all must be positive.
    H : ndarray, shape (batch, oH, oW, oD, outputChannels)
        The output of the forward convolution.
    Rshape : sequence of 3 ints, optional
        Requested reconstruction shape (height, width, duration).  A
        leading -1 (or None) means "infer from H, W and d"; otherwise
        each entry must be at least the inferred size (extra positions
        receive only the bias).

    Returns
    -------
    ndarray, shape (batch, videoHeight, videoWidth, videoDur, inputChannels)
        R[i, r, c, t, j] = b[j] + sum over all (rc, rk) with
        d*rc + rk == r (and likewise for columns/time) of
        W[:, rk, ck, tk, j] . H[i, rc, cc, tc, :].
    """
    assert len(W.shape) == 5
    assert len(H.shape) == 5
    assert len(b.shape) == 1
    assert len(d) == 3

    outputChannels, filterHeight, filterWidth, filterDur, \
        inputChannels = W.shape
    batchSize, outputHeight, outputWidth, outputDur, \
        outputChannelsAgain = H.shape
    assert outputChannelsAgain == outputChannels
    assert b.shape[0] == inputChannels

    dr, dc, dt = d
    assert dr > 0
    assert dc > 0
    assert dt > 0

    # Minimal reconstruction size implied by the forward convolution.
    videoHeight = (outputHeight - 1) * dr + filterHeight
    videoWidth = (outputWidth - 1) * dc + filterWidth
    videoDur = (outputDur - 1) * dt + filterDur

    if Rshape is not None and Rshape[0] != -1:
        if Rshape[0] < videoHeight:
            print((Rshape[0], videoHeight))
            assert False
        assert Rshape[1] >= videoWidth
        assert Rshape[2] >= videoDur
        videoHeight, videoWidth, videoDur = Rshape

    R = np.zeros((batchSize, videoHeight,
                  videoWidth, videoDur, inputChannels), dtype=H.dtype)

    # For each output position, only filter offsets rk = r - rc*dr >= 0
    # with rk < filterHeight contribute; the smallest contributing rc is
    # ceil((r - filterHeight + 1) / dr) clipped to 0 (same for cols/time).
    for i in range(batchSize):
        for j in range(inputChannels):
            for r in range(videoHeight):
                frc = max(0, int(np.ceil(
                    float(r - filterHeight + 1) / float(dr))))
                for c in range(videoWidth):
                    fcc = max(0, int(np.ceil(
                        float(c - filterWidth + 1) / float(dc))))
                    for t in range(videoDur):
                        R[i, r, c, t, j] = b[j]
                        ftc = max(0, int(np.ceil(
                            float(t - filterDur + 1) / float(dt))))
                        for rc in range(frc, outputHeight):
                            rk = r - rc * dr
                            if rk < 0:
                                break
                            for cc in range(fcc, outputWidth):
                                ck = c - cc * dc
                                if ck < 0:
                                    break
                                for tc in range(ftc, outputDur):
                                    tk = t - tc * dt
                                    if tk < 0:
                                        break
                                    R[i, r, c, t, j] += np.dot(
                                        W[:, rk, ck, tk, j],
                                        H[i, rc, cc, tc, :])
    return R
...@@ -22,9 +22,6 @@ from .nnet import ( ...@@ -22,9 +22,6 @@ from .nnet import (
confusion_matrix, softsign) confusion_matrix, softsign)
from . import opt from . import opt
from .conv import ConvOp from .conv import ConvOp
from .Conv3D import *
from .ConvGrad3D import *
from .ConvTransp3D import *
from .sigm import (softplus, sigmoid, sigmoid_inplace, from .sigm import (softplus, sigmoid, sigmoid_inplace,
scalar_sigmoid, ultra_fast_sigmoid, scalar_sigmoid, ultra_fast_sigmoid,
hard_sigmoid) hard_sigmoid)
......
...@@ -854,35 +854,9 @@ class ConvOp(OpenMPOp): ...@@ -854,35 +854,9 @@ class ConvOp(OpenMPOp):
raise NotImplementedError('todo') raise NotImplementedError('todo')
if self.out_mode == 'valid' and (self.dx, self.dy) != (1, 1): if self.out_mode == 'valid' and (self.dx, self.dy) != (1, 1):
# Use the gradient as defined in conv3D, because the implementation raise NotImplementedError(
# by Conv is slow (about 3x slower than conv3D, and probably 10x "ERROR: ConvOp.grad is now disabled for 'valid' convolutions with"
# slower than it could be), and incorrect when dx or dy > 2. " stride != (1, 1); call theano.tensor.nnet.conv2d() instead.")
# build a "node", that should be equivalent to the one given by
# self.make_node, but using conv3D instead of self.
shuffled_inputs = inputs.dimshuffle(0, 2, 3, 'x', 1)
if inputs.name is not None:
shuffled_inputs.name = 'shuffle_for_conv3D(%s)' % inputs.name
flipped_kerns = kerns[:, :, ::-1, ::-1]
if kerns.name is not None:
flipped_kerns.name = 'flipped(%s)' % kerns.name
shuffled_kerns = flipped_kerns.dimshuffle(0, 2, 3, 'x', 1)
if flipped_kerns.name is not None:
shuffled_kerns.name = 'shuffled_for_conv3D(%s)' % flipped_kerns.name
tmp_node = theano.tensor.nnet.conv3D(
V=shuffled_inputs,
W=shuffled_kerns,
b=theano.tensor.alloc(np.asarray(0, dtype=kerns.dtype),
kerns.shape[0]),
d=(self.dx, self.dy, 1))
node = theano.tensor.addbroadcast(
tmp_node, 3).dimshuffle(0, 4, 1, 2)
# mimic what happens inside theano.grad: get the input gradient
# of the final cost wrt all variables involved.
return theano.gradient.grad(cost=None, known_grads={node: gz},
wrt=[inputs, kerns])
if self.dx not in (1, 2) or self.dy not in (1, 2): if self.dx not in (1, 2) or self.dy not in (1, 2):
raise NotImplementedError( raise NotImplementedError(
......
...@@ -30,9 +30,6 @@ from theano.tensor import opt ...@@ -30,9 +30,6 @@ from theano.tensor import opt
# Cpu implementation # Cpu implementation
from theano.tensor.nnet.conv import conv2d, ConvOp from theano.tensor.nnet.conv import conv2d, ConvOp
from theano.tensor.nnet.Conv3D import conv3D
from theano.tensor.nnet.ConvGrad3D import convGrad3D
from theano.tensor.nnet.ConvTransp3D import convTransp3D
@gof.local_optimizer([SparseBlockGemv], inplace=True) @gof.local_optimizer([SparseBlockGemv], inplace=True)
...@@ -257,39 +254,6 @@ def local_conv2d_cpu(node): ...@@ -257,39 +254,6 @@ def local_conv2d_cpu(node):
return [rval] return [rval]
@local_optimizer([AbstractConv3d])
def local_conv3d_cpu(node):
if not isinstance(node.op, AbstractConv3d):
return None
img, kern = node.inputs
if ((not isinstance(img.type, TensorType) or
not isinstance(kern.type, TensorType))):
return None
if node.op.border_mode not in ['valid', (0, 0, 0)]:
return None
if node.op.filter_dilation != (1, 1, 1):
return None
if node.op.num_groups > 1:
return None
bias = theano.tensor.zeros_like(kern[:, 0, 0, 0, 0])
# need to flip the kernel if necessary (conv3D does not flip)
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1, ::-1]
# conv3D expects shape (batch, row, column, time, channel)
img = img.dimshuffle(0, 2, 3, 4, 1)
kern = kern.dimshuffle(0, 2, 3, 4, 1)
rval = conv3D(img, kern, bias, node.op.subsample)
copy_stack_trace(node.outputs[0], rval)
rval = rval.dimshuffle(0, 4, 1, 2, 3)
return [rval]
@local_optimizer([AbstractConv2d_gradWeights]) @local_optimizer([AbstractConv2d_gradWeights])
def local_conv2d_gradweight_cpu(node): def local_conv2d_gradweight_cpu(node):
if (not isinstance(node.op, AbstractConv2d_gradWeights) or if (not isinstance(node.op, AbstractConv2d_gradWeights) or
...@@ -311,28 +275,7 @@ def local_conv2d_gradweight_cpu(node): ...@@ -311,28 +275,7 @@ def local_conv2d_gradweight_cpu(node):
if node.op.border_mode == 'valid' and \ if node.op.border_mode == 'valid' and \
(node.op.subsample != (1, 1)): (node.op.subsample != (1, 1)):
# Use the gradient as defined in conv3D, because the implementation return None
# by Conv is slow (about 3x slower than conv3D, and probably 10x
# slower than it could be), and incorrect when subsample > 2.
# build a "node", that should be equivalent to the one given by
# self.make_node, but using convGrad3D instead.
shuffled_img = img.dimshuffle(0, 2, 3, 'x', 1)
shuffled_topgrad = topgrad.dimshuffle(0, 2, 3, 'x', 1)
rval = convGrad3D(V=shuffled_img,
d=(node.op.subsample[0], node.op.subsample[1], 1),
WShape=(shuffled_topgrad.shape[4],
shape[0], shape[1], 1,
shuffled_img.shape[4]),
dCdH=shuffled_topgrad)
copy_stack_trace(node.outputs[0], rval)
rval = theano.tensor.addbroadcast(rval, 3)
rval = rval.dimshuffle(0, 4, 1, 2)
rval = rval[:, :, ::-1, ::-1]
rval = theano.tensor.patternbroadcast(rval,
node.outputs[0].broadcastable)
copy_stack_trace(node.outputs[0], rval)
return [rval]
dx, dy = node.op.subsample dx, dy = node.op.subsample
if dx not in (1, 2) or dy not in (1, 2): if dx not in (1, 2) or dy not in (1, 2):
...@@ -411,41 +354,6 @@ def local_conv2d_gradweight_cpu(node): ...@@ -411,41 +354,6 @@ def local_conv2d_gradweight_cpu(node):
return [res] return [res]
@local_optimizer([AbstractConv3d_gradWeights])
def local_conv3d_gradweight_cpu(node):
if not isinstance(node.op, AbstractConv3d_gradWeights):
return None
img, topgrad, shape = node.inputs
if ((not isinstance(img.type, TensorType) or
not isinstance(topgrad.type, TensorType))):
return None
if node.op.border_mode not in ['valid', (0, 0, 0)]:
return None
if node.op.filter_dilation != (1, 1, 1):
return None
if node.op.num_groups > 1:
return None
# conv3D expects shape (batch, row, column, time, channel)
img = img.dimshuffle(0, 2, 3, 4, 1)
topgrad = topgrad.dimshuffle(0, 2, 3, 4, 1)
W_shape = (topgrad.shape[4], shape[0], shape[1], shape[2], img.shape[4])
rval = convGrad3D(img, node.op.subsample, W_shape, topgrad)
copy_stack_trace(node.outputs[0], rval)
rval = rval.dimshuffle(0, 4, 1, 2, 3)
# need to flip the kernel if necessary (conv3D does not flip)
if node.op.filter_flip:
rval = rval[:, :, ::-1, ::-1, ::-1]
rval = theano.tensor.patternbroadcast(rval,
node.outputs[0].broadcastable)
return [rval]
@local_optimizer([AbstractConv2d_gradInputs]) @local_optimizer([AbstractConv2d_gradInputs])
def local_conv2d_gradinputs_cpu(node): def local_conv2d_gradinputs_cpu(node):
if (not isinstance(node.op, AbstractConv2d_gradInputs) or if (not isinstance(node.op, AbstractConv2d_gradInputs) or
...@@ -467,22 +375,8 @@ def local_conv2d_gradinputs_cpu(node): ...@@ -467,22 +375,8 @@ def local_conv2d_gradinputs_cpu(node):
# Conv 3d implementation, needed when subsample > 2 # Conv 3d implementation, needed when subsample > 2
if node.op.border_mode == 'valid' and node.op.subsample != (1, 1): if node.op.border_mode == 'valid' and node.op.subsample != (1, 1):
kern = kern[:, :, ::-1, ::-1] # The op don't support that anymore.
shuffled_kern = kern.dimshuffle(0, 2, 3, 'x', 1) return False
shuffled_topgrad = topgrad.dimshuffle(0, 2, 3, 'x', 1)
b = theano.tensor.zeros_like(shuffled_kern[0, 0, 0, 0, :])
rval = convTransp3D(W=shuffled_kern, b=b,
d=(node.op.subsample[0], node.op.subsample[1], 1),
H=shuffled_topgrad,
RShape=(shape[0], shape[1], 1))
copy_stack_trace(node.outputs[0], rval)
rval = theano.tensor.addbroadcast(rval, 3)
rval = rval.dimshuffle(0, 4, 1, 2)
rval = theano.tensor.patternbroadcast(rval,
node.outputs[0].broadcastable)
copy_stack_trace(node.outputs[0], rval)
return [rval]
# Conv2d Implementation # Conv2d Implementation
dx, dy = node.op.subsample dx, dy = node.op.subsample
...@@ -538,40 +432,6 @@ def local_conv2d_gradinputs_cpu(node): ...@@ -538,40 +432,6 @@ def local_conv2d_gradinputs_cpu(node):
return [din] return [din]
@local_optimizer([AbstractConv3d_gradInputs])
def local_conv3d_gradinputs_cpu(node):
if not isinstance(node.op, AbstractConv3d_gradInputs):
return None
kern, topgrad, shape = node.inputs
if ((not isinstance(kern.type, TensorType) or
not isinstance(topgrad.type, TensorType))):
return None
if node.op.border_mode not in ['valid', (0, 0, 0)]:
return None
if node.op.filter_dilation != (1, 1, 1):
return None
if node.op.num_groups > 1:
return None
# need to flip the kernel if necessary (conv3D does not flip)
if node.op.filter_flip:
kern = kern[:, :, ::-1, ::-1, ::-1]
# conv3D expects shape (batch, row, column, time, channel)
kern = kern.dimshuffle(0, 2, 3, 4, 1)
topgrad = topgrad.dimshuffle(0, 2, 3, 4, 1)
bias = theano.tensor.zeros_like(kern[0, 0, 0, 0, :])
rval = convTransp3D(kern, bias, node.op.subsample, topgrad, shape)
copy_stack_trace(node.outputs[0], rval)
rval = rval.dimshuffle(0, 4, 1, 2, 3)
rval = theano.tensor.patternbroadcast(rval,
node.outputs[0].broadcastable)
return [rval]
# Register Cpu Optmization # Register Cpu Optmization
conv_groupopt = theano.gof.optdb.LocalGroupDB() conv_groupopt = theano.gof.optdb.LocalGroupDB()
conv_groupopt.__name__ = "conv_opts" conv_groupopt.__name__ = "conv_opts"
...@@ -595,6 +455,7 @@ conv_groupopt.register('local_abstractconv3d_gradweight_gemm', ...@@ -595,6 +455,7 @@ conv_groupopt.register('local_abstractconv3d_gradweight_gemm',
conv_groupopt.register('local_abstractconv3d_gradinputs_gemm', conv_groupopt.register('local_abstractconv3d_gradinputs_gemm',
local_abstractconv3d_gradinputs_gemm, 30, local_abstractconv3d_gradinputs_gemm, 30,
'conv_gemm', 'fast_compile', 'fast_run') 'conv_gemm', 'fast_compile', 'fast_run')
# Legacy convolution # Legacy convolution
conv_groupopt.register('local_conv2d_cpu', local_conv2d_cpu, 40, conv_groupopt.register('local_conv2d_cpu', local_conv2d_cpu, 40,
'fast_compile', 'fast_run') 'fast_compile', 'fast_run')
...@@ -604,14 +465,6 @@ conv_groupopt.register('local_conv2d_gradweight_cpu', ...@@ -604,14 +465,6 @@ conv_groupopt.register('local_conv2d_gradweight_cpu',
conv_groupopt.register('local_conv2d_gradinputs_cpu', conv_groupopt.register('local_conv2d_gradinputs_cpu',
local_conv2d_gradinputs_cpu, 40, local_conv2d_gradinputs_cpu, 40,
'fast_compile', 'fast_run') 'fast_compile', 'fast_run')
conv_groupopt.register('local_conv3d_cpu', local_conv3d_cpu, 40,
'fast_compile', 'fast_run')
conv_groupopt.register('local_conv3d_gradweight_cpu',
local_conv3d_gradweight_cpu, 40,
'fast_compile', 'fast_run')
conv_groupopt.register('local_conv3d_gradinputs_cpu',
local_conv3d_gradinputs_cpu, 40,
'fast_compile', 'fast_run')
# Verify that no AbstractConv are present in the graph # Verify that no AbstractConv are present in the graph
......
...@@ -24,14 +24,10 @@ from theano.tensor.nnet.abstract_conv import bilinear_kernel_1D ...@@ -24,14 +24,10 @@ from theano.tensor.nnet.abstract_conv import bilinear_kernel_1D
from theano.tensor.nnet.abstract_conv import bilinear_kernel_2D from theano.tensor.nnet.abstract_conv import bilinear_kernel_2D
from theano.tensor.nnet.abstract_conv import bilinear_upsampling from theano.tensor.nnet.abstract_conv import bilinear_upsampling
from theano.tensor.nnet.abstract_conv import separable_conv2d from theano.tensor.nnet.abstract_conv import separable_conv2d
from theano.tensor.nnet.conv import ConvOp
from theano.tensor.nnet.corr import (CorrMM, CorrMM_gradWeights, from theano.tensor.nnet.corr import (CorrMM, CorrMM_gradWeights,
CorrMM_gradInputs) CorrMM_gradInputs)
from theano.tensor.nnet.corr3d import (Corr3dMM, Corr3dMM_gradWeights, from theano.tensor.nnet.corr3d import (Corr3dMM, Corr3dMM_gradWeights,
Corr3dMM_gradInputs) Corr3dMM_gradInputs)
from theano.tensor.nnet.Conv3D import Conv3D
from theano.tensor.nnet.ConvGrad3D import ConvGrad3D
from theano.tensor.nnet.ConvTransp3D import ConvTransp3D
def conv2d_corr(inputs, filters, border_mode="valid", def conv2d_corr(inputs, filters, border_mode="valid",
...@@ -794,152 +790,6 @@ class TestAbstractConvNoOptim(BaseTestConv2d): ...@@ -794,152 +790,6 @@ class TestAbstractConvNoOptim(BaseTestConv2d):
ref=None, mode=mode) ref=None, mode=mode)
class TestCpuConv2d(BaseTestConv2d):
@classmethod
def setup(cls):
BaseTestConv2d.setup_class()
cls.mode = theano.compile.mode.get_default_mode().excluding('conv_gemm')
cls.opt_err = theano.config.on_opt_error
theano.config.on_opt_error = 'ignore'
@classmethod
def tearDown(cls):
theano.config.on_opt_error = cls.opt_err
def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1)):
if fd != (1, 1):
raise SkipTest("No dilation implementation for basic cpu ConvOp.")
if not theano.config.cxx:
raise SkipTest("Need cxx to test conv2d")
mode = self.mode
o = self.get_output_shape(i, f, s, b, fd)
fwd_OK = True
gradweight_OK = True
gradinput_OK = True
if not flip:
fwd_OK = False
gradweight_OK = False
gradinput_OK = False
if b not in ((0, 0), 'valid', 'full'):
fwd_OK = False
gradweight_OK = False
gradinput_OK = False
if (not provide_shape) and (s != (1, 1)) and (b == 'full'):
gradweight_OK = False
gradinput_OK = False
if ((s[0] not in (1, 2)) or (s[1] not in (1, 2))) and (b == 'full'):
gradweight_OK = False
gradinput_OK = False
if fwd_OK:
# This test can run even when theano.config.blas.ldflags is empty.
self.run_fwd(inputs_shape=i, filters_shape=f,
subsample=s, verify_grad=(gradweight_OK and gradinput_OK),
mode=mode, provide_shape=provide_shape,
border_mode=b, filter_flip=flip, target_op=ConvOp,
check_trace=True, filter_dilation=fd)
else:
assert_raises(AssertionError,
self.run_fwd,
inputs_shape=i,
filters_shape=f,
subsample=s,
verify_grad=False,
mode=mode,
provide_shape=provide_shape,
border_mode=b,
filter_flip=flip,
check_trace=True,
filter_dilation=fd)
if gradweight_OK:
# This test can run even when theano.config.blas.ldflags is empty.
self.run_gradweight(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=False, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=(ConvOp, ConvGrad3D),
check_trace=True,
filter_dilation=fd)
else:
assert_raises(AssertionError,
self.run_gradweight,
inputs_shape=i,
filters_shape=f,
output_shape=o,
subsample=s,
verify_grad=False,
mode=mode,
provide_shape=provide_shape,
border_mode=b,
filter_flip=flip,
check_trace=True,
filter_dilation=fd)
if gradinput_OK:
# This test can run even when theano.config.blas.ldflags is empty.
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=False, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=(ConvOp, ConvTransp3D),
check_trace=True,
filter_dilation=fd)
else:
assert_raises(AssertionError,
self.run_gradinput,
inputs_shape=i,
filters_shape=f,
output_shape=o,
subsample=s,
verify_grad=False,
mode=mode,
provide_shape=provide_shape,
border_mode=b,
filter_flip=flip,
check_trace=True,
filter_dilation=fd)
def tcase_gi(self, i, f, o, s, b, flip, provide_shape, fd=(1, 1), expect_error=False):
if not theano.config.cxx:
raise SkipTest("Need cxx to test conv2d")
if fd != (1, 1):
raise SkipTest("No dilation implementation for basic cpu ConvOp.")
mode = self.mode
if not flip:
return
if b not in ((0, 0), 'valid', 'full'):
return
if (not provide_shape) and (s != (1, 1)) and (b == 'full'):
return
if ((s[0] not in (1, 2)) or (s[1] not in (1, 2))) and (b == 'full'):
return
if not expect_error:
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=False, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=(ConvOp, ConvTransp3D),
check_trace=True,
filter_dilation=fd)
else:
# we do not check for inconsistent shapes,
# because this older implementation does not check that
raise SkipTest('Inconsistent shapes are not tested for old cpu ConvOp.')
class BaseTestConv3d(BaseTestConv): class BaseTestConv3d(BaseTestConv):
@classmethod @classmethod
def setup_class(cls): def setup_class(cls):
...@@ -1098,134 +948,6 @@ class TestCorrConv3d(BaseTestConv3d): ...@@ -1098,134 +948,6 @@ class TestCorrConv3d(BaseTestConv3d):
ref=None, check_trace=True, filter_dilation=fd) ref=None, check_trace=True, filter_dilation=fd)
class TestCpuConv3d(BaseTestConv3d):
@classmethod
def setup(cls):
BaseTestConv3d.setup_class()
cls.mode = theano.compile.mode.get_default_mode().excluding('conv_gemm')
cls.opt_err = theano.config.on_opt_error
theano.config.on_opt_error = 'ignore'
@classmethod
def tearDown(cls):
theano.config.on_opt_error = cls.opt_err
def tcase(self, i, f, s, b, flip, provide_shape, fd=(1, 1, 1)):
if fd != (1, 1, 1):
raise SkipTest("No dilation implementation for basic cpu Conv3D.")
if not theano.config.cxx:
raise SkipTest("Need cxx to test conv2d")
if min(i) == 0 or min(f) == 0:
raise SkipTest('Not tested for old cpu Conv3D.')
mode = self.mode
o = self.get_output_shape(i, f, s, b, fd)
fwd_OK = True
gradweight_OK = True
gradinput_OK = True
if b not in ((0, 0, 0), 'valid'):
fwd_OK = False
gradweight_OK = False
gradinput_OK = False
if fwd_OK:
# This test can run even when theano.config.blas.ldflags is empty.
self.run_fwd(inputs_shape=i, filters_shape=f,
subsample=s, verify_grad=(gradweight_OK and gradinput_OK),
mode=mode, provide_shape=provide_shape,
border_mode=b, filter_flip=flip, target_op=Conv3D,
check_trace=True, filter_dilation=fd)
else:
assert_raises(AssertionError,
self.run_fwd,
inputs_shape=i,
filters_shape=f,
subsample=s,
verify_grad=False,
mode=mode,
provide_shape=provide_shape,
border_mode=b,
filter_flip=flip,
check_trace=True,
filter_dilation=fd)
if gradweight_OK:
# This test can run even when theano.config.blas.ldflags is empty.
self.run_gradweight(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=False, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=ConvGrad3D,
check_trace=True,
filter_dilation=fd)
else:
assert_raises(AssertionError,
self.run_gradweight,
inputs_shape=i,
filters_shape=f,
output_shape=o,
subsample=s,
verify_grad=False,
mode=mode,
provide_shape=provide_shape,
border_mode=b,
filter_flip=flip,
check_trace=True,
filter_dilation=fd)
if gradinput_OK:
# This test can run even when theano.config.blas.ldflags is empty.
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=False, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=ConvTransp3D,
check_trace=True,
filter_dilation=fd)
else:
assert_raises(AssertionError,
self.run_gradinput,
inputs_shape=i,
filters_shape=f,
output_shape=o,
subsample=s,
verify_grad=False,
mode=mode,
provide_shape=provide_shape,
border_mode=b,
filter_flip=flip,
check_trace=True,
filter_dilation=fd)
def tcase_gi(self, i, f, o, s, b, flip, provide_shape, fd=(1, 1, 1), expect_error=False):
if fd != (1, 1, 1):
raise SkipTest("No dilation implementation for basic cpu Conv3D.")
mode = self.mode
if min(i) == 0 or min(f) == 0 or min(o) == 0:
raise SkipTest('Not tested for old cpu Conv3D.')
if b not in ((0, 0, 0), 'valid'):
return
if not expect_error:
self.run_gradinput(inputs_shape=i, filters_shape=f,
output_shape=o, subsample=s,
verify_grad=False, mode=mode,
provide_shape=provide_shape, border_mode=b,
filter_flip=flip,
target_op=ConvTransp3D,
check_trace=True,
filter_dilation=fd)
else:
# we do not check for inconsistent shapes,
# because this older implementation does not check that
raise SkipTest('Inconsistent shapes are not tested for old cpu Conv3D.')
def test_constant_shapes(): def test_constant_shapes():
# Check that the `imshp` and `kshp` parameters of the AbstractConv Ops # Check that the `imshp` and `kshp` parameters of the AbstractConv Ops
# are rejected if not constant or None # are rejected if not constant or None
......
...@@ -294,15 +294,17 @@ class TestConv2D(utt.InferShapeTester): ...@@ -294,15 +294,17 @@ class TestConv2D(utt.InferShapeTester):
""" """
Tests convolution where subsampling != (1,1) Tests convolution where subsampling != (1,1)
""" """
self.validate((3, 2, 7, 5), (5, 2, 2, 3), 'valid', subsample=(2, 2))
self.validate((3, 2, 7, 5), (5, 2, 2, 3), 'full', subsample=(2, 2)) self.validate((3, 2, 7, 5), (5, 2, 2, 3), 'full', subsample=(2, 2))
self.validate((3, 2, 7, 5), (5, 2, 2, 3), 'valid', subsample=(2, 1))
self.validate((1, 1, 6, 6), (1, 1, 3, 3), 'valid', subsample=(3, 3))
# Fails as of 2012-07-11 # Fails as of 2012-07-11
self.assertRaises(NotImplementedError, self.validate, (1, 1, 6, 6), self.assertRaises(NotImplementedError, self.validate, (1, 1, 6, 6),
(1, 1, 3, 3), 'full', subsample=(3, 3)) (1, 1, 3, 3), 'full', subsample=(3, 3))
# Fails as of 2017-08-10
self.assertRaises(NotImplementedError, self.validate, (3, 2, 7, 5), (5, 2, 2, 3), 'valid', subsample=(2, 2))
self.assertRaises(NotImplementedError, self.validate, (3, 2, 7, 5), (5, 2, 2, 3), 'valid', subsample=(2, 1))
self.assertRaises(NotImplementedError, self.validate, (1, 1, 6, 6), (1, 1, 3, 3), 'valid', subsample=(3, 3))
def test_shape_Constant_tensor(self): def test_shape_Constant_tensor(self):
""" """
Tests convolution where the {image,filter}_shape is a Constant tensor. Tests convolution where the {image,filter}_shape is a Constant tensor.
...@@ -604,9 +606,6 @@ class TestConv2D(utt.InferShapeTester): ...@@ -604,9 +606,6 @@ class TestConv2D(utt.InferShapeTester):
excluding=['conv_gemm']) excluding=['conv_gemm'])
class TestDefaultConv2D(TestConv2D):
conv2d = staticmethod(theano.tensor.nnet.conv2d)
# Test that broadcasting of gradients works correctly when using the # Test that broadcasting of gradients works correctly when using the
# nnet.conv2d() interface. This was reported in #3763, and uses the example # nnet.conv2d() interface. This was reported in #3763, and uses the example
# code from that ticket. # code from that ticket.
......
from __future__ import absolute_import, print_function, division
import theano
import theano.tensor as T
from theano import function, shared
from theano.tests import unittest_tools as utt
from theano.tensor.nnet.ConvTransp3D import convTransp3D, ConvTransp3D
from theano.tensor.nnet.ConvGrad3D import convGrad3D, ConvGrad3D
from theano.tensor.nnet.Conv3D import conv3D, Conv3D
from theano.tests.unittest_tools import attr
from nose.plugins.skip import SkipTest
import numpy as N
from six.moves import xrange
import copy
import theano.sparse
if theano.sparse.enable_sparse:
from scipy import sparse
floatX = theano.config.floatX
# TODO: each individual test method should seed rng with utt.fetch_seed()
# as it is right now, setUp does the seeding, so if you run just
# a subset of the tests they will do different things than if you
# run all of them
class DummyConv3D:
    """Expose conv3D to verify_grad as a function of a single scalar.

    The stride is held fixed because it is not differentiable.  At
    construction a random perturbation direction through (V, W, b) space
    is drawn once; the scalar argument of __call__ selects a point on the
    line through the supplied values along that direction (t == 0 yields
    the values themselves).  Restricting the inputs to this line means
    verify_grad probes one variable instead of hundreds: we lose certainty
    that every individual variable is correct, but the random projection
    lets us test many variables very quickly.
    """

    def __init__(self, rng, VWbVals, d):
        """
        param: rng Random number generator used to pick direction of the
               line
        param: VWbVals tuple containing values to test V,W,b around
        param: d shared variable for d, the stride
        """
        self.V, self.W, self.b = VWbVals
        # One random direction per input, matching its shape.  Draw order
        # (V, W, b) matters for rng reproducibility.
        for attr, var in (('dV', self.V), ('dW', self.W), ('db', self.b)):
            setattr(self, attr, shared(
                rng.uniform(-1, 1, var.get_value(borrow=True).shape)))
        self.d = d

    def __call__(self, t):
        # Evaluate the convolution at position t along the stored line.
        return conv3D(self.V + t * self.dV,
                      self.W + t * self.dW,
                      self.b + t * self.db,
                      self.d)
class DummyConvGrad3D:
    """Expose convGrad3D to verify_grad as a function of a single scalar.

    Analogous to DummyConv3D: a random direction through (V, dCdH) space
    is fixed at construction and the scalar argument walks along the line
    through the supplied values in that direction.
    """

    def __init__(self, rng, VdHvals, d, WShape):
        """
        param: rng Random number generator used to pick direction of the
               line
        param: VdHvals tuple containing values to test V and dCdH around
        param: d shared variable for d, the stride
        param: WShape shape of the filter tensor W
        """
        self.V, self.dCdH = VdHvals

        def direction(var):
            # Random perturbation with the same shape as ``var``.
            return shared(rng.uniform(
                -1, 1, var.get_value(borrow=True).shape))

        # Draw order (V first, then dCdH) preserved for rng reproducibility.
        self.dV = direction(self.V)
        self.ddCdH = direction(self.dCdH)
        self.d = d
        self.WShape = WShape

    def __call__(self, t):
        # Evaluate the gradient op at position t along the stored line.
        return convGrad3D(self.V + t * self.dV, self.d, self.WShape,
                          self.dCdH + t * self.ddCdH)
class DummyConvTransp3D:
    """Expose convTransp3D to verify_grad as a function of a single scalar.

    Analogous to DummyConv3D: a random direction through (W, b, H) space
    is fixed at construction and the scalar argument walks along the line
    through the supplied values in that direction.
    """

    def __init__(self, rng, WbHvals, d, RShape):
        """
        param: rng Random number generator used to pick direction of the
               line
        param: WbHvals tuple containing values to test W,b,H around
        param: d shared variable for d, the stride
        param: RShape requested output shape for the transposed convolution
        """
        self.W, self.b, self.H = WbHvals
        # Draw one direction per input (order W, b, H preserved for rng
        # reproducibility) and store each as a shared variable so that
        # __call__ builds a purely symbolic expression.
        self.dW = shared(
            rng.uniform(-1, 1, self.W.get_value(borrow=True).shape))
        self.db = shared(
            rng.uniform(-1, 1, self.b.get_value(borrow=True).shape))
        self.dH = shared(
            rng.uniform(-1, 1, self.H.get_value(borrow=True).shape))
        self.d = d
        self.RShape = RShape

    def __call__(self, t):
        # Evaluate the transposed convolution at position t along the line.
        return convTransp3D(self.W + t * self.dW, self.b + t * self.db,
                            self.d, self.H + t * self.dH, self.RShape)
class TestConv3D(utt.InferShapeTester):
    """Tests for the Conv3D op and its gradient/transpose companion ops.

    setUp builds a symbolic reconstruction problem around
    ``H = conv3D(V, W, b, d)`` and
    ``R = convTransp3D(W, rb, d, H, RShape)`` with objective
    ``reconsObj = sum((V - R) ** 2)``, then compiles the functions the
    individual test methods exercise.
    """

    def setUp(self):
        super(TestConv3D, self).setUp()
        utt.seed_rng()
        self.rng = N.random.RandomState(utt.fetch_seed())
        # Copy the default mode and disable python-code checking; the
        # explicit C-vs-python comparison happens in DEBUG_MODE below.
        mode = copy.copy(theano.compile.mode.get_default_mode())
        mode.check_py_code = False
        # Shared inputs.  Shapes here are placeholders; each test calls
        # randomize()/set_value before evaluating.  Per randomize(), the
        # layout is W: (filters, row, col, time, channel) and
        # V: (batch, row, col, time, channel).
        self.W = shared(N.ndarray(shape=(1, 1, 1, 1, 1), dtype=floatX))
        self.W.name = 'W'
        self.b = shared(N.zeros(1, dtype=floatX))
        self.b.name = 'b'
        # rb: bias used by the transposed (reconstruction) direction.
        self.rb = shared(N.zeros(1, dtype=floatX))
        self.rb.name = 'rb'
        self.V = shared(N.ndarray(shape=(1, 1, 1, 1, 1), dtype=floatX))
        self.V.name = 'V'
        # d holds the three strides.
        self.d = shared(N.ones(shape=(3, ), dtype=int))
        self.d.name = 'd'
        # Forward convolution and compiled evaluators for it.
        self.H = conv3D(self.V, self.W, self.b, self.d)
        self.H.name = 'H'
        self.H_func = function([], self.H, mode=mode)
        self.H_shape_func = function([], self.H.shape, mode=mode)
        # RShape: requested output shape for the transposed convolution.
        self.RShape = T.vector(dtype='int64')
        self.RShape.name = 'RShape'
        # Transposed convolution applied to an arbitrary 5-tensor input.
        self.otherH = T.TensorType(
            floatX, (False, False, False, False, False))(name='otherH')
        self.transp = convTransp3D(self.W, self.rb, self.d,
                                   self.otherH, self.RShape)
        self.transp.name = 'transp'
        self.transp_func = function([self.otherH, self.RShape],
                                    self.transp, mode=mode)
        # Reconstruction R of V from H, and the squared-error objective.
        self.R = convTransp3D(self.W, self.rb, self.d, self.H, self.RShape)
        self.R.name = 'R'
        self.R_func = function([self.RShape], self.R, mode=mode)
        self.R_shape_func = function([self.RShape], self.R.shape)
        diff = self.V - self.R
        diff.name = 'diff'
        sqr = T.sqr(diff)
        sqr.name = 'sqr'
        self.reconsObj = T.sum(sqr)
        self.reconsObj.name = 'reconsObj'
        self.reconsObjFunc = function([self.RShape], self.reconsObj, mode=mode)
        W_grad = T.grad(self.reconsObj, self.W)
        # Gradients of the objective w.r.t. every input, compiled twice:
        # once under the normal mode, and once under DEBUG_MODE, which
        # cross-checks the C implementation against the python one.
        self.gradientsFunc = function(
            [self.RShape],
            [W_grad, T.grad(self.reconsObj, self.H), T.grad(self.reconsObj, self.V),
             T.grad(self.reconsObj, self.b)], mode=mode)
        self.check_c_against_python = function(
            [self.RShape],
            [T.grad(self.reconsObj, self.W), T.grad(self.reconsObj, self.H), T.grad(self.reconsObj, self.V),
             T.grad(self.reconsObj, self.b)], mode='DEBUG_MODE')
        self.dCdW_shape_func = function([self.RShape], T.grad(self.reconsObj, self.W).shape, mode=mode)

    def random_tensor(self, *dims):
        """Return a floatX array of shape ``dims``, uniform in [-.05, .05)."""
        return N.asarray(self.rng.uniform(-.05, .05, dims), dtype=floatX)

    def randomize(self):
        """Assign random but mutually consistent shapes and values to
        V, W, b, rb and the strides d."""
        batchSize = self.rng.randint(1, 4)
        videoDur = self.rng.randint(8, 30)
        filterWidth = self.rng.randint(1, 8)
        filterHeight = self.rng.randint(1, 8)
        filterDur = self.rng.randint(1, 8)
        tsteps = self.rng.randint(1, 4)
        rsteps = self.rng.randint(1, 4)
        csteps = self.rng.randint(1, 4)
        # Video dimensions are derived from the filter size so the filter
        # fits a whole number of steps plus a little slack; note this
        # overwrites the videoDur drawn above.
        videoDur = tsteps * filterDur + self.rng.randint(0, 3)
        videoWidth = csteps * filterWidth + self.rng.randint(0, 3)
        videoHeight = rsteps * filterHeight + self.rng.randint(0, 3)
        numFilters = self.rng.randint(1, 3)
        inputChannels = self.rng.randint(1, 3)
        # Write the three strides in place into the shared variable d.
        self.d.get_value(borrow=True, return_internal_type=True)[0] = \
            self.rng.randint(1, 15)
        self.d.get_value(borrow=True, return_internal_type=True)[1] = \
            self.rng.randint(1, 15)
        self.d.get_value(borrow=True, return_internal_type=True)[2] = \
            self.rng.randint(1, 15)
        # NOTE(review): the next three expressions compute output sizes but
        # discard the results -- dead code left over from an earlier version.
        int((videoHeight - filterHeight) /
            self.d.get_value(borrow=True)[0]) + 1
        int((videoWidth - filterWidth) /
            self.d.get_value(borrow=True)[1]) + 1
        int((videoDur - filterDur) /
            self.d.get_value(borrow=True)[2]) + 1
        self.W.set_value(self.random_tensor(
            numFilters, filterHeight,
            filterWidth, filterDur, inputChannels), borrow=True)
        self.b.set_value(self.random_tensor(numFilters), borrow=True)
        self.rb.set_value(self.random_tensor(inputChannels), borrow=True)
        self.V.set_value(self.random_tensor(
            batchSize, videoHeight,
            videoWidth, videoDur, inputChannels), borrow=True)
        # NOTE(review): rb is assigned a second time here, redundant with
        # the assignment above; harmless but consumes extra rng draws.
        self.rb.set_value(self.random_tensor(inputChannels), borrow=True)

    def test_c_against_python(self):
        """Run the gradients under DEBUG_MODE, which compares the C
        implementation against the python one."""
        self.randomize()
        self.check_c_against_python(self.V.get_value(borrow=True).shape[1:4])

    @attr('slow')
    def test_c_against_mat_mul(self):
        # Use a filter of the same size as the image, so the convolution is
        # just a dense matrix multiply.
        # Check that dense matrix multiplication gives the same result as
        # convolution.
        batchSize = self.rng.randint(1, 10)
        videoDur = self.rng.randint(3, 10)
        videoWidth = self.rng.randint(1, 5)
        videoHeight = self.rng.randint(1, 5)
        filterWidth = videoWidth
        filterHeight = videoHeight
        filterDur = videoDur
        numFilters = self.rng.randint(1, 3)
        inputChannels = self.rng.randint(1, 4)
        # Strides are still randomized even though the filter covers the
        # whole video, so only a single output position exists.
        self.d.get_value(borrow=True, return_internal_type=True)[0] = \
            self.rng.randint(1, 15)
        self.d.get_value(borrow=True, return_internal_type=True)[1] = \
            self.rng.randint(1, 15)
        self.d.get_value(borrow=True, return_internal_type=True)[2] = \
            self.rng.randint(1, 15)
        self.W.set_value(self.random_tensor(
            numFilters, filterHeight,
            filterWidth, filterDur, inputChannels), borrow=True)
        # Sparsify the kernel: keep only entries below 1e-5 (including all
        # negative draws), zero out the rest.
        self.W.set_value(
            self.W.get_value(borrow=True) *
            (self.W.get_value(borrow=True) < 1e-5), borrow=True)
        self.b.set_value(self.random_tensor(numFilters), borrow=True)
        self.V.set_value(self.random_tensor(
            batchSize, videoHeight, videoWidth, videoDur, inputChannels), borrow=True)
        Hv = self.H_func()
        # Filter covers the whole video: one output position per batch item.
        assert Hv.shape[1] == 1
        assert Hv.shape[2] == 1
        assert Hv.shape[3] == 1
        # Flatten kernel and video into matrices and compare against dot().
        n = inputChannels * videoHeight * videoWidth * videoDur
        W_mat = N.zeros((n, numFilters))
        V_mat = N.zeros((batchSize, n))
        Hv_mat = N.zeros((batchSize, numFilters))
        for qi in xrange(0, numFilters):
            W_mat[:, qi] = \
                self.W.get_value(borrow=True)[qi, :, :, :, :].reshape((n))
            Hv_mat[:, qi] = Hv[:, 0, 0, 0, qi]
        for qi in xrange(0, batchSize):
            V_mat[qi, :] = \
                self.V.get_value(borrow=True)[qi, :, :, :, :].reshape((n))
        H_mat = N.dot(V_mat, W_mat) + self.b.get_value(borrow=True)
        # Looser tolerance for single precision.
        tol = 1e-5
        if floatX == 'float32':
            tol = 1e-4
        if N.abs(H_mat - Hv_mat).max() > tol and not N.allclose(H_mat, Hv_mat):
            # Diagnostic dump before failing.
            print(H_mat)
            print(Hv_mat)
            print('max error: ' + str(N.abs(H_mat - Hv_mat).max()))
            self.W.get_value(borrow=True)[self.W.get_value(borrow=True) != 0] += 1.0
            print('min non-zero kernel mag: ' + str(
                N.abs(self.W.get_value(borrow=True)).min()))
            assert False

    def test_c_against_mat_transp_mul(self):
        # Use a filter of the same size as the image, so the convolution is just a
        # dense matrix multiply.
        # Check that dense matrix multiplication by the transpose of the matrix
        # gives the same result as ConvTransp.
        batchSize = self.rng.randint(1, 10)
        videoDur = self.rng.randint(3, 15)
        videoWidth = self.rng.randint(3, 15)
        videoHeight = self.rng.randint(3, 15)
        filterWidth = videoWidth
        filterHeight = videoHeight
        filterDur = videoDur
        numFilters = self.rng.randint(1, 15)
        inputChannels = self.rng.randint(1, 15)
        self.d.get_value(borrow=True, return_internal_type=True)[0] = \
            self.rng.randint(1, 15)
        self.d.get_value(borrow=True, return_internal_type=True)[1] = \
            self.rng.randint(1, 15)
        self.d.get_value(borrow=True, return_internal_type=True)[2] = \
            self.rng.randint(1, 15)
        self.W.set_value(self.random_tensor(
            numFilters, filterHeight,
            filterWidth, filterDur, inputChannels), borrow=True)
        self.b.set_value(self.random_tensor(numFilters), borrow=True)
        self.V.set_value(self.random_tensor(
            batchSize, videoHeight,
            videoWidth, videoDur, inputChannels), borrow=True)
        self.rb.set_value(self.random_tensor(inputChannels), borrow=True)
        H_shape = self.H_shape_func()
        # Filter covers the whole video: a single output position.
        assert H_shape[1] == 1
        assert H_shape[2] == 1
        assert H_shape[3] == 1
        # Random H, reconstructed V via the transposed convolution.
        Hv = self.random_tensor(* H_shape)
        Vv = self.transp_func(Hv, [videoHeight, videoWidth, videoDur])
        n = inputChannels * videoHeight * videoWidth * videoDur
        # Broadcast the reconstruction bias rb over the video volume.
        rbim = N.zeros((videoHeight, videoWidth, videoDur, inputChannels))
        for qi in xrange(0, inputChannels):
            rbim[:, :, :, qi] = self.rb.get_value(borrow=True)[qi]
        rbv = rbim.reshape((n))
        # Flatten and compare W^T . H + rb against the op's output.
        W_mat = N.zeros((numFilters, n))
        Vv_mat = N.zeros((n, batchSize))
        Hv_mat = N.zeros((numFilters, batchSize))
        for qi in xrange(0, numFilters):
            W_mat[qi, :] = \
                self.W.get_value(borrow=True)[qi, :, :, :, :].reshape((n))
            Hv_mat[qi, :] = Hv[:, 0, 0, 0, qi]
        for qi in xrange(0, batchSize):
            Vv_mat[:, qi] = Vv[qi, :, :, :, :].reshape((n))
        V_mat = (N.dot(W_mat.transpose(), Hv_mat).transpose() +
                 rbv).transpose()
        if N.abs(V_mat - Vv_mat).max() > 1e-5:
            # Diagnostic dump before failing.
            print(V_mat)
            print(Vv_mat)
            for qq in xrange(V_mat.shape[0]):
                for qqq in xrange(Vv_mat.shape[1]):
                    if abs(V_mat[qq, qqq] - Vv_mat[qq, qqq]) > 1e-5:
                        # NOTE(review): str() is called with two arguments
                        # below, which would itself raise TypeError if this
                        # diagnostic branch ever ran.
                        print(
                            ('wrong at ' + str((qq, qqq)) + ': ' +
                             str(V_mat[qq, qqq], Vv_mat[qq, qqq])))
            assert False

    def test_c_against_sparse_mat_transp_mul(self):
        # like test_c_against_mat_transp_mul but using a sparse matrix and a kernel
        # that is smaller than the image
        if not theano.sparse.enable_sparse:
            raise SkipTest('Optional package sparse disabled')
        batchSize = self.rng.randint(1, 3)
        filterWidth = self.rng.randint(1, 8)
        filterHeight = self.rng.randint(1, 8)
        filterDur = self.rng.randint(1, 8)
        self.d.get_value(borrow=True, return_internal_type=True)[0] = \
            self.rng.randint(1, 15)
        self.d.get_value(borrow=True, return_internal_type=True)[1] = \
            self.rng.randint(1, 15)
        self.d.get_value(borrow=True, return_internal_type=True)[2] = \
            self.rng.randint(1, 15)
        # Strides along rows, columns and time respectively.
        dr = self.d.get_value(borrow=True)[0]
        dc = self.d.get_value(borrow=True)[1]
        dt = self.d.get_value(borrow=True)[2]
        numFilters = self.rng.randint(1, 3)
        row_steps = self.rng.randint(1, 4)
        col_steps = self.rng.randint(1, 4)
        time_steps = self.rng.randint(1, 4)
        # print (row_steps,col_steps,time_steps)
        # Video just large enough for the chosen number of strided steps,
        # plus a little slack.
        videoDur = (time_steps - 1) * dt + filterDur + self.rng.randint(0, 3)
        videoWidth = (col_steps - 1) * dc + filterWidth + self.rng.randint(0, 3)
        videoHeight = (row_steps - 1) * dr + filterHeight + self.rng.randint(0, 3)
        inputChannels = self.rng.randint(1, 15)
        self.W.set_value(self.random_tensor(
            numFilters, filterHeight,
            filterWidth, filterDur, inputChannels), borrow=True)
        self.b.set_value(self.random_tensor(numFilters), borrow=True)
        # just needed so H_shape works
        self.V.set_value(self.random_tensor(
            batchSize, videoHeight, videoWidth,
            videoDur, inputChannels), borrow=True)
        self.rb.set_value(self.random_tensor(inputChannels), borrow=True)
        H_shape = self.H_shape_func()
        # make index maps
        # h/r/c/t map each flat output index back to its filter, row,
        # column and time coordinates.
        h = N.zeros(H_shape[1:], dtype='int32')
        r = N.zeros(H_shape[1:], dtype='int32')
        c = N.zeros(H_shape[1:], dtype='int32')
        t = N.zeros(H_shape[1:], dtype='int32')
        for qi in xrange(0, H_shape[4]):
            h[:, :, :, qi] = qi
        for qi in xrange(0, H_shape[1]):
            r[qi, :, :, :] = qi
        for qi in xrange(0, H_shape[2]):
            c[:, qi, :, :] = qi
        for qi in xrange(0, H_shape[3]):
            t[:, :, qi, :] = qi
        hn = H_shape[1] * H_shape[2] * H_shape[3] * H_shape[4]
        h = h.reshape((hn))
        r = r.reshape((hn))
        c = c.reshape((hn))
        t = t.reshape((hn))
        Hv = self.random_tensor(*H_shape)
        Vv = self.transp_func(Hv, [videoHeight, videoWidth, videoDur])
        n = inputChannels * videoHeight * videoWidth * videoDur
        # Broadcast the reconstruction bias rb over the video volume.
        rbim = N.zeros((videoHeight, videoWidth, videoDur, inputChannels))
        for qi in xrange(0, inputChannels):
            rbim[:, :, :, qi] = self.rb.get_value(borrow=True)[qi]
        rbv = rbim.reshape((n))
        # Build the (mostly zero) Toeplitz-like matrix: one row per output
        # position, holding the filter placed at that position's offset.
        W_mat = N.zeros((hn, n))
        Vv_mat = N.zeros((n, batchSize))
        Hv_mat = N.zeros((hn, batchSize))
        for qi in xrange(0, hn):
            hi = h[qi]
            ri = r[qi]
            ci = c[qi]
            ti = t[qi]
            placed_filter = N.zeros(self.V.get_value(borrow=True).shape[1:])
            placed_filter[
                ri * dr:ri * dr + self.W.get_value(borrow=True).shape[1],
                ci * dc:ci * dc + self.W.get_value(borrow=True).shape[2],
                ti * dt:ti * dt + self.W.get_value(borrow=True).shape[3],
                :] = self.W.get_value(borrow=True)[hi, :, :, :, :]
            W_mat[qi, :] = placed_filter.reshape((n))
            Hv_mat[qi, :] = Hv[:, ri, ci, ti, hi]
        for qi in xrange(0, batchSize):
            Vv_mat[:, qi] = Vv[qi, :, :, :, :].reshape((n))
        # Sparse transposed multiply: V = W^T . H + rb.
        W_mat_T = sparse.csr_matrix(W_mat.transpose())
        temp = W_mat_T * Hv_mat
        V_mat = (temp.transpose() + rbv).transpose()
        if N.abs(V_mat - Vv_mat).max() > 1e-5:
            # Diagnostic dump before failing.
            print('mul')
            print(V_mat)
            print('conv')
            print(Vv_mat)
            for i in xrange(0, n):
                for j in xrange(0, batchSize):
                    if abs(V_mat[i, j] - Vv_mat[i, j]) > 1e-5:
                        print(('wrong at %d,%d: %f mul versus %f conv'
                               % (i, j, V_mat[i, j], Vv_mat[i, j])))
            assert False

    def test_infer_shape(self):
        """Check Op.infer_shape for Conv3D, ConvTransp3D and ConvGrad3D."""
        self.randomize()
        # Conv3D
        self._compile_and_check([], [self.H], [], Conv3D)
        # ConvTransp3D
        self._compile_and_check(
            [self.RShape], [self.R],
            [self.V.get_value(borrow=True).shape[1:4]], ConvTransp3D)
        # ConvGrad3D
        self._compile_and_check(
            [self.RShape],
            [T.grad(self.reconsObj, self.W), T.grad(self.reconsObj, self.H),
             T.grad(self.reconsObj, self.V), T.grad(self.reconsObj, self.b)],
            [self.V.get_value(borrow=True).shape[1:4]], ConvGrad3D)

    def test_gradient(self):
        """Numerically verify the gradients of all three ops via
        verify_grad and the Dummy* single-scalar wrappers."""
        self.randomize()
        rng, V, W, b, d, rb = self.rng, self.V, self.W, self.b, self.d, self.rb
        dCdH = shared(self.random_tensor(*self.H_shape_func()))
        testsPerDir = 2
        theano.tests.unittest_tools.verify_grad(DummyConv3D(
            rng, (V, W, b), d), [0.0], n_tests=testsPerDir)
        theano.tests.unittest_tools.verify_grad(
            DummyConvTransp3D(
                rng, (W, rb, dCdH), d, V.get_value(borrow=True).shape[1:4]),
            [0.0], n_tests=testsPerDir)
        theano.tests.unittest_tools.verify_grad(
            DummyConvGrad3D(
                rng, (V, dCdH), d, W.get_value(borrow=True).shape),
            [0.0], n_tests=testsPerDir)
if __name__ == '__main__':
    # Manual entry point: run just the shape-inference check directly,
    # bypassing the test runner.
    case = TestConv3D('setUp')
    case.setUp()
    case.test_infer_shape()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论