Commit 023320fe authored by Ian Goodfellow

added Conv3D and related ops

parent e8c50c78
import theano
import theano.tensor as T
import numpy as N
from theano.sandbox.cuda import cuda_available, cuda_enabled
from theano.sandbox.cuda.basic_ops import *
from util import strutil
from .Conv3D import Conv3D
if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType, float32_shared_constructor
class GpuConv3D(theano.Op):
""" GPU implementation of Conv3D """
def __eq__(self, other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def __str__(self):
return '%s' % (self.__class__.__name__)
def make_node(self, V, W, b, d):
"""
:param V: Visible unit, input
:param W: Weights, filter
:param b: bias
:param d: strides when moving the filter over the input
"""
V_ = as_cuda_ndarray_variable(V)
W_ = as_cuda_ndarray_variable(W)
b_ = as_cuda_ndarray_variable(b)
d_ = T.as_tensor_variable(d)
return theano.Apply(self, inputs=[V_, W_, b_, d_],
outputs = [ CudaNdarrayType(dtype=V_.dtype, broadcastable=(V_.broadcastable[0],W_.broadcastable[0],False,False,False))() ] )
def c_code_cache_version(self):
return ()
def c_code(self, node, nodename, (V,W,b,d), outputs, sub):
fail = sub['fail']
H = outputs[0]
codeSource = """
///////////// < code generated by GpuConv3D >
//printf("\t\t\t\tConv3DGPU c code\\n");
//Check dimensionality of inputs
if (%(W)s->nd != 5)
{
PyErr_Format(PyExc_ValueError, "GpuConv3D: W must be a 5 dimensional CudaNdarray");
%(fail)s
}
if (%(V)s->nd != 5)
{
PyErr_Format(PyExc_ValueError, "GpuConv3D: V must be a 5 dimensional CudaNdarray");
%(fail)s
}
if (%(b)s->nd != 1)
{
PyErr_Format(PyExc_ValueError, "GpuConv3D: b must be a vector CudaNdarray");
%(fail)s
}
if (%(d)s->nd != 1)
{
PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray");
%(fail)s
}
if (%(d)s->dimensions[0] != 3)
{
PyErr_Format(PyExc_ValueError, "GpuConv3D: 3 stride length arguments expected (row, col, time) but %%li were given", %(d)s->dimensions[0]);
%(fail)s
}
{ //extra scope so fail doesn't jump over declarations
//Read and check sizes of inputs
const int batchSize = CudaNdarray_HOST_DIMS(%(V)s)[0];
const int outputChannels = CudaNdarray_HOST_DIMS(%(W)s)[0];
const int inputChannels = CudaNdarray_HOST_DIMS(%(V)s)[4];
if (CudaNdarray_HOST_DIMS(%(W)s)[4] != inputChannels)
{
PyErr_Format(PyExc_ValueError, "Conv3D: W operates on a %%i channel image but the image has %%i channels",CudaNdarray_HOST_DIMS(%(W)s)[4],inputChannels);
%(fail)s
}
{ //extra scope so error handler jumps don't cause errors
const int filterHeight = CudaNdarray_HOST_DIMS(%(W)s)[1];
const int filterWidth = CudaNdarray_HOST_DIMS(%(W)s)[2];
const int filterDur = CudaNdarray_HOST_DIMS(%(W)s)[3];
const int vidHeight = CudaNdarray_HOST_DIMS(%(V)s)[1];
const int vidWidth = CudaNdarray_HOST_DIMS(%(V)s)[2];
const int vidDur = CudaNdarray_HOST_DIMS(%(V)s)[3];
if (vidHeight < filterHeight)
{
PyErr_Format(PyExc_ValueError, "W has a height of %%i but V is only %%i pixels tall",filterHeight,vidHeight);
%(fail)s
}
{ // extra scope so fail works
if (vidWidth < filterWidth)
{
PyErr_Format(PyExc_ValueError, "W has a width of %%i but V is only %%i pixels wide",filterWidth,vidWidth);
%(fail)s
}
{ // extra scope so fail works
if (vidDur < filterDur)
{
PyErr_Format(PyExc_ValueError, "W has a duration of %%i but V is only %%i pixels long",filterDur,vidDur);
%(fail)s
}
{ // extra scope so fail works
//Read and check stride arguments
const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
if (dr <= 0 || dc <= 0 || dt <= 0)
{
PyErr_Format(PyExc_ValueError, "GpuConv3D: Strides must all be positive but are %%i, %%i, %%i", dr, dc, dt);
%(fail)s
}
{ // extra scope so fail works
//Make correctly sized output
const int outputHeight = int( (vidHeight - filterHeight) / dr )+1;
const int outputWidth = int( (vidWidth - filterWidth) / dc )+1;
const int outputDur = int( (vidDur - filterDur) / dt ) +1;
npy_intp dims[5];
dims[0] = batchSize;
dims[4] = outputChannels;
dims[1] = outputHeight;
dims[2] = outputWidth;
dims[3] = outputDur;
if(!(%(H)s) || CudaNdarray_HOST_DIMS(%(H)s)[0]!=dims[0] ||
CudaNdarray_HOST_DIMS(%(H)s)[1]!=dims[1] ||
CudaNdarray_HOST_DIMS(%(H)s)[2]!=dims[2] ||
CudaNdarray_HOST_DIMS(%(H)s)[3]!=dims[3] ||
CudaNdarray_HOST_DIMS(%(H)s)[4]!=dims[4]){
Py_XDECREF(%(H)s);
%(H)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
if (!(%(H)s)) {
PyErr_Format(PyExc_MemoryError, "GpuConv3D: could not allocate output");
%(fail)s
}
}
{ // extra scope so fail will not cross declarations
//#define ELEM_AT(x, i) * ( dtype_ ## x *) ( x->data + (i) )####################
const int ws4 = CudaNdarray_HOST_STRIDES(%(W)s)[4];
const int vs4 = CudaNdarray_HOST_STRIDES(%(V)s)[4];
const int ws3 = CudaNdarray_HOST_STRIDES(%(W)s)[3];
const int vs3 = CudaNdarray_HOST_STRIDES(%(V)s)[3];
const int ws2 = CudaNdarray_HOST_STRIDES(%(W)s)[2];
const int vs2 = CudaNdarray_HOST_STRIDES(%(V)s)[2];
const int ws1 = CudaNdarray_HOST_STRIDES(%(W)s)[1];
const int vs1 = CudaNdarray_HOST_STRIDES(%(V)s)[1];
const int ws0 = CudaNdarray_HOST_STRIDES(%(W)s)[0];
const int vs0 = CudaNdarray_HOST_STRIDES(%(V)s)[0];
// Compute H
//H[i,x,y,t,j] = b_j + sum_k sum_l sum_m sum_z W[j,k,l,m,z] V[i, dr*r+k,dc*c+l,dt*t+m,z]
bool out_contiguous = CudaNdarray_is_c_contiguous(%(H)s);
int version = -1;
int verbose = 0;
bool subsample =(dr>1)||(dc>1)||(dt>1);
bool b_strided = (CudaNdarray_HOST_STRIDES(%(b)s)[0]!=1) && !(CudaNdarray_HOST_STRIDES(%(b)s)[0]==0 && outputChannels==1);
bool work_complete = false;
if(out_contiguous && !b_strided && (version==0||version==-1) && outputDur<=512 && !work_complete){
//conv_rows_stack
dim3 grid(outputHeight*outputWidth,batchSize*outputChannels);
dim3 threads(outputDur);
int shared_size=0;
conv_rows_stack<<<grid, threads, shared_size>>>(
CudaNdarray_DEV_DATA(%(V)s), CudaNdarray_DEV_DATA(%(W)s), CudaNdarray_DEV_DATA(%(b)s), CudaNdarray_DEV_DATA(%(H)s),
vidHeight, vidWidth, vidDur,
filterHeight, filterWidth, filterDur,
outputChannels, inputChannels,
dr,dc,dt,
vs3,vs2,vs1,vs4,vs0,
ws3,ws2,ws1,ws4,ws0);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: used 'conv_rows_stack' version\\n");
}
else
{
if (verbose) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("ERROR: all implementations failed for GpuConv3D! (%%s)",cudaGetErrorString(sts));
PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConv3D! (%%s)",
cudaGetErrorString(sts));
%(fail)s
}
}
if(!work_complete){
PyErr_Format(PyExc_RuntimeError, "ERROR: no implementations executed for this GpuConv3D!");
%(fail)s
}
}}}}}}} //extra scope so error handler jumps don't cross declarations
///////////// < /code generated by GpuConv3D >
"""
return strutil.renderString(codeSource,locals())
def c_support_code_apply(self, node, nodename):
# This code is not sensitive to the ignore_border flag.
# It runs for every position in the output z, and then computes the gradient for the
# input pixels that were downsampled to that z-position.
codeSource = """
__global__ void
//thread block size = out_dur
//grid block size =(out_len*out_wid, nb kern *nb batch)
//
conv_rows_stack( float* img, float* kern, float* bias, float* out,
int img_len, int img_wid, int img_dur,
int kern_height, int kern_wid, int kern_dur,
int nkern, int input_channels,
int dr, int dc, int dt,
int img_stride_frame, int img_stride_col, int img_stride_row,
int img_stride_ochannel, int img_stride_batch,
int kern_stride_frame, int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_okern)
{
int __shared__ out_len, out_wid, out_dur, batch_id, kern_id;
float __shared__ *d_img, *d_kern;
out_len = int( (img_len - kern_height) / dr )+1;
out_wid = int( (img_wid - kern_wid) / dc )+1;
out_dur = int( (img_dur - kern_dur) / dt )+1;
batch_id= blockIdx.y/nkern;
kern_id = blockIdx.y - batch_id*nkern;
const int out_row = blockIdx.x%out_len;
const int out_col = blockIdx.x/out_len;
const int out_frame=threadIdx.x;
img += batch_id*img_stride_batch + out_row*dr*img_stride_row + out_col*dc*img_stride_col+out_frame*dt*img_stride_frame;
kern += kern_id*kern_stride_okern;
float sum = 0.0f;
for (int z = 0; z < input_channels; z++) {//1 for first layer
for (int k =0; k < kern_height; k++) {
for (int l = 0; l < kern_wid; l++) {
for (int m = 0; m < kern_dur; m++) {
sum += img[img_stride_ochannel*z+img_stride_row*k+img_stride_col*l+img_stride_frame*m] *
kern[kern_stride_stack*z+kern_stride_row*k+kern_stride_col*l+kern_stride_frame*m];
}
}
}
out[batch_id*nkern*out_len*out_wid*out_dur+//the good batch
out_frame*nkern+//the output frame
out_row*out_wid*out_dur*nkern+//the output row
out_col*out_dur*nkern + //the output_col
kern_id //the output image (channel)
] = sum + bias[kern_id];
}
}
"""
return codeSource#renderString(codeSource,locals())
gpu_convd = GpuConv3D()


@theano.sandbox.cuda.opt.register_opt()
@theano.gof.opt.local_optimizer([])
def local_gpu_conv3d(node):
    """Lift Conv3D to the GPU when some input already lives there.

    Applies only to float32 graphs.  Returns a replacement node list or
    None (implicitly) when the optimization does not apply.
    """
    if isinstance(node.op, Conv3D):
        # This module imports numpy as N; the bare name `numpy` was not
        # guaranteed to be in scope here.
        if N.any([i.owner and isinstance(i.owner.op, HostFromGpu)
                  for i in node.inputs]):
            if N.all([o.type.dtype == 'float32' for o in node.outputs]):
                V, W, b, d = node.inputs
                return [host_from_gpu(gpu_convd(as_cuda_ndarray_variable(V),
                                                as_cuda_ndarray_variable(W),
                                                as_cuda_ndarray_variable(b),
                                                d))]
import theano
import theano.tensor as T
import numpy as N
from theano.sandbox.cuda import cuda_available, cuda_enabled
from theano.sandbox.cuda.basic_ops import *
from util import strutil
from .ConvGrad3D import ConvGrad3D
class GpuConvGrad3D(theano.Op):
""" GPU version of gradient of ConvGrad3D with respect to W """
def make_node(self, V, d, WShape, dCdH):
"""
:param V: visible
:param d: strides
:param WShape: shapes of the weights -> shape of this op output
:param dCdH: other input with what V will be convolved.
"""
V_ = as_cuda_ndarray_variable(V)
d_ = T.as_tensor_variable(d)
WShape_ = T.as_tensor_variable(WShape)
dCdH_ = as_cuda_ndarray_variable(dCdH)
return theano.Apply(self, inputs=[V_, d_, WShape_, dCdH_],
outputs = [ CudaNdarrayType(dtype=V_.dtype, broadcastable=(False,)*5)()])
def perform_(self, node, inputs, output_storage):
V, d, WShape, dCdH = inputs
print "GpuConvGrad3D python code (warning not updated to new format)"
#partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) * V[i,z,dr*p+k,dc*q+l,dt*r+m]
batchSize = dCdH.shape[0]
outputFilters = dCdH.shape[1]
outputHeight = dCdH.shape[2]
outputWidth = dCdH.shape[3]
outputDur = dCdH.shape[4]
assert V.shape[0] == batchSize
inputFilters = V.shape[1]
inputHeight = V.shape[2]
inputWidth = V.shape[3]
inputDur = V.shape[4]
dr, dc, dt = d
dCdW = N.zeros(WShape, dtype=V.dtype)
#block
for j in xrange(0,WShape[0]):
for z in xrange(0,WShape[1]):
for k in xrange(0,WShape[2]):
for l in xrange(0,WShape[3]):
#threads
for m in xrange(0,WShape[4]):
#thread
for i in xrange(0,batchSize):
for p in xrange(0,outputHeight):
for q in xrange(0,outputWidth):
for r in xrange(0,outputDur):
dCdW[j,z,k,l,m] += dCdH[i,j,p,q,r] * V[i,z,dr*p+k,dc*q+l,dt*r+m]
output_storage[0][0] = dCdW
def c_code(self, node, nodename, (V,d,WShape,dCdH), outputs, sub):
fail = sub['fail']
dCdW = outputs[0]
codeSource = """
///////////// < code generated by GpuConvGrad3D >
//printf("\t\t\t\tGpuConvGrad3DW c code\\n");
//Check dimensionality of inputs
if (%(dCdH)s->nd != 5)
{
PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: dCdH must be a 5-d CudaNdArray");
%(fail)s
}
if (%(V)s->nd != 5)
{
PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: V must be a 5-d CudaNdArray");
%(fail)s
}
if (%(WShape)s->nd != 1)
{
PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: WShape must be a 1-d CudaNdArray");
%(fail)s
}
if (%(d)s->nd != 1)
{
PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: d must be a 1-d CudaNdArray");
%(fail)s
}
if (%(d)s->dimensions[0] != 3)
{
PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: 3 stride lengths arguments expected(for row, col, and time) but %%li were given", %(d)s->dimensions[0]);
%(fail)s
}
{ // for fail
//Read and check sizes of inputs
const int batchSize = CudaNdarray_HOST_DIMS(%(V)s)[0];
if (%(WShape)s->dimensions[0] != 5)
{
PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: WShape must specify a 5-d shape");
%(fail)s
}
if (!PyArray_ISCONTIGUOUS(%(WShape)s))
{
PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: WShape must be contiguous");
%(fail)s
}
{ //for fail
dtype_%(WShape)s * WShape = (dtype_%(WShape)s *) %(WShape)s->data;
const int outputChannels = WShape[0];
const int inputChannels = CudaNdarray_HOST_DIMS(%(V)s)[4];
if (WShape[4] != inputChannels)
{
PyErr_Format(PyExc_ValueError, "ConvGrad3D: W operates on a %%d channel image but the image has %%d channels",WShape[4],inputChannels);
%(fail)s
}
{ //extra scope so fail works
const int filterHeight = WShape[1];
const int filterWidth = WShape[2];
const int filterDur = WShape[3];
const int vidHeight = CudaNdarray_HOST_DIMS(%(V)s)[1];
const int vidWidth = CudaNdarray_HOST_DIMS(%(V)s)[2];
const int vidDur = CudaNdarray_HOST_DIMS(%(V)s)[3];
if (vidHeight < filterHeight)
{
PyErr_Format(PyExc_ValueError, "W has a height of %%i but V is only %%i pixels tall", filterHeight, vidHeight);
%(fail)s
}
if (vidWidth < filterWidth)
{
PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: W has a width of %%i but V is only %%i pixels wide", filterWidth, vidWidth);
%(fail)s
}
if (vidDur < filterDur)
{
PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: W has a duration of %%i but V is only %%i pixels long", filterWidth, vidWidth);
%(fail)s
}
{ // extra scope so fail works
//Read and check stride arguments
const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
if (dr <= 0 || dc <= 0 || dt <= 0)
{
PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: Strides must all be positive but are %%i, %%i, %%i",dr,dc,dt);
%(fail)s
}
//Compute correctl sized of output
const int outputHeight = int( (vidHeight - filterHeight) / dr )+1;
const int outputWidth = int( (vidWidth - filterWidth) / dc )+1;
const int outputDur = int( (vidDur - filterDur) / dt ) +1;
if (CudaNdarray_HOST_DIMS(%(dCdH)s)[0] != batchSize ||
CudaNdarray_HOST_DIMS(%(dCdH)s)[4] != outputChannels ||
CudaNdarray_HOST_DIMS(%(dCdH)s)[1] != outputHeight ||
CudaNdarray_HOST_DIMS(%(dCdH)s)[2] != outputWidth ||
CudaNdarray_HOST_DIMS(%(dCdH)s)[3] != outputDur)
{
PyErr_Format(PyExc_ValueError, "dCdH is the wrong size, expected (%%i,%%i,%%i,%%i,%%i), got (%%i,%%i,%%i,%%i,%%i)", batchSize, outputHeight, outputWidth, outputDur, outputChannels, CudaNdarray_HOST_DIMS(%(dCdH)s)[0], CudaNdarray_HOST_DIMS(%(dCdH)s)[1], CudaNdarray_HOST_DIMS(%(dCdH)s)[2] ,CudaNdarray_HOST_DIMS(%(dCdH)s)[3], CudaNdarray_HOST_DIMS(%(dCdH)s)[4] );
%(fail)s
}
{ // extra scope for fail
npy_intp dims[5];
dims[0] = outputChannels;
dims[4] = inputChannels;
dims[1] = filterHeight;
dims[2] = filterWidth;
dims[3] = filterDur;
if(!(%(dCdW)s) || CudaNdarray_HOST_DIMS(%(dCdW)s)[0]!=dims[0] ||
CudaNdarray_HOST_DIMS(%(dCdW)s)[1]!=dims[1] ||
CudaNdarray_HOST_DIMS(%(dCdW)s)[2]!=dims[2] ||
CudaNdarray_HOST_DIMS(%(dCdW)s)[3]!=dims[3] ||
CudaNdarray_HOST_DIMS(%(dCdW)s)[4]!=dims[4] ){
Py_XDECREF(%(dCdW)s);
%(dCdW)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
if (!(%(dCdW)s)) {
PyErr_Format(PyExc_MemoryError, "GpuConvGrad3D: Could not allocated dCdW");
%(fail)s
}
}
{ //for fail
const int dcdhs4 = CudaNdarray_HOST_STRIDES(%(dCdH)s)[4];
const int dcdhs3 = CudaNdarray_HOST_STRIDES(%(dCdH)s)[3];
const int dcdhs1 = CudaNdarray_HOST_STRIDES(%(dCdH)s)[1];
const int dcdhs2 = CudaNdarray_HOST_STRIDES(%(dCdH)s)[2];
const int dcdhs0 = CudaNdarray_HOST_STRIDES(%(dCdH)s)[0];
const int vs4 = CudaNdarray_HOST_STRIDES(%(V)s)[4];
const int vs3 = CudaNdarray_HOST_STRIDES(%(V)s)[3];
const int vs2 = CudaNdarray_HOST_STRIDES(%(V)s)[2];
const int vs1 = CudaNdarray_HOST_STRIDES(%(V)s)[1];
const int vs0 = CudaNdarray_HOST_STRIDES(%(V)s)[0];
bool out_contiguous = CudaNdarray_is_c_contiguous(%(dCdW)s);
int version = -1;
int verbose = 0;
bool subsample =(dr>1)||(dc>1)||(dt>1);
bool work_complete = false;
if(out_contiguous && (version==0||version==-1) && WShape[4]<=512 && !work_complete){
//conv_rows_stack
dim3 grid(WShape[0]*WShape[4],WShape[1]*WShape[2]);//outputHeight*outputWidth);
dim3 threads(WShape[3]);
int shared_size=0;
convgrad_rows_stack<<<grid, threads, shared_size>>>(
CudaNdarray_DEV_DATA(%(V)s), CudaNdarray_DEV_DATA(%(dCdH)s), CudaNdarray_DEV_DATA(%(dCdW)s),
vidHeight, vidWidth, vidDur,
filterHeight, filterWidth, filterDur,
WShape[0], WShape[1], WShape[2], WShape[3], WShape[4],
outputHeight,outputWidth,outputDur,
batchSize, outputChannels, inputChannels,
dr,dc,dt,
vs3,vs2,vs1,vs4,vs0,
dcdhs3,dcdhs2,dcdhs1,dcdhs4,dcdhs0);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: used 'conv_rows_stack' version\\n");
}
else
{
if (verbose) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("ERROR: all implementations failed for GpuConv3D! (%%s)",cudaGetErrorString(sts));
PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConvGrad3D! (%%s)",
cudaGetErrorString(sts));
%(fail)s
}
}
if(!work_complete){
PyErr_Format(PyExc_RuntimeError, "ERROR: no implementations executed for this GpuConv3D!");
%(fail)s
}
}}}}} // extra scope for fail
///////////// < /code generated by GpuConvGrad3D >
"""
return strutls.render_string(codeSource,locals())
def c_support_code_apply(self, node, nodename):
# This code is not sensitive to the ignore_border flag.
# It runs for every position in the output z, and then computes the gradient for the
# input pixels that were downsampled to that z-position.
codeSource = """
__global__ void
//thread block size = WShape[4]
//grid block size = (WShape[0]*WShape[1],WShape[2]*WShape[3])
//
convgrad_rows_stack( float* img, float* dCdH, float* dCdW,
int img_len, int img_wid, int img_dur,
int dCdW_len, int dCdW_wid, int dCdW_dur,
int wsh0, int wsh1, int wsh2, int wsh3, int wsh4,
int out_len, int out_wid, int out_dur,
int batchSize, int nkern, int nstack,
int dr, int dc, int dt,
int img_stride_frame, int img_stride_col, int img_stride_row,
int img_stride_stack, int img_stride_batch,
int dCdW_stride_frame, int dCdW_stride_col, int dCdW_stride_row,
int dCdW_stride_stack, int dCdW_stride_nkern)
{
int __shared__ kern_id, stack_id;
float __shared__ *d_img, *d_kern;
kern_id= blockIdx.x%nkern;
stack_id = blockIdx.x/nkern;
const int dCdW_row = blockIdx.y%ws1;
const int dCdW_col = blockIdx.y/ws1;
const int dCdW_frame=threadIdx.x;
img +=stack_id*img_stride_stack;
dCdH +=kern_id*dCdW_stride_stack;
float sum = 0.0f;
for(int i=0;i<batchSize;i++){
for(int p=0;p<out_len;p++){
for(int q=0;q<out_wid;q++){
for(int r=0;r<out_dur;r++){
sum += dCdH[i*dCdW_stride_nkern+p*dCdW_stride_row+q*dCdW_stride_col+r*dCdW_stride_frame] *
img[i*img_stride_batch+(dr*p+dCdW_row)*img_stride_row+(dc*q+dCdW_col)*img_stride_col+(dt*r+dCdW_frame)*img_stride_frame];
}
}
}
}
dCdW[kern_id*wsh1*wsh2*wsh3*wsh4+//the good batch
stack_id+//the output image
dCdW_row*wsh2*wsh3*wsh4+//the output row
dCdW_col*wsh3*wsh4 + //the output_col
dCdW_frame*wsh4] = sum;
}
/*
#block
for j in xrange(0,WShape[0]):
for z in xrange(0,WShape[1]):
for k in xrange(0,WShape[2]):
for l in xrange(0,WShape[3]):
#threads
for m in xrange(0,WShape[4]):
#thread
for i in xrange(0,batchSize):
for p in xrange(0,outputHeight):
for q in xrange(0,outputWidth):
for r in xrange(0,outputDur):
dCdW[j,z,k,l,m] += dCdH[i,j,p,q,r] * V[i,z,dr*p+k,dc*q+l,dt*r+m]
*/
"""
return codeSource#renderString(codeSource,locals())
gpu_conv_grad3d = GpuConvGrad3D()


@theano.sandbox.cuda.opt.register_opt()
@theano.gof.opt.local_optimizer([])
def local_gpu_conv_gradd(node):
    """Lift ConvGrad3D to the GPU when some input already lives there.

    Applies only to float32 graphs.
    """
    if isinstance(node.op, ConvGrad3D):
        # This module imports numpy as N; the bare name `numpy` was not
        # guaranteed to be in scope here.
        if N.any([i.owner and isinstance(i.owner.op, HostFromGpu)
                  for i in node.inputs]):
            if N.all([o.type.dtype == 'float32' for o in node.outputs]):
                V, d, WShape, dCdH = node.inputs
                return [host_from_gpu(gpu_conv_grad3d(
                    as_cuda_ndarray_variable(V),
                    d, WShape,
                    as_cuda_ndarray_variable(dCdH)))]
import numpy as N
import theano.tensor as T
from util import strutil
import theano
from .ConvTransp3D import ConvTransp3D
from theano.sandbox.cuda import cuda_available, cuda_enabled
from theano.sandbox.cuda.basic_ops import *
if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType, float32_shared_constructor
class GpuConvTransp3D(theano.Op):
""" The gpu version of ConvTransp3D """
def __eq__(self,other):
return type(self) == type(other)
def __hash__(self):
return hash(type(self))
def make_node(self, W, b, d, H, RShape = None):
W_ = as_cuda_ndarray_variable(W)
b_ = as_cuda_ndarray_variable(b)
d_ = T.as_tensor_variable(d)
H_ = as_cuda_ndarray_variable(H)
if RShape:
RShape_ = T.as_tensor_variable(RShape)
else:
RShape_ = T.as_tensor_variable([-1,-1,-1])
return theano.Apply(self, inputs=[W_,b_,d_,H_, RShape_],
outputs = [CudaNdarrayType(dtype=H_.dtype,
broadcastable=(False,)*5)()])
def infer_shape(self, node, input_shapes):
W,b,d,H,RShape = node.inputs
W_shape, b_shape, d_shape, H_shape, RShape_shape = input_shapes
return [(H_shape[0], W_shape[1], RShape[0], RShape[1], RShape[2])]
def perform_(self, node, inputs, output_storage):
W, b, d, H, RShape = inputs
print "\t\t\t\tGpuConvTransp3D python code still uses old format"
output_storage[0][0] = computeR(W,b,d,H,RShape)
def c_code_cache_version(self):
return ()
def c_code(self, node, nodename, (W, b, d, H, RShape), outputs, sub):
fail = sub['fail']
R = outputs[0]
codeSource = """
///////////// < code generated by GpuConvTransp3D >
//printf("\t\t\t\tGpuConvTransp c code\\n");
//Check dimensionality of inputs
if (%(H)s->nd != 5)
{
PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: H must be a 5-D tensor but it is %%i-D",%(H)s->nd);
%(fail)s
}
if (%(W)s->nd != 5)
{
PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: W must be a 5-D tensor");
%(fail)s
}
if (%(b)s->nd != 1)
{
PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: b must be a vector");
%(fail)s
}
if (%(d)s->nd != 1)
{
PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: d must be a vector");
%(fail)s
}
//Read and check stride arguments
if (%(d)s->dimensions[0] != 3)
{
PyErr_Format(PyExc_ValueError,"GpuConvTransp3D: 3 stride length arguments expected (for row, col, and time) but %%li were given", %(d)s->dimensions[0]);
%(fail)s
}
{ // for fail
const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
if (dr <= 0 || dc <= 0 || dt <= 0)
{
PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: Strides must all be positive but are %%i, %%i, %%i",dr,dc,dt);
%(fail)s
}
//Read and check sizes of inputs
{ // for fail
const int batchSize = CudaNdarray_HOST_DIMS(%(H)s)[0];
const int outputChannels = CudaNdarray_HOST_DIMS(%(W)s)[0];
if (CudaNdarray_HOST_DIMS(%(H)s)[4] != outputChannels)
{
PyErr_Format(PyExc_ValueError, "W produces a %%i channel image but the image has %%i channels. W.shape: (%%i, %%i, %%i,%%i, %%i) H.shape: (%%i, %%i, %%i, %%i, %%i)",outputChannels,CudaNdarray_HOST_DIMS(%(H)s)[4], CudaNdarray_HOST_DIMS(%(W)s)[0], CudaNdarray_HOST_DIMS(%(W)s)[1], CudaNdarray_HOST_DIMS(%(W)s)[2], CudaNdarray_HOST_DIMS(%(W)s)[3], CudaNdarray_HOST_DIMS(%(W)s)[4], CudaNdarray_HOST_DIMS(%(H)s)[0], CudaNdarray_HOST_DIMS(%(H)s)[1], CudaNdarray_HOST_DIMS(%(H)s)[2], CudaNdarray_HOST_DIMS(%(H)s)[3], CudaNdarray_HOST_DIMS(%(H)s)[4]);
%(fail)s
}
{ // for fail
const int inputChannels = CudaNdarray_HOST_DIMS(%(W)s)[4];
if (CudaNdarray_HOST_DIMS(%(b)s)[0] != inputChannels)
{
PyErr_Format(PyExc_ValueError, "ConvTransp3D: b operates on a %%i channel image but the image has %%i channels", CudaNdarray_HOST_DIMS(%(b)s)[0], inputChannels );
%(fail)s
}
{ // for fail
const int filterHeight = CudaNdarray_HOST_DIMS(%(W)s)[1];
const int filterWidth = CudaNdarray_HOST_DIMS(%(W)s)[2];
const int filterDur = CudaNdarray_HOST_DIMS(%(W)s)[3];
const int outputHeight = CudaNdarray_HOST_DIMS(%(H)s)[1];
const int outputWidth = CudaNdarray_HOST_DIMS(%(H)s)[2];
const int outputDur = CudaNdarray_HOST_DIMS(%(H)s)[3];
int videoHeight = (outputHeight-1) * dr + filterHeight;
int videoWidth = (outputWidth-1) * dc + filterWidth;
int videoDur = (outputDur-1) * dt + filterDur;
if (%(RShape)s)
{
if (%(RShape)s->nd != 1)
{
PyErr_Format(PyExc_ValueError, "RShape must be a vector");
%(fail)s
}
if (%(RShape)s->dimensions[0] != 3)
{
PyErr_Format(PyExc_ValueError, "RShape must specify a 3D shape ( [height,width,duration] )");
%(fail)s
}
{ // for fail
dtype_%(RShape)s RShape0 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,0);
dtype_%(RShape)s RShape1 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,1);
dtype_%(RShape)s RShape2 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,2);
if (RShape0 != -1)
{
if (RShape0 < videoHeight || RShape1 < videoWidth || RShape2 < videoDur)
{
PyErr_Format(PyExc_ValueError, "Reconstruction must have shape of at least [%%i,%%i,%%i] but RShape argument requests that it be [%%i,%%i,%%i]" , videoHeight, videoWidth, videoDur, RShape0, RShape 1, RShape2 );
%(fail)s
}
videoHeight = RShape0;
videoWidth = RShape1;
videoDur = RShape2;
}
}
//Allocate the reconstruction
npy_intp dims[5];
dims[0] = batchSize;
dims[4] = inputChannels;
dims[1] = videoHeight;
dims[2] = videoWidth;
dims[3] = videoDur;
if(!(%(R)s) || CudaNdarray_HOST_DIMS(%(R)s)[0]!=dims[0] ||
CudaNdarray_HOST_DIMS(%(R)s)[1]!=dims[1] ||
CudaNdarray_HOST_DIMS(%(R)s)[2]!=dims[2] ||
CudaNdarray_HOST_DIMS(%(R)s)[3]!=dims[3] ||
CudaNdarray_HOST_DIMS(%(R)s)[4]!=dims[4]){
Py_XDECREF(%(R)s);
%(R)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
if (!(%(R)s)) {
PyErr_Format(PyExc_MemoryError,"Could not allocate R");
%(fail)s;
}
}
cudaMemset(%(R)s->devdata, 0, 4 * batchSize * inputChannels * videoHeight * videoWidth * videoDur);
{ // for fail
bool out_contiguous = CudaNdarray_is_c_contiguous(%(R)s);
int version = -1;
int verbose = 0;
bool subsample =(dr>1)||(dc>1)||(dt>1);
bool b_strided = (CudaNdarray_HOST_STRIDES(%(b)s)[0]!=1) && !(CudaNdarray_HOST_STRIDES(%(b)s)[0]==0 && outputChannels==1);
printf("b stride0=%%d\\n",CudaNdarray_HOST_STRIDES(%(b)s)[0]);
bool work_complete = false;
const int ws4 = CudaNdarray_HOST_STRIDES(%(W)s)[4];
const int ws3 = CudaNdarray_HOST_STRIDES(%(W)s)[3];
const int ws2 = CudaNdarray_HOST_STRIDES(%(W)s)[2];
const int ws1 = CudaNdarray_HOST_STRIDES(%(W)s)[1];
const int ws0 = CudaNdarray_HOST_STRIDES(%(W)s)[0];
const int hs4 = CudaNdarray_HOST_STRIDES(%(H)s)[4];
const int hs3 = CudaNdarray_HOST_STRIDES(%(H)s)[3];
const int hs2 = CudaNdarray_HOST_STRIDES(%(H)s)[2];
const int hs1 = CudaNdarray_HOST_STRIDES(%(H)s)[1];
const int hs0 = CudaNdarray_HOST_STRIDES(%(H)s)[0];
if(out_contiguous && (version==0||version==-1) && outputDur<=512 && !work_complete){
//conv_transp_rows_stack
dim3 grid(batchSize * inputChannels, videoHeight * videoWidth);
dim3 threads(videoDur);
HERE
int shared_size=0;
conv_transp_rows_stack<<<grid, threads, shared_size>>>(
CudaNdarray_DEV_DATA(%(H)s), CudaNdarray_DEV_DATA(%(W)s), CudaNdarray_DEV_DATA(%(b)s), CudaNdarray_DEV_DATA(%(R)s),
videoHeight, videoWidth, videoDur,
filterHeight, filterWidth, filterDur,
outputHeight, outputWidth, outputDur,
outputChannels, inputChannels,
dr,dc,dt,
hs3,hs2,hs1,hs4,hs0,
ws3,ws2,ws1,ws4,ws0,
CudaNdarray_HOST_STRIDES(%(b)s)[0]);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("INFO: used 'conv_transp_rows_stack' version\\n");
}
else
{
if (verbose) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
if (verbose) printf("ERROR: all implementations failed for GpuConvTransp3D! (%%s)",cudaGetErrorString(sts));
PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConvTransp3D! (%%s)",
cudaGetErrorString(sts));
%(fail)s
}
}
if(!work_complete){
PyErr_Format(PyExc_RuntimeError, "ERROR: no implementations executed for this GpuConvTransp3D! out_contiguous=%%d b_strided=%%d outputDur=%%d",
out_contiguous,b_strided,outputDur);
%(fail)s
}
}}}}}} // for fail
///////////// < /code generated by GpuConvTransp3D >
"""
return renderString(codeSource,locals())
def c_support_code_apply(self, node, nodename):
# This code is not sensitive to the ignore_border flag.
# It runs for every position in the output z, and then computes the gradient for the
# input pixels that were downsampled to that z-position.
codeSource = """
__global__ void
//thread block size = videoDur
//grid block size =(batchSize * inputChannels, videoHeight * videoWidth)
//
conv_transp_rows_stack( float* H, float* kern, float* bias, float* R,
int img_len, int img_wid, int img_dur,
int kern_len, int kern_wid, int kern_dur,
int H_len, int H_wid, int H_dur,
int nkern, int nstack,
int dr, int dc, int dt,
int H_stride_frame, int H_stride_col, int H_stride_row,
int H_stride_stack, int H_stride_batch,
int kern_stride_frame, int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_nkern,
int bias_stride)
{
int __shared__ batch_id, stack_id;
float __shared__ *d_img, *d_kern;
batch_id= blockIdx.x/nstack;
stack_id = blockIdx.x - batch_id*nstack;
const int R_row = blockIdx.y/img_wid;
const int R_col = blockIdx.y%img_wid;
const int R_frame=threadIdx.x;
const int r = R_row;
const int c = R_col;
const int t = R_frame;
const int ftc = max(0, int(ceil(float(t-kern_dur +1 )/float(dt))));
const int fcc = max(0, int(ceil(float(c-kern_wid +1)/float(dc))));
int rc = max(0, int(ceil(float(r-kern_len+1)/float(dr))));
float sum = 0;
while(rc < H_len){
int rk = r - rc * dr;
if(rk < 0)
break;
int cc = fcc;
while( cc < H_wid){
int ck = c - cc * dc;
if(ck < 0)
break;
int tc = ftc;
while(tc < H_dur){
int tk = t - tc * dt;
if(tk < 0)
break;
//R[i,j,r,c,t] += N.dot(W[:,j,rk,ck,tk], H[i,:,rc,cc,tc] )
for(int q=0;q<nkern;q++){
sum += kern[q*kern_stride_nkern+stack_id*kern_stride_stack+rk*kern_stride_row+ck*kern_stride_col+tk*kern_stride_frame]*
H[batch_id*H_stride_batch+q*H_stride_stack+rc*H_stride_row+cc*H_stride_col+tc*H_stride_frame];
}
tc += 1;
}
cc += 1;
}
rc += 1;
}
R[batch_id*nstack*img_len*img_wid*img_dur+//the good batch
stack_id+//the output image
R_row*img_wid*img_dur*nstack+//the output row
R_col*img_dur*nstack + //the output_col
R_frame*nstack] = sum + bias[stack_id*bias_stride];
}
"""
return codeSource
gpu_conv_transpd = GpuConvTransp3D()


@theano.sandbox.cuda.opt.register_opt()
@theano.gof.opt.local_optimizer([])
def local_gpu_conv_transpd(node):
    """Lift ConvTransp3D to the GPU when some input already lives there.

    Applies only to float32 graphs.
    """
    if isinstance(node.op, ConvTransp3D):
        # This module imports numpy as N; the bare name `numpy` was not
        # guaranteed to be in scope here.
        if N.any([i.owner and isinstance(i.owner.op, HostFromGpu)
                  for i in node.inputs]):
            if N.all([o.type.dtype == 'float32' for o in node.outputs]):
                W, b, d, H, RShape = node.inputs
                return [host_from_gpu(gpu_conv_transpd(W, b, d, H, RShape))]
#If the input size wasn't a multiple of D we may need to cause some automatic padding to get the right size of reconstruction
def computeR(W,b,d,H,Rshape = None):
    """Pure-python/numpy reference implementation of ConvTransp3D.

    Multiplies H by the transpose of the implicit convolution matrix defined
    by W and the strides d, and adds the per-channel bias b.

    :param W: weights, shape (outputChannels, inputChannels, filterHeight,
        filterWidth, filterDur)
    :param b: bias, shape (inputChannels,)
    :param d: strides (dr, dc, dt), all positive ints
    :param H: Conv3D output, shape (batchSize, outputChannels, outputHeight,
        outputWidth, outputDur)
    :param Rshape: optional (height, width, duration) of the reconstruction.
        May be larger than the minimal size (automatic padding when the input
        size wasn't a multiple of d); a leading -1 means "use minimal size".
    :return: R, shape (batchSize, inputChannels, videoHeight, videoWidth,
        videoDur), dtype of H.
    """
    assert len(W.shape) == 5
    assert len(H.shape) == 5
    assert len(b.shape) == 1
    assert len(d) == 3

    outputChannels, inputChannels, filterHeight, filterWidth, filterDur = W.shape
    batchSize, outputChannelsAgain, outputHeight, outputWidth, outputDur = H.shape
    assert outputChannelsAgain == outputChannels
    assert b.shape[0] == inputChannels

    dr, dc, dt = d
    assert dr > 0
    assert dc > 0
    assert dt > 0

    # Minimal video size the given output could have come from.
    videoHeight = (outputHeight - 1) * dr + filterHeight
    videoWidth = (outputWidth - 1) * dc + filterWidth
    videoDur = (outputDur - 1) * dt + filterDur

    # Honor an explicitly requested (>= minimal) reconstruction shape.
    if Rshape is not None and Rshape[0] != -1:
        assert Rshape[0] >= videoHeight, (Rshape[0], videoHeight)
        assert Rshape[1] >= videoWidth
        assert Rshape[2] >= videoDur
        videoHeight, videoWidth, videoDur = Rshape

    R = N.zeros((batchSize, inputChannels, videoHeight,
                 videoWidth, videoDur), dtype=H.dtype)
    # Every output location starts from the per-channel bias.
    R += b.reshape(1, inputChannels, 1, 1, 1)

    # R[i,j,r,c,t] = b_j + sum_{rc,rk | d o rc + rk = r} sum_{cc,ck | ...}
    #                sum_{tc,tk | ...} sum_k W[k,j,rk,ck,tk] * H[i,k,rc,cc,tc]
    for i in range(batchSize):
        for j in range(inputChannels):
            for r in range(videoHeight):
                for c in range(videoWidth):
                    for t in range(videoDur):
                        # First hidden coordinate whose filter footprint can
                        # still reach (r, c, t).
                        ftc = max(0, int(N.ceil(float(t - filterDur + 1) / float(dt))))
                        fcc = max(0, int(N.ceil(float(c - filterWidth + 1) / float(dc))))
                        rc = max(0, int(N.ceil(float(r - filterHeight + 1) / float(dr))))
                        while rc < outputHeight:
                            rk = r - rc * dr
                            if rk < 0:
                                break
                            cc = fcc
                            while cc < outputWidth:
                                ck = c - cc * dc
                                if ck < 0:
                                    break
                                tc = ftc
                                while tc < outputDur:
                                    tk = t - tc * dt
                                    if tk < 0:
                                        break
                                    # Dot over the output-channel axis.
                                    R[i, j, r, c, t] += N.dot(W[:, j, rk, ck, tk],
                                                              H[i, :, rc, cc, tc])
                                    tc += 1
                                cc += 1
                            rc += 1
    return R
from ops.Conv3D import *
from ops.ConvGrad3D import *
import theano
from theano.tensor import basic as T
import numpy as N
from theano.sandbox.cuda import cuda_available, cuda_enabled
from util import strutil
from theano import printing
from theano.tensor.blas_headers import blas_header_text
from theano.tensor.blas import ldflags
#Note: not a true convolution because we don't bother with flipping the kernel
#An op that takes a weight tensor W. a bias vector b, and a visible tensor V, produces a hidden unit tensor H
#Also parameterized by integer strides dr,dc,dt
#H[i,r,c,t,j] = video i within the minibatch, feature map j, location and time within feature map (r,c,t)
#W[j,k,l,m,z] = weights connecting H[i,r,c,t,j] to V[i,dr*r+k,dc*c+l,dt*t+m,z]
#b[j] = bias of feature map j
#V[i,r,c,t,j] = pixel at (r,c,t) within video featuremap j of video i within the minibatch
#i.e., H[i,j,r,c,t] = b_j + sum_k sum_l sum_m sum_z W[j,k,l,m,z] V[i,z, dr*r+k,dc*c+l,dt*t+m]
#The layouts of these variables are chosen to improve locality of reference.
#numpy seems to put the largest stride on axis 0 and decrease the stride from there. If we do convolution
#one filter at a time, one example at a time, then we want the largest strides to
#be over the examples. We want the smallest stride to be over the input channel because as we change
#the channel we re-visit the same location in the input.
#The smallest stride being over the input channel means that the weights need to be formatted with the input
#channel as the last index
#partial C / partial b_j = sum_i sum_k sum_r sum_c sum_t (partial C / partial H[i,r,c,t,k] ) * ( partial H[i,r,c,t,k] / partial b_j )
# = sum_i sum_k sum_r sum_c sum_t (partial C / partial H[i,r,c,t,k] ) * delta(k = j)
# = sum_i sum_r sum_c sum_t (partial C / partial H[i,r,c,t,j] )
#partial C / partial W[j,k,l,m,z] = sum_i sum_n sum_p sum_q sum_r (partial C /partial H[i,p,q,r,n] ) * (partial H[i,p,q,r,n] / partial W[j,k,l,m,z])
# = partial C / partial W[j,k,l,m,z] = sum_i sum_n sum_p sum_q sum_r (partial C /partial H[i,p,q,r,n] ) *
# (partial sum_s sum_u sum_v sum_a W[n,a, s,u,v] V[i, dr*p+s,dc*q+u,dt*r+v, a] ) / partial W[j,k,l,m,z])
# = partial C / partial W[j,k,l,m,z] = sum_i sum_p sum_q sum_r (partial C /partial H[i,p,q,r,j] ) *
# (partial sum_s sum_u sum_v sum_a W[j,a, s,u,v] V[i,dr*p+s,dc*q+u,dt*r+v,a] ) / partial W[j,k,l,m,z])
# = partial C / partial W[j,k,l,m,z] = sum_i sum_p sum_q sum_r (partial C /partial H[i,p,q,r,j] ) * V[i,dr*p+k,dc*q+l,dt*r+m,z]
#derivatives wrt V unimplemented for now. derivatives wrt dr, dc, dt are undefined since dr, dc, dt are natural numbers.
class Conv3D(theano.Op):
    """ 3D "convolution" of multiple filters on a minibatch (does not flip the kernel, moves kernel with a user specified stride) """

    def __eq__(self,other):
        # The op is stateless, so equality is by type only.
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return "Conv3D"

    def make_node(self, V, W, b, d):
        """
        :param V: Visible unit, input
        :param W: Weights, filter
        :param b: bias, shape == (W.shape[0],)
        :param d: strides when moving the filter over the input
        """
        V_ = T.as_tensor_variable(V)
        W_ = T.as_tensor_variable(W)
        b_ = T.as_tensor_variable(b)
        d_ = T.as_tensor_variable(d)
        # NOTE(review): infer_shape/perform put the output channel on the
        # *last* axis, (batch, row, col, time, channel), but this
        # broadcastable pattern carries W's leading broadcastable flag on
        # axis 1 -- looks inconsistent; confirm before relying on output
        # broadcasting.
        node = theano.Apply(self, inputs=[V_, W_,b_,d_], outputs = [ T.TensorType(V_.dtype, (V_.broadcastable[0],W_.broadcastable[0],False,False,False))() ] )
        return node

    def grad(self,inputs, output_gradients):
        """Symbolic gradients wrt (V, W, b); d is not differentiable."""
        V,W,b,d = inputs
        dCdH ,= output_gradients
        # TODO: make all of these ops support broadcasting of scalar b to
        # vector b and replace the zeros_like in all their grads.
        dCdV = ConvTransp3D.convTransp3D(W, T.zeros_like(V[0,0,0,0,:]), d, dCdH, V.shape[1:4] )
        WShape = W.shape
        dCdW = ConvGrad3D.convGrad3D(V,d,WShape,dCdH)
        # b is added once per output location, so its gradient sums over
        # every axis except the channel axis (the last one).
        dCdb = T.sum(dCdH, axis=(0,1,2,3))
        dCdd = None #not differentiable, since d is not continuous
        return [ dCdV, dCdW, dCdb, dCdd ]

    def perform(self, node, inputs, output_storage):
        # Python fallback; computeH (module level) is the reference
        # implementation.
        V, W, b, d = inputs
        print "Conv3D python code"
        output_storage[0][0] = computeH(V,W,b,d)

    def infer_shape(self, node, input_shapes):
        # Output layout: (batch, row, col, time, output channel).
        V,W,b,d = node.inputs
        V_shape, W_shape, b_shape, d_shape = input_shapes
        dr = d[0]
        dc = d[1]
        dt = d[2]
        batch_size = V_shape[0]
        output_channels = W_shape[0]
        vidHeight = V_shape[1]
        filterHeight = W_shape[1]
        vidWidth = V_shape[2]
        filterWidth = W_shape[2]
        vidDur = V_shape[3]
        filterDur = W_shape[3]
        # Number of valid filter placements along each spatial/temporal axis.
        output_height = T.floor( (vidHeight - filterHeight) / dr )+1
        output_width = T.floor( (vidWidth - filterWidth) / dc )+1
        output_dur = T.floor( (vidDur - filterDur) / dt ) +1
        return [(batch_size, output_height, output_width, output_dur, output_channels )]

    def c_support_code(self):
        # BLAS prototypes for the gemv call in the generated C code.
        return blas_header_text()

    def c_libraries(self):
        return ldflags()

    def c_compile_args(self):
        # Treat all warnings in the generated code as errors.
        flags = ldflags(libs=False, flags=True)
        flags.append('-Werror')
        return flags

    def c_lib_dirs(self):
        return ldflags(libs=False, libs_dir=True)

    def c_header_dirs(self):
        return ldflags(libs=False, include_dir=True)

    def c_code(self, node, nodename, (V,W,b,d), outputs, sub):
        """Generate C code computing H = Conv3D(V, W, b, d).

        (Python 2 tuple-unpacking signature kept as-is; required by the
        Theano C-code interface of this era.)
        """
        fail = sub['fail']
        H = outputs[0]
        # The generated code: argument validation, output allocation, then
        # either a BLAS gemv special case (many channels) or a plain loop.
        codeSource = """
///////////// < code generated by Conv3D >
//printf("\t\t\t\tConv3D c code\\n");
//Check dimensionality of inputs
if (%(W)s->nd != 5)
{
PyErr_Format(PyExc_ValueError, "Conv3D: W must be a 5 dimensional tensor");
%(fail)s
}
if (%(V)s->nd != 5)
{
PyErr_Format(PyExc_ValueError, "Conv3D: V must be a 5 dimensional tensor");
%(fail)s
}
if (%(b)s->nd != 1)
{
PyErr_Format(PyExc_ValueError,"Conv3D: b must be a vector.");
%(fail)s
}
if (%(d)s->nd != 1)
{
PyErr_Format(PyExc_ValueError,"Conv3D: d must be a vector.");
%(fail)s
}
if (%(d)s->dimensions[0] != 3)
{
PyErr_Format(PyExc_ValueError,"Conv3D: 3 stride length arguments expected (row, col, time) but %%li were given",%(d)s->dimensions[0]);
%(fail)s
}
//Read and check sizes of inputs
{ // exta scope so error handler jumps don't cause errors
const int batchSize = %(V)s->dimensions[0];
const int outputChannels = %(W)s->dimensions[0];
const int inputChannels = %(V)s->dimensions[4];
if (%(W)s->dimensions[4] != inputChannels)
{
PyErr_Format(PyExc_ValueError, "Conv3D: W operates on a %%li channel image but the image has %%i channels.",%(W)s->dimensions[4],inputChannels);
%(fail)s
}
if (%(b)s->dimensions[0] != outputChannels)
{
PyErr_Format(PyExc_ValueError, "Conv3D: b adds to a(n) %%li channel output image but the output has %%i channels",%(b)s->dimensions[0],outputChannels);
%(fail)s
}
{ //extra scope so error handler jumps don't cause errors
const int filterHeight = %(W)s->dimensions[1];
const int filterWidth = %(W)s->dimensions[2];
const int filterDur = %(W)s->dimensions[3];
const int vidHeight = %(V)s->dimensions[1];
const int vidWidth = %(V)s->dimensions[2];
const int vidDur = %(V)s->dimensions[3];\
if (vidHeight < filterHeight)
{
PyErr_Format(PyExc_ValueError, "W has a height of %%i but V is only %%i pixels tall",filterHeight,vidHeight);
%(fail)s
}
{ // extra scope so fail works
if (vidWidth < filterWidth)
{
PyErr_Format(PyExc_ValueError, "W has a width of %%i but V is only %%i pixels wide",filterWidth,vidWidth);
%(fail)s
}
{ // extra scope so fail works
if (vidDur < filterDur)
{
PyErr_Format(PyExc_ValueError, "W has a duration of %%i but V is only %%i pixels long",filterDur,vidDur);
%(fail)s
}
{ // extra scope so fail works
//Read and check stride arguments
const int dr = *(dtype_%(d)s*) PyArray_GETPTR1(%(d)s,0);
const int dc = *(dtype_%(d)s*) PyArray_GETPTR1(%(d)s,1);
const int dt = *(dtype_%(d)s*) PyArray_GETPTR1(%(d)s,2);
if (dr <= 0 || dc <= 0 || dt <= 0)
{
PyErr_Format(PyExc_ValueError,"Conv3D: Strides must all be positive but are %%i, %%i, %%i",dr,dc,dt);
%(fail)s
}
{ // extra scope so fail works
//Make correctly sized output
const long long outputHeight = int( (vidHeight - filterHeight) / dr )+1;
const long long outputWidth = int( (vidWidth - filterWidth) / dc )+1;
const long long outputDur = int( (vidDur - filterDur) / dt ) +1;
npy_intp dims[5];
dims[0] = batchSize;
dims[4] = outputChannels;
dims[1] = outputHeight;
dims[2] = outputWidth;
dims[3] = outputDur;
if(!(%(H)s) || %(H)s->dimensions[0]!=dims[0] ||
%(H)s->dimensions[1]!=dims[1] ||
%(H)s->dimensions[2]!=dims[2] ||
%(H)s->dimensions[3]!=dims[3] ||
%(H)s->dimensions[4]!=dims[4]){
Py_XDECREF(%(H)s);
%(H)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, %(V)s->descr->type_num);
if (!(%(H)s)) {
PyErr_Format(PyExc_MemoryError,"Conv3D: Could not allocate output.");
%(fail)s
}
}
{ // extra scope so fail works
#define ELEM_AT(x, i) * ( dtype_ ## x *) ( x->data + (i) )
const int ws0 = %(W)s->strides[0];
const int ws1 = %(W)s->strides[1];
const int ws2 = %(W)s->strides[2];
const int vs1 = %(V)s->strides[1];
const int ws4 = %(W)s->strides[4];
const int vs4 = %(V)s->strides[4];
const int ws3 = %(W)s->strides[3];
const int vs3 = %(V)s->strides[3];
const int vs2 = %(V)s->strides[2];
const int bs  = %(b)s->strides[0];
const int hs4 = %(H)s->strides[4];
// Compute H
//H[i,j,x,y,t] = b_j + sum_k sum_l sum_m sum_z W[j,z,k,l,m] V[i,z, dr*r+k,dc*c+l,dt*t+m]
//TODO: add special cases
// ex: filterDur == 1 && batchSize == 1 && dt = 1  (for SFA)
// ex: inputChannels == 1 """
        # If the data types are not mixed, we can insert special case
        # optimizations based on BLAS (a gemv per output pixel over the
        # channel axes).
        VV, WV, bv, dv = node.inputs
        HV = node.outputs[0]
        if VV.dtype == WV.dtype and HV.dtype == VV.dtype:
            if VV.dtype == 'float64':
                gemv = 'dgemv_'
            elif VV.dtype == 'float32':
                gemv = 'sgemv_'
            else:
                raise Exception('Unrecognized dtype for convolution '+V.value.dtype)

            codeSource += """
if (inputChannels > 20 && outputChannels > 20 && ws4 == sizeof(ELEM_AT(%(W)s,0)))
{
std::cout << "lots of channels special case code" << std::endl;
#define blas_type dtype_ ## %(V)s
const blas_type  constant_one = 1.0;
char N = 'T';
int ws0e = ws0 / sizeof(ELEM_AT(%(W)s,0));
int vs4e = vs4 / sizeof(ELEM_AT(%(V)s,4));
int hs4e = hs4 / sizeof(ELEM_AT(%(H)s,4));
//special case code for the "lots of channels" case
//uses a BLAS matrix vector multiply to compute the contribute for
//all channels of an input pixel to all channels of an output pixel
//simultaneously
long long Hpos = 0;
long long Vpos = 0;
for (int i = 0; i < batchSize; i++) {
long long Hposi = Hpos;
long long Vposi = Vpos;
for (int r = 0;  r < outputHeight; r++) {
long long Hposr = Hpos;
long long Vposr = Vpos;
for (int c = 0; c < outputWidth; c++) {
long long Hposc = Hpos;
long long Vposc = Vpos;
for (int t = 0; t < outputDur; t++) {
long long Hpost = Hpos;
long long Vpost = Vpos;
//of the loops so far, j should be the innermost, because
//each loop through j visits the same elements of V
//this implies that the last index of H should be the j index
//since V and H should have the same format, this means
//z should be the last index in v, and therefore the innermost
//of the next set of for loops
int Wpos = 0;
int bPos = 0;
long long Hposj = Hpos;
for (int j = 0; j < outputChannels; j++) {
// H[i,r,c,t,j] = b[j]
ELEM_AT(%(H)s,Hposj) = ELEM_AT(%(b)s,bPos);
Hposj += hs4;
bPos += bs;
}
dtype_%(H)s * writePos = & ELEM_AT(%(H)s,Hpos);
for (int k =0; k < filterHeight; k++) {
int Wposk = Wpos;
long long Vposk = Vpos;
for (int l = 0; l < filterWidth; l++) {
int Wposl = Wpos;
long long Vposl = Vpos;
for (int m = 0; m < filterDur; m++) {
//H[i,r,c,t,:] += N.dot(W[:,k,l,m,:],V[i,dr*r+k,dc*c+l,dt*t+m,:])
//note: changing the weights so that outputChannels and inputChannels were the last two rather than
//the first and last elements did not speed this up, even for extremely large input sizes
%(gemv)s(&N, & inputChannels, & outputChannels,
&constant_one, & ELEM_AT( %(W)s , Wpos),& ws0e,
& ELEM_AT(%(V)s, Vpos),& vs4e, &constant_one,
writePos,& hs4e);
Wpos += ws3;
Vpos += vs3;
} // close m
Wpos = Wposl + ws2;
Vpos = Vposl + vs2;
} //close l
Wpos = Wposk + %(W)s->strides[1];
Vpos = Vposk + %(V)s->strides[1];
} //close k
Hpos = Hpost + %(H)s->strides[3];
Vpos = Vpost +  vs3 * dt;
} //close t
Hpos = Hposc + %(H)s->strides[2];
Vpos = Vposc +  vs2 * dc;
} //close c
Hpos = Hposr + %(H)s->strides[1];
Vpos = Vposr + %(V)s->strides[1] * dr;
} //closes r
Hpos = Hposi + %(H)s->strides[0];
Vpos = Vposi + %(V)s->strides[0];
} //closes i
} //closes "lots of channels" special case code
else
"""

        # General (non-BLAS) fallback loop; stands alone when dtypes were
        # mixed, otherwise binds to the ``else`` emitted above.
        codeSource += """
{
//General case code
//std::cout << "general case code" << std::endl;
long long Hpos = 0;
long long Vpos = 0;
for (int i = 0; i < batchSize; i++) {
long long Hposi = Hpos;
long long Vposi = Vpos;
for (int r = 0;  r < outputHeight; r++) {
long long Hposr = Hpos;
long long Vposr = Vpos;
for (int c = 0; c < outputWidth; c++) {
long long Hposc = Hpos;
long long Vposc = Vpos;
for (int t = 0; t < outputDur; t++) {
long long Hpost = Hpos;
long long Vpost = Vpos;
//of the loops so far, j should be the innermost, because
//each loop through j visits the same elements of V
//this implies that the last index of H should be the j index
//since V and H should have the same format, this means
//z should be the last index in v, and therefore the innermost
//of the next set of for loops
int Wpos = 0;
int bPos = 0;
for (int j = 0; j < outputChannels; j++) {
long long Hposj = Hpos;
long long Vposj = Vpos;
int Wposj = Wpos;
// H[i,r,c,t,j] = b[j]
dtype_%(H)s & writePos = ELEM_AT(%(H)s,Hpos);
writePos = ELEM_AT(%(b)s,bPos);
for (int k =0; k < filterHeight; k++) {
int Wposk = Wpos;
long long Vposk = Vpos;
for (int l = 0; l < filterWidth; l++) {
int Wposl = Wpos;
long long Vposl = Vpos;
for (int m = 0; m < filterDur; m++) {
int Wposm = Wpos;
long long Vposm = Vpos;
for (int z = 0; z < inputChannels; z++) {
//H[i,r,c,t,j] += W[j,z,k,l,m] * V[i,dr*r+k, dc*c+l, dt*t+m,z]
writePos += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(V)s,Vpos);
Wpos += ws4;
Vpos += vs4;
} // close z
Wpos = Wposm + ws3;
Vpos = Vposm + vs3;
} // close m
Wpos = Wposl + ws2;
Vpos = Vposl + vs2;
} //close l
Wpos = Wposk + %(W)s->strides[1];
Vpos = Vposk + %(V)s->strides[1];
} //close k
bPos += bs;
Wpos = Wposj + ws0;
Hpos = Hposj +  hs4;
Vpos = Vposj;
//std::cout << "incremented Wpos by " << ws0 << std::endl;
//std::cout << "incremented Hpos by " << hs4 << std::endl;
} //close j
Hpos = Hpost + %(H)s->strides[3];
Vpos = Vpost +  vs3 * dt;
} //close t
Hpos = Hposc + %(H)s->strides[2];
Vpos = Vposc +  vs2 * dc;
} //close c
Hpos = Hposr + %(H)s->strides[1];
Vpos = Vposr + %(V)s->strides[1] * dr;
} //closes r
Hpos = Hposi + %(H)s->strides[0];
Vpos = Vposi + %(V)s->strides[0];
} //closes i
} //closes general case code
}}}}}}} //extra scope so error handler jumps don't cross declarations
///////////// < /code generated by Conv3D >
"""
        # Substitute %(V)s, %(fail)s, %(gemv)s, ... from the local namespace.
        return strutil.renderString(codeSource,locals())
# Module-level singleton instance of the op (Conv3D is stateless, so one
# shared instance suffices).  The previous ``global conv3D`` statement was a
# no-op at module scope and has been removed.
conv3D = Conv3D()
def computeH(V,W,b,d):
    """Pure-python/numpy reference implementation of the Conv3D op.

    H[i,x,y,t,j] = b[j] + sum_k sum_l sum_m sum_z
                   W[j,k,l,m,z] * V[i, dx*x+k, dy*y+l, dt*t+m, z]

    :param V: input videos, shape (batchSize, vidHeight, vidWidth, vidDur,
        inputChannels)
    :param W: filters, shape (outputChannels, filterHeight, filterWidth,
        filterDur, inputChannels)
    :param b: bias, shape (outputChannels,)
    :param d: strides (drow, dcol, dtime), all positive ints
    :return: H, shape (batchSize, outputHeight, outputWidth, outputDur,
        outputChannels), dtype of V.
    """
    assert len(W.shape) == 5
    assert len(V.shape) == 5
    assert len(b.shape) == 1, b.shape
    assert len(d) == 3

    batchSize = V.shape[0]
    outputChannels = W.shape[0]
    inputChannels = V.shape[4]
    if W.shape[4] != inputChannels:
        raise Exception("W.shape[4] = "+str(W.shape[4])+" but inputChannels = "+str(inputChannels))

    filterHeight = W.shape[1]
    filterWidth = W.shape[2]
    filterDur = W.shape[3]
    vidHeight = V.shape[1]
    vidWidth = V.shape[2]
    vidDur = V.shape[3]
    assert vidHeight >= filterHeight
    assert vidWidth >= filterWidth
    assert vidDur >= filterDur

    dx,dy,dt = d
    assert dx > 0
    assert dy > 0
    assert dt > 0

    # Number of valid filter placements along each axis.
    outputHeight = int( (vidHeight - filterHeight) / dx ) + 1
    outputWidth = int( (vidWidth - filterWidth) / dy ) + 1
    outputDur = int( (vidDur - filterDur) / dt ) + 1

    H = N.zeros( (batchSize, outputHeight,
                  outputWidth, outputDur, outputChannels), dtype=V.dtype )

    for i in range(batchSize):
        for j in range(outputChannels):
            for x in range(outputHeight):
                for y in range(outputWidth):
                    for t in range(outputDur):
                        # W[j] and the V patch both have shape
                        # (filterHeight, filterWidth, filterDur,
                        #  inputChannels); the elementwise product-sum
                        # replaces the former 4-deep python accumulation
                        # loop with identical math.
                        patch = V[i,
                                  dx*x:dx*x+filterHeight,
                                  dy*y:dy*y+filterWidth,
                                  dt*t:dt*t+filterDur, :]
                        H[i,x,y,t,j] = b[j] + N.sum(W[j] * patch)
    return H
from . import ConvGrad3D
from . import ConvTransp3D
if cuda_available:
from . import GpuConv3D
import theano
from theano.tensor import basic as T
from theano.sandbox.cuda import cuda_available, cuda_enabled
from util import strutil
import numpy as N
class ConvGrad3D(theano.Op):
    """ Gradient of Conv3D with respect to W """

    def __eq__(self,other):
        # Stateless op: equality is by type only.
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def make_node(self, V, d, WShape, dCdH):
        """
        :param V: the Conv3D input videos
        :param d: strides (dr, dc, dt)
        :param WShape: shape of the weight tensor whose gradient is computed
        :param dCdH: gradient of the cost wrt the Conv3D output
        """
        V_ = T.as_tensor_variable(V)
        d_ = T.as_tensor_variable(d)
        WShape_ = T.as_tensor_variable(WShape)
        dCdH_ = T.as_tensor_variable(dCdH)
        return theano.Apply(self, inputs=[V_, d_, WShape_, dCdH_], outputs = [ T.TensorType(V_.dtype, (False,False,False,False,False))() ] )

    def infer_shape(self, node, input_shapes):
        # Output has exactly the shape requested by the WShape input vector.
        V,d,W_shape, dCdH = node.inputs
        return [ ( W_shape[0], W_shape[1], W_shape[2], W_shape[3], W_shape[4] ) ]

    def grad(self,inputs, output_gradients):
        """Gradients wrt (V, d, WShape, dCdH); d and WShape are not
        differentiable."""
        C,d, WShape, B = inputs
        dLdA ,= output_gradients
        z = T.zeros_like(C[0,0,0,0,:])
        dLdC = convTransp3D( dLdA, z, d, B, C.shape[1:4])
        dLdd = None #not differentiable, since d is not continuous
        dLdWShape = None #not differentiable, since d is not continuous
        dLdB = conv3D( C, dLdA, T.zeros_like(B[0,0,0,0,:]), d)
        return [ dLdC, dLdd, dLdWShape, dLdB ]

    def perform(self, node, inputs, output_storage):
        # Python reference implementation of the W gradient.
        V, d, WShape, dCdH = inputs
        print "ConvGradW3D python code"
        #partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) * V[i,z,dr*p+k,dc*q+l,dt*r+m]
        batchSize = dCdH.shape[0]
        outputFilters = dCdH.shape[4]
        outputHeight = dCdH.shape[1]
        outputWidth = dCdH.shape[2]
        outputDur = dCdH.shape[3]
        assert V.shape[0] == batchSize
        inputFilters = V.shape[4]
        inputHeight = V.shape[1]
        inputWidth = V.shape[2]
        inputDur = V.shape[3]
        dr, dc, dt = d
        dCdW = N.zeros(WShape, dtype=V.dtype)
        # dCdW[j,k,l,m,z] accumulates over the batch and all output positions.
        for k in xrange(0,WShape[1]):
            for l in xrange(0,WShape[2]):
                for m in xrange(0,WShape[3]):
                    for i in xrange(0,batchSize):
                        for p in xrange(0,outputHeight):
                            for q in xrange(0,outputWidth):
                                for r in xrange(0,outputDur):
                                    for j in xrange(0,WShape[0]):
                                        for z in xrange(0,WShape[4]):
                                            dCdW[j,k,l,m,z] += dCdH[i,p,q,r,j] * V[i,dr*p+k,dc*q+l,dt*r+m,z]
        output_storage[0][0] = dCdW

    def c_compile_args(self):
        # Treat all warnings in the generated code as errors.
        flags = ['-Werror']
        return flags

    def c_code(self, node, nodename, (V,d,WShape,dCdH), outputs, sub):
        """Generate C code computing dCdW.

        (Python 2 tuple-unpacking signature kept as-is; required by the
        Theano C-code interface of this era.)
        """
        fail = sub['fail']
        dCdW = outputs[0]
        # Generated code: argument validation, output allocation, then a
        # straightforward 9-deep accumulation loop.
        codeSource = """
///////////// < code generated by ConvGradW3D >
//printf("\t\t\t\tConvGradW3D c code\\n");
//Check dimensionality of inputs
if (%(dCdH)s->nd != 5)
{
PyErr_Format(PyExc_ValueError, "ConvGrad3D: dCdH must be a 5 dimensional tensor");
%(fail)s
}
if (%(V)s->nd != 5)
{
PyErr_Format(PyExc_ValueError, "ConvGrad3D: V must be a 5 dimensional tensor");
%(fail)s
}
if (%(WShape)s->nd != 1)
{
PyErr_Format(PyExc_ValueError,"ConvGrad3D: WShape must be a vector.");
%(fail)s
}
if (%(d)s->nd != 1)
{
PyErr_Format(PyExc_ValueError,"ConvGrad3D: d must be a vector.");
%(fail)s
}
if (%(d)s->dimensions[0] != 3)
{
PyErr_Format(PyExc_ValueError,"ConvGrad3D: 3 stride length arguments expected (row, col, time) but %%li were given",%(d)s->dimensions[0]);
%(fail)s
}
{ //extra scope so that fail will not jump over declarations
//Read and check sizes of inputs
const int batchSize = %(V)s->dimensions[0];
if (%(WShape)s->dimensions[0] != 5)
{
PyErr_Format(PyExc_ValueError,"ConvGrad3D: WShape must specify a 5D shape");
%(fail)s
}
if (!PyArray_ISCONTIGUOUS(%(WShape)s))
{
PyErr_Format(PyExc_ValueError,"ConvGrad3D: WShape must be contiguous");
%(fail)s
}
{ //extra scope so that fail will not jump over declarations
dtype_%(WShape)s * WShape = (dtype_%(WShape)s *) %(WShape)s->data;
const int outputChannels =  WShape[0];
const int inputChannels = %(V)s->dimensions[4];
if (WShape[4] != inputChannels)
{
PyErr_Format(PyExc_ValueError, "ConvGrad3D: W operates on a %%i channel image but the image has %%i channels",(int) WShape[1],inputChannels);
%(fail)s
}
{ //extra scope so fail works
const int filterHeight = WShape[1];
const int filterWidth = WShape[2];
const int filterDur = WShape[3];
const int vidHeight = %(V)s->dimensions[1];
const int vidWidth = %(V)s->dimensions[2];
const int vidDur = %(V)s->dimensions[3];
if (vidHeight < filterHeight)
{
PyErr_Format(PyExc_ValueError, "ConvGrad3D: W has a height of %%i but V is only %%i pixels tall", filterHeight, vidHeight);
%(fail)s
}
if (vidWidth < filterWidth)
{
PyErr_Format(PyExc_ValueError,"ConvGrad3D: W has a width of %%i but V is only %%i pixels tall",filterWidth,vidWidth);
%(fail)s
}
if (vidDur < filterDur)
{
PyErr_Format(PyExc_ValueError,"ConvGrad3D: W has a duration of %%i but V is only %%i pixels long",filterDur,vidDur);
%(fail)s
}
{ // extra scope so fail works
//Read and check stride arguments
const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
if (dr <= 0 || dc <= 0 || dt <= 0)
{
PyErr_Format(PyExc_ValueError,"ConvGrad3D: Strides should all be positive but they are %%i, %%i, %%i",dr,dc,dt);
%(fail)s
}
{ // extra scope so fail works
//Compute correct sized of output
const int outputHeight = int( (vidHeight - filterHeight) / dr )+1;
const int outputWidth = int( (vidWidth - filterWidth) / dc )+1;
const int outputDur = int( (vidDur - filterDur) / dt ) +1;
if (%(dCdH)s->dimensions[0] != batchSize ||
%(dCdH)s->dimensions[4] != outputChannels ||
%(dCdH)s->dimensions[1] != outputHeight ||
%(dCdH)s->dimensions[2] != outputWidth ||
%(dCdH)s->dimensions[3] != outputDur)
{
PyErr_Format(PyExc_ValueError, "dCdH is the wrong size, expected (%%i,%%i,%%i,%%i,%%i), got (%%li,%%li,%%li,%%li,%%li)", batchSize, outputHeight, outputWidth, outputDur, outputChannels, %(dCdH)s->dimensions[0], %(dCdH)s->dimensions[1], %(dCdH)s->dimensions[2] ,%(dCdH)s->dimensions[3], %(dCdH)s->dimensions[4] );
%(fail)s
}
{ // extra scope for fail
npy_intp dims[5];
dims[0] = outputChannels;
dims[4] = inputChannels;
dims[1] = filterHeight;
dims[2] = filterWidth;
dims[3] = filterDur;
if(!(%(dCdW)s)  || %(dCdW)s->dimensions[0]!=dims[0] ||
%(dCdW)s->dimensions[1]!=dims[1] ||
%(dCdW)s->dimensions[2]!=dims[2] ||
%(dCdW)s->dimensions[3]!=dims[3] ||
%(dCdW)s->dimensions[4]!=dims[4] ){
Py_XDECREF(%(dCdW)s);
%(dCdW)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, %(V)s->descr->type_num);
if (!(%(dCdW)s)) {
PyErr_Format(PyExc_MemoryError,"ConvGrad3D: Could not allocate dCdW");
%(fail)s
}
}
{ //extra scope so fail works
#define ELEM5(x, i,j,k,l,m) * ( dtype_ ## x *) ( x->data + (i)*x->strides[0]+(j)*x->strides[1]+(k)*x->strides[2]+(l)*x->strides[3]+(m)*x->strides[4] )
#define ELEM_AT(x, i) * ( dtype_ ## x *) ( x->data + (i) )
const int dhs3 = %(dCdH)s->strides[3];
const int dtvs3 = dt * %(V)s->strides[3];
// Compute dCdW
//TODO-- see if this can be made faster by using ELEM_AT instead of ELEM5
// dCdW[j,k,l,m,z] = sum_i sum_p sum_q sum_r dCdH[i,p,q,r,j]  *  V[i,dr*p+k,dc*q+l,dt*r+m,z]
for (int j = 0; j < outputChannels; j++) {
for (int z = 0; z < inputChannels; z++) {
for (int k = 0; k < filterHeight; k++) {
for (int l = 0; l < filterWidth; l++) {
for (int m = 0; m < filterDur; m++) {
//printf("writePos %%i %%i %%i %%i %%i \\n",j,k,l,m,z);
dtype_%(dCdW)s & writePos =  ELEM5(%(dCdW)s, j,k,l,m,z);
writePos = 0;
for (int i = 0; i < batchSize; i++) {
for (int p = 0; p < outputHeight; p++) {
for (int q = 0; q < outputWidth; q++) {
int Hpos = i * %(dCdH)s->strides[0] + j * %(dCdH)s->strides[4] + p * %(dCdH)s->strides[1] + q * %(dCdH)s->strides[2] ;
int Vpos = i * %(V)s->strides[0] + z * %(V)s->strides[4] +  (dr * p+k) * %(V)s->strides[1] +  (dc*q+l) * %(V)s->strides[2] + m * %(V)s->strides[3];
for (int r = 0; r < outputDur; r++) {
writePos += ELEM5(%(dCdH)s,i,p,q,r,j) * ELEM5(%(V)s,i,dr*p+k,dc*q+l,dt*r+m,z);
//writePos += ELEM_AT(%(dCdH)s,Hpos) * ELEM_AT(%(V)s,Vpos);
Hpos += dhs3;
Vpos += dtvs3;
}
}
}
}
}
}
}
}
}
}}}}}}} // extra scope for fail
///////////// < /code generated by ConvGradW3D >
"""
        # Substitute %(V)s, %(fail)s, ... from the local namespace.
        return strutil.renderString(codeSource,locals())
# Module-level singleton used by Conv3D.grad and ConvTransp3D.grad.
convGrad3D = ConvGrad3D()
from Conv3D import conv3D
from ConvTransp3D import convTransp3D
if cuda_available:
from theano.sandbox.cuda import CudaNdarrayType, float32_shared_constructor
from . import GpuConvGrad3D
import numpy as N
from theano.tensor import basic as T
from util import strutil
import theano
from theano.sandbox.cuda import cuda_available, cuda_enabled
class ConvTransp3D(theano.Op):
""" "Transpose" of Conv3D (Conv3D implements multiplication by an implicitly defined matrix W. This implements multiplication by its transpose) """
def __eq__(self,other):
    """Stateless op: two instances are equal iff they have the same type."""
    return type(other) == type(self)
def __hash__(self):
    """Hash consistently with __eq__: all instances of this op hash alike."""
    return hash(self.__class__)
def make_node(self, W, b, d, H, RShape = None):
    """
    :param W: Weights, filter
    :param b: bias, shape == (W.shape[0],)
    :param d: strides when moving the filter over the input
    :param H: The output of Conv3D
    :param RShape: optional (height, width, duration) for the reconstruction;
        when omitted a (-1, -1, -1) sentinel means "use the minimal size".
    """
    op_inputs = [T.as_tensor_variable(x) for x in (W, b, d, H)]
    if RShape:
        op_inputs.append(T.as_tensor_variable(RShape))
    else:
        # Sentinel understood by perform/c_code: infer the minimal video size.
        op_inputs.append(T.as_tensor_variable([-1,-1,-1]))
    out_type = T.TensorType(op_inputs[3].dtype, (False,False,False,False,False))
    return theano.Apply(self, inputs=op_inputs, outputs=[out_type()])
def c_compile_args(self):
    """Compile the generated C code with all warnings treated as errors."""
    return ['-Werror']
def infer_shape(self, node, input_shapes):
    """Output layout: (batch, input channels, height, width, duration).

    NOTE(review): the spatial sizes are read straight out of the RShape
    input variable; when RShape is the (-1,-1,-1) "minimal size" sentinel
    the inferred shape entries will be -1, which is wrong -- confirm callers
    always pass an explicit RShape when shape inference matters.
    """
    W,b,d,H,RShape = node.inputs
    W_shape, b_shape, d_shape, H_shape, RShape_shape = input_shapes
    return [(H_shape[0], W_shape[1], RShape[0], RShape[1], RShape[2])]
def grad(self,inputs, output_gradients):
    """Gradients wrt (W, b, d, H, RShape); d and RShape are not
    differentiable.

    Bug fix: the last returned element used to be the *input* variable
    RShape itself rather than its gradient; the (correctly computed but
    previously unused) dCdRShape is returned instead.
    """
    W,b,d,H, RShape = inputs
    dCdR ,= output_gradients
    dCdH = conv3D( dCdR, W, T.zeros_like(H[0,0,0,0,:]), d)
    WShape = W.shape
    dCdW = convGrad3D(dCdR,d,WShape,H)
    # b is added at every reconstructed location, so sum over all axes
    # except the channel axis (the last one).
    dCdb = T.sum(dCdR,axis=(0,1,2,3))
    dCdd = None #not differentiable, since d is not continuous
    dCdRShape = None #not differentiable, since RShape is not continuous
    return [ dCdW, dCdb, dCdd, dCdH, dCdRShape ]
def perform(self, node, inputs, output_storage):
W, b, d, H, RShape = inputs
print "\t\t\t\tConvTransp3D python code"
output_storage[0][0] = computeR(W,b,d,H,RShape)
def c_code(self, node, nodename, (W, b, d, H, RShape), outputs, sub):
fail = sub['fail']
R = outputs[0]
codeSource = """
///////////// < code generated by ConvTransp3D >
//printf("\t\t\t\tConvTransp3D c code\\n");
//Check dimensionality of inputs
if (%(H)s->nd != 5)
{
PyErr_Format(PyExc_ValueError, "H must be a 5-D tensor but it is %%i-D",%(H)s->nd);
%(fail)s
}
if (%(W)s->nd != 5)
{
PyErr_Format(PyExc_ValueError, "ConvTransp3D: W must be a 5-D tensor");
%(fail)s
}
if (%(b)s->nd != 1)
{
PyErr_Format(PyExc_ValueError, "ConvTransp3D: b must be a vector");
%(fail)s
}
if (%(d)s->nd != 1)
{
PyErr_Format(PyExc_ValueError, "ConvTransp3D: d must be a vector");
%(fail)s
}
//Read and check stride arguments
if (%(d)s->dimensions[0] != 3)
{
PyErr_Format(PyExc_ValueError, "ConvTransp3D: 3 stride length arguments expected (for row, col, and time) but %%li were given", %(d)s->dimensions[0] );
%(fail)s
}
{ // for fail 1
int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
if (dr <= 0 || dc <= 0 || dt <= 0)
{
PyErr_Format(PyExc_ValueError, "ConvTransp3D: Strides must all be positive but are %%i, %%i, %%i",dr,dc,dt);
%(fail)s
}
//Read and check sizes of inputs
{ // for fail 2
const int batchSize = %(H)s->dimensions[0];
const int outputChannels = %(W)s->dimensions[0];
if (%(H)s->dimensions[4] != outputChannels)
{
PyErr_Format(PyExc_ValueError, "W produces a %%i channel image but the image has %%li channels. W.shape: (%%li, %%li, %%li,%%li, %%li) H.shape: (%%li, %%li, %%li, %%li, %%li)",outputChannels,%(H)s->dimensions[4], %(W)s->dimensions[0], %(W)s->dimensions[1], %(W)s->dimensions[2], %(W)s->dimensions[3], %(W)s->dimensions[4], %(H)s->dimensions[0], %(H)s->dimensions[1], %(H)s->dimensions[2], %(H)s->dimensions[3], %(H)s->dimensions[4]);
%(fail)s
}
{ // for fail 3
const int inputChannels = %(W)s->dimensions[4];
if (%(b)s->dimensions[0] != inputChannels)
{
PyErr_Format(PyExc_ValueError, "ConvTransp3D: b operates on a %%li channel image but the image has %%i channels", %(b)s->dimensions[0], inputChannels );
%(fail)s
}
{ // for fail 4
const int filterHeight = %(W)s->dimensions[1];
const int filterWidth = %(W)s->dimensions[2];
const int filterDur = %(W)s->dimensions[3];
const int outputHeight = %(H)s->dimensions[1];
const int outputWidth = %(H)s->dimensions[2];
const int outputDur = %(H)s->dimensions[3];
int videoHeight = (outputHeight-1) * dr + filterHeight;
int videoWidth = (outputWidth-1) * dc + filterWidth;
int videoDur = (outputDur-1) * dt + filterDur;
if (%(RShape)s)
{
if (%(RShape)s->nd != 1)
{
PyErr_Format(PyExc_ValueError, "ConvTransp3D: RShape must be a vector");
%(fail)s
}
if (%(RShape)s->dimensions[0] != 3)
{
PyErr_Format(PyExc_ValueError, "RShape must specify a 3D shape ( [height,width,duration] )");
%(fail)s
}
dtype_%(RShape)s RShape0 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,0);
dtype_%(RShape)s RShape1 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,1);
dtype_%(RShape)s RShape2 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,2);
if (RShape0 != -1)
{
if (RShape0 < videoHeight || RShape1 < videoWidth || RShape2 < videoDur)
{
PyErr_Format(PyExc_ValueError, "Reconstruction must have physical shape of at least [%%i,%%i,%%i] but RShape argument requests that it be [%%i,%%i,%%i]\\n",videoHeight,videoWidth,videoDur,(int) RShape0,(int) RShape1,(int) RShape2);
%(fail)s
}
videoHeight = RShape0;
videoWidth = RShape1;
videoDur = RShape2;
}
} //closes if RShape
{ // for fail 5
//Allocate the reconstruction
npy_intp dims[5];
dims[0] = batchSize;
dims[4] = inputChannels;
dims[1] = videoHeight;
dims[2] = videoWidth;
dims[3] = videoDur;
if(!(%(R)s) || %(R)s->dimensions[0]!=dims[0] ||
%(R)s->dimensions[1]!=dims[1] ||
%(R)s->dimensions[2]!=dims[2] ||
%(R)s->dimensions[3]!=dims[3] ||
%(R)s->dimensions[4]!=dims[4])
{
Py_XDECREF(%(R)s);
%(R)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, %(H)s->descr->type_num);
if (!(%(R)s)) {
PyErr_Format(PyExc_MemoryError, "ConvTransp3D: could not allocate R");
%(fail)s
}
}
for (int i = 0; i < 3; i++)
if (%(R)s->strides[i] < %(R)s->strides[4])
{
PyErr_Format(PyExc_ValueError, "ConvTransp3D: R must have the smallest stride in its last index, but it doesn't (if this is a problem, the only part of ConvTransp3D that depends on this conditions is the memset, so this is probably easy to fix)");
%(fail)s
}
{ // for fail 6
memset(%(R)s->data, 0, (batchSize-1) * %(R)s->strides[0]+ inputChannels * %(R)s->strides[4] +
(videoHeight-1) * %(R)s->strides[1] +
(videoWidth-1) * %(R)s->strides[2] +
(videoDur-1) * %(R)s->strides[3]);
#define ELEM5(x, i,j,k,l,m) * ( dtype_ ## x *) ( x->data + (i)*x->strides[0]+(j)*x->strides[1]+(k)*x->strides[2]+(l)*x->strides[3]+(m)*x->strides[4] )
#define ELEM_AT(x, i) * ( dtype_ ## x *) ( x->data + (i) )
dtype_%(b)s * b = (dtype_%(b)s *) %(b)s->data;
int rs4 = %(R)s->strides[4];
int ws0 = %(W)s->strides[0];
int ws4 = %(W)s->strides[4];
int hs4 = %(H)s->strides[4];
// Compute R
// R[i,r,c,t,j] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, rk, ck, tk,j] * H[i,rc,cc,tc,k]
for (int i = 0; i < batchSize; i++) {
for (int r = 0; r < videoHeight; r++) {
const int frc = std::max(0.0, ceil(float(r-filterHeight+1)/float(dr)));
for (int c = 0; c < videoWidth; c++) {
const int fcc = std::max(0.0, ceil(float(c-filterWidth +1)/float(dc)));
for (int t = 0; t < videoDur; t++) {
const int ftc = std::max(0.0, ceil(float(t-filterDur +1) /float(dt)));
long long Rpost = i * %(R)s->strides[0] + r * %(R)s->strides[1] + c * %(R)s->strides[2] + t * %(R)s->strides[3];
long long Rpos = Rpost;
for (int j = 0; j < inputChannels; j++)
{
//ELEM5(%(R)s, i,r,c,t,j) = b[j];
ELEM_AT(%(R)s,Rpos) = b[j];
Rpos += rs4;
}
for (int rc = frc; rc < outputHeight; rc++) {
const int rk = r - rc * dr;
if (rk < 0) break;
for (int cc = fcc; cc < outputWidth; cc++) {
const int ck = c - cc * dc;
if (ck < 0) break;
for (int tc = ftc; tc < outputDur; tc++)
{
const int tk = t - tc * dt;
if (tk < 0) break;
int Wpos = rk * %(W)s->strides[1] + ck * %(W)s->strides[2] + tk * %(W)s->strides[3];
int Hpostc = i * %(H)s->strides[0] + rc * %(H)s->strides[1] + cc * %(H)s->strides[2] + tc * %(H)s->strides[3];
Rpos = Rpost;
for (int j = 0; j < inputChannels; j++)
{
int Wposj = Wpos;
dtype_%(R)s & writePos = ELEM_AT(%(R)s,Rpos);
int Hpos = Hpostc;
for (int k = 0; k < outputChannels; k++) {
//TODO-- it's probably bad in terms of cache that our inner loop is over the largest stride of W.... maybe OK since it's the smallest stride of H
//writePos += ELEM5(%(W)s,k,rk,ck,tk,j) * ELEM5(%(H)s,i,rc,cc,tc,k);
//writePos += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(H)s,Hpos);
writePos += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(H)s,Hpos);
Wpos += ws0;
Hpos += hs4;
} //close the k loop
Rpos += rs4;
Wpos = Wposj + ws4;
} //close the j loop
} // close the tc loop
} //cc
} //rc
} //t
} //c
} //r
} //i
} //for fail 6
} //for fail 5
} //for fail 4
} //for fail 3
} //for fail 2
} // for fail 1
///////////// < /code generated by ConvTransp3D >
"""
return strutil.renderString(codeSource,locals())
# Module-level singleton instance of the ConvTransp3D op, for convenient reuse.
# NOTE(review): the ConvTransp3D class is presumably defined earlier in this
# file (above this chunk) -- confirm against the full source.
convTransp3D = ConvTransp3D()
# If the input size wasn't a multiple of D we may need to cause some automatic
# padding to get the right size of reconstruction (see the Rshape parameter).
def computeR(W, b, d, H, Rshape=None):
    """Pure-numpy reference implementation of the transpose 3D convolution.

    Reconstructs the "video" R from the forward-convolution output H:

        R[i,r,c,t,j] = b[j]
                       + sum over all placements (rc,cc,tc) and filter
                         offsets (rk,ck,tk) with
                         (rc*dr+rk, cc*dc+ck, tc*dt+tk) == (r,c,t)
                         of  dot(W[:, rk, ck, tk, j], H[i, rc, cc, tc, :])

    Parameters
    ----------
    W : 5d array, shape (outputChannels, filterHeight, filterWidth, filterDur, inputChannels)
        Filter bank of the forward convolution.
    b : 1d array, length inputChannels
        Bias added to every reconstructed voxel.
    d : length-3 sequence (dr, dc, dt) of positive ints
        Strides of the forward convolution (rows, columns, time).
    H : 5d array, shape (batchSize, outputHeight, outputWidth, outputDur, outputChannels)
        Forward-convolution output ("hidden" units).
    Rshape : optional length-3 sequence (height, width, duration)
        Requested physical shape of the reconstruction; it may only pad
        (enlarge) the shape implied by H, W and d, never crop it.
        A first entry of -1 means "use the implied shape".

    Returns
    -------
    R : 5d array, shape (batchSize, videoHeight, videoWidth, videoDur, inputChannels)
    """
    assert len(W.shape) == 5
    assert len(H.shape) == 5
    assert len(b.shape) == 1
    assert len(d) == 3

    outputChannels, filterHeight, filterWidth, filterDur, inputChannels = W.shape
    batchSize, outputHeight, outputWidth, outputDur, outputChannelsAgain = H.shape
    assert outputChannelsAgain == outputChannels
    assert b.shape[0] == inputChannels

    dr, dc, dt = d
    assert dr > 0
    assert dc > 0
    assert dt > 0

    # Physical reconstruction size implied by the output size, strides and
    # filter size.
    videoHeight = (outputHeight - 1) * dr + filterHeight
    videoWidth = (outputWidth - 1) * dc + filterWidth
    videoDur = (outputDur - 1) * dt + filterDur

    if Rshape is not None and Rshape[0] != -1:
        # An explicit shape may only pad the implied one.
        assert Rshape[0] >= videoHeight, (Rshape[0], videoHeight)
        assert Rshape[1] >= videoWidth, (Rshape[1], videoWidth)
        assert Rshape[2] >= videoDur, (Rshape[2], videoDur)
        videoHeight, videoWidth, videoDur = Rshape

    R = N.zeros((batchSize, videoHeight, videoWidth, videoDur, inputChannels),
                dtype=H.dtype)

    for i in range(batchSize):
        for j in range(inputChannels):
            for r in range(videoHeight):
                # First row placement whose filter footprint can reach row r
                # (hoisted: depends only on r, not on c or t).
                frc = max(0, int(N.ceil(float(r - filterHeight + 1) / float(dr))))
                for c in range(videoWidth):
                    fcc = max(0, int(N.ceil(float(c - filterWidth + 1) / float(dc))))
                    for t in range(videoDur):
                        ftc = max(0, int(N.ceil(float(t - filterDur + 1) / float(dt))))
                        R[i, r, c, t, j] = b[j]
                        rc = frc
                        while rc < outputHeight:
                            rk = r - rc * dr
                            if rk < 0:
                                # Placement has moved past row r.
                                break
                            cc = fcc
                            while cc < outputWidth:
                                ck = c - cc * dc
                                if ck < 0:
                                    break
                                tc = ftc
                                while tc < outputDur:
                                    tk = t - tc * dt
                                    if tk < 0:
                                        break
                                    R[i, r, c, t, j] += N.dot(W[:, rk, ck, tk, j],
                                                              H[i, rc, cc, tc, :])
                                    tc += 1
                                cc += 1
                            rc += 1
    return R
from Conv3D import conv3D
from ConvGrad3D import convGrad3D
if cuda_available:
import GpuConvTransp3D
from nnet import *
from Conv3D import *
from ConvGrad3D import *
from ConvTransp3D import *
from sigm import softplus, sigmoid, sigmoid_inplace, scalar_sigmoid
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论