Big whitespace cleanup.

2681accc · David Warde-Farley · b7a6e812 · 2681accc · 2681accc · 2681accc
--- a/theano/sandbox/cuda/GpuConv3D.py
+++ b/theano/sandbox/cuda/GpuConv3D.py
@@ -11,13 +11,13 @@ if cuda_available:

 class GpuConv3D(theano.Op):
    """ GPU implementation of Conv3D """
-    
+
    def __eq__(self, other):
        return type(self) == type(other)
-    
+
    def __hash__(self):
        return hash(type(self))
-    
+
    def __str__(self):
        return '%s' % (self.__class__.__name__)

@@ -33,16 +33,16 @@ class GpuConv3D(theano.Op):
        b_ = as_cuda_ndarray_variable(b)
        d_ = T.as_tensor_variable(d)

-        return theano.Apply(self, inputs=[V_, W_, b_, d_], 
+        return theano.Apply(self, inputs=[V_, W_, b_, d_],
                            outputs = [ CudaNdarrayType(dtype=V_.dtype, broadcastable=(V_.broadcastable[0],W_.broadcastable[0],False,False,False))() ] )

    def c_code_cache_version(self):
        return ()
    def c_code(self, node, nodename, (V,W,b,d), outputs, sub):
        fail = sub['fail']
-        
+
        H = outputs[0]
-        
+
        codeSource =  """
 			///////////// < code generated by GpuConv3D >

@@ -51,31 +51,31 @@ class GpuConv3D(theano.Op):
 			//Check dimensionality of inputs
 			if (%(W)s->nd != 5)
 			{
-                PyErr_Format(PyExc_ValueError, "GpuConv3D: W must be a 5 dimensional CudaNdarray"); 
+                PyErr_Format(PyExc_ValueError, "GpuConv3D: W must be a 5 dimensional CudaNdarray");
                            %(fail)s
 			}

 			if (%(V)s->nd != 5)
 			{
-                PyErr_Format(PyExc_ValueError, "GpuConv3D: V must be a 5 dimensional CudaNdarray"); 
+                PyErr_Format(PyExc_ValueError, "GpuConv3D: V must be a 5 dimensional CudaNdarray");
                            %(fail)s
 			}

 			if (%(b)s->nd != 1)
 			{
-                PyErr_Format(PyExc_ValueError, "GpuConv3D: b must be a vector CudaNdarray"); 
+                PyErr_Format(PyExc_ValueError, "GpuConv3D: b must be a vector CudaNdarray");
                            %(fail)s
 			}

 			if (%(d)s->nd != 1)
 			{
-PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray"); 
+PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray");
                            %(fail)s

 			}
 			if (%(d)s->dimensions[0] != 3)
 			{
-                PyErr_Format(PyExc_ValueError, "GpuConv3D: 3 stride length arguments expected (row, col, time) but %%li were given", %(d)s->dimensions[0]); 
+                PyErr_Format(PyExc_ValueError, "GpuConv3D: 3 stride length arguments expected (row, col, time) but %%li were given", %(d)s->dimensions[0]);
                            %(fail)s

 			}
@@ -87,7 +87,7 @@ PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray");
 			const int inputChannels = CudaNdarray_HOST_DIMS(%(V)s)[4];
 			if (CudaNdarray_HOST_DIMS(%(W)s)[4] != inputChannels)
 			{
-                            PyErr_Format(PyExc_ValueError, "Conv3D: W operates on a %%i channel image but the image has %%i channels",CudaNdarray_HOST_DIMS(%(W)s)[4],inputChannels); 
+                            PyErr_Format(PyExc_ValueError, "Conv3D: W operates on a %%i channel image but the image has %%i channels",CudaNdarray_HOST_DIMS(%(W)s)[4],inputChannels);
                            %(fail)s
 			}
 {  //extra scope so error handler jumps don't cause errors
@@ -115,14 +115,14 @@ PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray");
                %(fail)s
            }
 { // extra scope so fail works
-			
+
 			//Read and check stride arguments
                        const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
                        const int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
                        const int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
 			if (dr <= 0 || dc <= 0 || dt <= 0)
 			{
-                PyErr_Format(PyExc_ValueError, "GpuConv3D: Strides must all be positive but are %%i, %%i, %%i", dr, dc, dt); 
+                PyErr_Format(PyExc_ValueError, "GpuConv3D: Strides must all be positive but are %%i, %%i, %%i", dr, dc, dt);
                %(fail)s
 			}
 { // extra scope so fail works
@@ -139,16 +139,16 @@ PyErr_Format(PyExc_ValueError, "GpuConv3D: d must be a vector CudaNdarray");
 			dims[2] = outputWidth;
 			dims[3] = outputDur;

-			if(!(%(H)s) || CudaNdarray_HOST_DIMS(%(H)s)[0]!=dims[0] || 
-			CudaNdarray_HOST_DIMS(%(H)s)[1]!=dims[1] || 
-			CudaNdarray_HOST_DIMS(%(H)s)[2]!=dims[2] || 
-			CudaNdarray_HOST_DIMS(%(H)s)[3]!=dims[3] || 
+			if(!(%(H)s) || CudaNdarray_HOST_DIMS(%(H)s)[0]!=dims[0] ||
+			CudaNdarray_HOST_DIMS(%(H)s)[1]!=dims[1] ||
+			CudaNdarray_HOST_DIMS(%(H)s)[2]!=dims[2] ||
+			CudaNdarray_HOST_DIMS(%(H)s)[3]!=dims[3] ||
 			CudaNdarray_HOST_DIMS(%(H)s)[4]!=dims[4]){
 				Py_XDECREF(%(H)s);
 				%(H)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
 				if (!(%(H)s)) {

-                    PyErr_Format(PyExc_MemoryError, "GpuConv3D: could not allocate output"); 
+                    PyErr_Format(PyExc_MemoryError, "GpuConv3D: could not allocate output");
                            %(fail)s
 				}
 			}
@@ -180,7 +180,7 @@ if(out_contiguous && !b_strided && (version==0||version==-1) && outputDur<=512 &
    //conv_rows_stack
    dim3 grid(outputHeight*outputWidth,batchSize*outputChannels);
    dim3 threads(outputDur);
-    
+
    int shared_size=0;
        conv_rows_stack<<<grid, threads, shared_size>>>(
        CudaNdarray_DEV_DATA(%(V)s), CudaNdarray_DEV_DATA(%(W)s), CudaNdarray_DEV_DATA(%(b)s), CudaNdarray_DEV_DATA(%(H)s),
@@ -193,7 +193,7 @@ if(out_contiguous && !b_strided && (version==0||version==-1) && outputDur<=512 &

        CNDA_THREAD_SYNC;
        cudaError_t sts = cudaGetLastError();
-        if (cudaSuccess == sts) 
+        if (cudaSuccess == sts)
        {
            work_complete = true;
 	    if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
@@ -206,7 +206,7 @@ if(out_contiguous && !b_strided && (version==0||version==-1) && outputDur<=512 &
            PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConv3D! (%%s)",
                    cudaGetErrorString(sts));
            %(fail)s
-        }      
+        }


 }
@@ -216,10 +216,10 @@ if(!work_complete){
            %(fail)s
 }

-}}}}}}} //extra scope so error handler jumps don't cross declarations    
+}}}}}}} //extra scope so error handler jumps don't cross declarations
 			///////////// < /code generated by GpuConv3D >
        """
-        return strutil.renderString(codeSource,locals()) 
+        return strutil.renderString(codeSource,locals())

    def c_support_code_apply(self, node, nodename):
        # This code is not sensitive to the ignore_border flag.
@@ -260,7 +260,7 @@ conv_rows_stack( float* img, float* kern, float* bias, float* out,
 	for (int k =0; k < kern_height; k++) {
 	  for (int l = 0; l < kern_wid; l++) {
 	    for (int m = 0; m < kern_dur; m++) {
-	      sum += img[img_stride_ochannel*z+img_stride_row*k+img_stride_col*l+img_stride_frame*m] * 
+	      sum += img[img_stride_ochannel*z+img_stride_row*k+img_stride_col*l+img_stride_frame*m] *
 		         kern[kern_stride_stack*z+kern_stride_row*k+kern_stride_col*l+kern_stride_frame*m];
            }
          }
@@ -278,7 +278,7 @@ conv_rows_stack( float* img, float* kern, float* bias, float* out,

            """

-        return codeSource#renderString(codeSource,locals()) 
+        return codeSource#renderString(codeSource,locals())

 gpu_convd = GpuConv3D()


--- a/theano/sandbox/cuda/GpuConvGrad3D.py
+++ b/theano/sandbox/cuda/GpuConvGrad3D.py
@@ -22,14 +22,14 @@ class GpuConvGrad3D(theano.Op):
        WShape_ = T.as_tensor_variable(WShape)
        dCdH_ = as_cuda_ndarray_variable(dCdH)

-        return theano.Apply(self, inputs=[V_, d_, WShape_, dCdH_], 
+        return theano.Apply(self, inputs=[V_, d_, WShape_, dCdH_],
                            outputs = [ CudaNdarrayType(dtype=V_.dtype, broadcastable=(False,)*5)()])

    def perform_(self, node, inputs, output_storage):
        V, d, WShape, dCdH = inputs
        print "GpuConvGrad3D python code (warning not updated to new format)"

-        #partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) *  V[i,z,dr*p+k,dc*q+l,dt*r+m] 
+        #partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) *  V[i,z,dr*p+k,dc*q+l,dt*r+m]

        batchSize = dCdH.shape[0]
        outputFilters = dCdH.shape[1]
@@ -66,7 +66,7 @@ class GpuConvGrad3D(theano.Op):

        dCdW = outputs[0]

-        codeSource =  """          
+        codeSource =  """
            ///////////// < code generated by GpuConvGrad3D >

            //printf("\t\t\t\tGpuConvGrad3DW c code\\n");
@@ -123,9 +123,9 @@ class GpuConvGrad3D(theano.Op):
            const int inputChannels = CudaNdarray_HOST_DIMS(%(V)s)[4];
            if (WShape[4] != inputChannels)
            {
-                PyErr_Format(PyExc_ValueError, "ConvGrad3D: W operates on a %%d channel image but the image has %%d channels",WShape[4],inputChannels); 
+                PyErr_Format(PyExc_ValueError, "ConvGrad3D: W operates on a %%d channel image but the image has %%d channels",WShape[4],inputChannels);
                %(fail)s
-              
+
            }
 { //extra scope so fail works
            const int filterHeight = WShape[1];
@@ -149,7 +149,7 @@ class GpuConvGrad3D(theano.Op):
                PyErr_Format(PyExc_ValueError, "GpuConvGrad3D: W has a duration of %%i but V is only %%i pixels long", filterWidth, vidWidth);
                %(fail)s
            }
-            
+
 { // extra scope so fail works
            //Read and check stride arguments
            const int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
@@ -167,7 +167,7 @@ class GpuConvGrad3D(theano.Op):
            const int outputWidth = int( (vidWidth - filterWidth) / dc )+1;
            const int outputDur = int( (vidDur - filterDur) / dt ) +1;

-            if (CudaNdarray_HOST_DIMS(%(dCdH)s)[0] != batchSize || 
+            if (CudaNdarray_HOST_DIMS(%(dCdH)s)[0] != batchSize ||
                CudaNdarray_HOST_DIMS(%(dCdH)s)[4] != outputChannels ||
                CudaNdarray_HOST_DIMS(%(dCdH)s)[1] != outputHeight ||
                CudaNdarray_HOST_DIMS(%(dCdH)s)[2] != outputWidth ||
@@ -185,10 +185,10 @@ class GpuConvGrad3D(theano.Op):
            dims[2] = filterWidth;
            dims[3] = filterDur;

-            if(!(%(dCdW)s)  || CudaNdarray_HOST_DIMS(%(dCdW)s)[0]!=dims[0] || 
-                  CudaNdarray_HOST_DIMS(%(dCdW)s)[1]!=dims[1] || 
-                  CudaNdarray_HOST_DIMS(%(dCdW)s)[2]!=dims[2] || 
-                  CudaNdarray_HOST_DIMS(%(dCdW)s)[3]!=dims[3] || 
+            if(!(%(dCdW)s)  || CudaNdarray_HOST_DIMS(%(dCdW)s)[0]!=dims[0] ||
+                  CudaNdarray_HOST_DIMS(%(dCdW)s)[1]!=dims[1] ||
+                  CudaNdarray_HOST_DIMS(%(dCdW)s)[2]!=dims[2] ||
+                  CudaNdarray_HOST_DIMS(%(dCdW)s)[3]!=dims[3] ||
                  CudaNdarray_HOST_DIMS(%(dCdW)s)[4]!=dims[4] ){
               Py_XDECREF(%(dCdW)s);
               %(dCdW)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
@@ -219,7 +219,7 @@ if(out_contiguous && (version==0||version==-1) && WShape[4]<=512 && !work_comple
    //conv_rows_stack
    dim3 grid(WShape[0]*WShape[4],WShape[1]*WShape[2]);//outputHeight*outputWidth);
    dim3 threads(WShape[3]);
-    
+
    int shared_size=0;

        convgrad_rows_stack<<<grid, threads, shared_size>>>(
@@ -235,7 +235,7 @@ if(out_contiguous && (version==0||version==-1) && WShape[4]<=512 && !work_comple

        CNDA_THREAD_SYNC;
        cudaError_t sts = cudaGetLastError();
-        if (cudaSuccess == sts) 
+        if (cudaSuccess == sts)
        {
            work_complete = true;
 	    if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
@@ -248,18 +248,18 @@ if(out_contiguous && (version==0||version==-1) && WShape[4]<=512 && !work_comple
            PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConvGrad3D! (%%s)",
                    cudaGetErrorString(sts));
            %(fail)s
-        }      
+        }

 }
 if(!work_complete){
            PyErr_Format(PyExc_RuntimeError, "ERROR: no implementations executed for this GpuConv3D!");
            %(fail)s
 }
-}}}}} // extra scope for fail            
+}}}}} // extra scope for fail
            ///////////// < /code generated by GpuConvGrad3D >
-        """ 
+        """

-        return strutls.render_string(codeSource,locals()) 
+        return strutls.render_string(codeSource,locals())

    def c_support_code_apply(self, node, nodename):
        # This code is not sensitive to the ignore_border flag.
@@ -329,7 +329,7 @@ convgrad_rows_stack( float* img, float* dCdH, float* dCdW,
                                            dCdW[j,z,k,l,m] += dCdH[i,j,p,q,r] * V[i,z,dr*p+k,dc*q+l,dt*r+m]
 */
 """
-        return codeSource#renderString(codeSource,locals()) 
+        return codeSource#renderString(codeSource,locals())

 gpu_conv_grad3d = GpuConvGrad3D()


--- a/theano/sandbox/cuda/GpuConvTransp3D.py
+++ b/theano/sandbox/cuda/GpuConvTransp3D.py
@@ -14,10 +14,10 @@ class GpuConvTransp3D(theano.Op):
    """ The gpu version of ConvTransp3D """
    def __eq__(self,other):
 	    return type(self) == type(other)
-    
+
    def __hash__(self):
 	    return hash(type(self))
-    
+
    def make_node(self, W, b, d, H, RShape = None):
        W_ = as_cuda_ndarray_variable(W)
        b_ = as_cuda_ndarray_variable(b)
@@ -27,9 +27,9 @@ class GpuConvTransp3D(theano.Op):
            RShape_ = T.as_tensor_variable(RShape)
        else:
            RShape_ = T.as_tensor_variable([-1,-1,-1])
-            
+
        return theano.Apply(self, inputs=[W_,b_,d_,H_, RShape_],
-                            outputs = [CudaNdarrayType(dtype=H_.dtype, 
+                            outputs = [CudaNdarrayType(dtype=H_.dtype,
                                                       broadcastable=(False,)*5)()])

    def infer_shape(self, node, input_shapes):
@@ -51,7 +51,7 @@ class GpuConvTransp3D(theano.Op):

 		R = outputs[0]

-		codeSource =  """          
+		codeSource =  """
 			///////////// < code generated by GpuConvTransp3D >

 			//printf("\t\t\t\tGpuConvTransp c code\\n");
@@ -59,25 +59,25 @@ class GpuConvTransp3D(theano.Op):
 			//Check dimensionality of inputs
 			if (%(H)s->nd != 5)
 			{
-                PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: H must be a 5-D tensor but it is %%i-D",%(H)s->nd); 
+                PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: H must be a 5-D tensor but it is %%i-D",%(H)s->nd);
                %(fail)s
 			}

 			if (%(W)s->nd != 5)
 			{
-				PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: W must be a 5-D tensor"); 
+				PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: W must be a 5-D tensor");
                %(fail)s
 			}

 			if (%(b)s->nd != 1)
 			{
-				PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: b must be a vector"); 
+				PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: b must be a vector");
                %(fail)s
 			}
-			
+
 			if (%(d)s->nd != 1)
 			{
-				PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: d must be a vector"); 
+				PyErr_Format(PyExc_ValueError, "GpuConvTransp3D: d must be a vector");
                %(fail)s
 			}

@@ -106,7 +106,7 @@ class GpuConvTransp3D(theano.Op):

 			if (CudaNdarray_HOST_DIMS(%(H)s)[4] != outputChannels)
 			{
-                PyErr_Format(PyExc_ValueError, "W produces a %%i channel image but the image has %%i channels. W.shape: (%%i, %%i, %%i,%%i, %%i) H.shape: (%%i, %%i, %%i, %%i, %%i)",outputChannels,CudaNdarray_HOST_DIMS(%(H)s)[4], CudaNdarray_HOST_DIMS(%(W)s)[0], CudaNdarray_HOST_DIMS(%(W)s)[1], CudaNdarray_HOST_DIMS(%(W)s)[2], CudaNdarray_HOST_DIMS(%(W)s)[3], CudaNdarray_HOST_DIMS(%(W)s)[4], CudaNdarray_HOST_DIMS(%(H)s)[0], CudaNdarray_HOST_DIMS(%(H)s)[1], CudaNdarray_HOST_DIMS(%(H)s)[2], CudaNdarray_HOST_DIMS(%(H)s)[3], CudaNdarray_HOST_DIMS(%(H)s)[4]); 
+                PyErr_Format(PyExc_ValueError, "W produces a %%i channel image but the image has %%i channels. W.shape: (%%i, %%i, %%i,%%i, %%i) H.shape: (%%i, %%i, %%i, %%i, %%i)",outputChannels,CudaNdarray_HOST_DIMS(%(H)s)[4], CudaNdarray_HOST_DIMS(%(W)s)[0], CudaNdarray_HOST_DIMS(%(W)s)[1], CudaNdarray_HOST_DIMS(%(W)s)[2], CudaNdarray_HOST_DIMS(%(W)s)[3], CudaNdarray_HOST_DIMS(%(W)s)[4], CudaNdarray_HOST_DIMS(%(H)s)[0], CudaNdarray_HOST_DIMS(%(H)s)[1], CudaNdarray_HOST_DIMS(%(H)s)[2], CudaNdarray_HOST_DIMS(%(H)s)[3], CudaNdarray_HOST_DIMS(%(H)s)[4]);
                %(fail)s
 			}
 { // for fail
@@ -173,10 +173,10 @@ class GpuConvTransp3D(theano.Op):
 			dims[2] = videoWidth;
 			dims[3] = videoDur;

-                        if(!(%(R)s) || CudaNdarray_HOST_DIMS(%(R)s)[0]!=dims[0] || 
-                        CudaNdarray_HOST_DIMS(%(R)s)[1]!=dims[1] || 
-                        CudaNdarray_HOST_DIMS(%(R)s)[2]!=dims[2] || 
-                        CudaNdarray_HOST_DIMS(%(R)s)[3]!=dims[3] || 
+                        if(!(%(R)s) || CudaNdarray_HOST_DIMS(%(R)s)[0]!=dims[0] ||
+                        CudaNdarray_HOST_DIMS(%(R)s)[1]!=dims[1] ||
+                        CudaNdarray_HOST_DIMS(%(R)s)[2]!=dims[2] ||
+                        CudaNdarray_HOST_DIMS(%(R)s)[3]!=dims[3] ||
                        CudaNdarray_HOST_DIMS(%(R)s)[4]!=dims[4]){
 		            Py_XDECREF(%(R)s);
 			   %(R)s = (CudaNdarray*)CudaNdarray_NewDims(5,dims);
@@ -213,7 +213,7 @@ if(out_contiguous && (version==0||version==-1) && outputDur<=512 && !work_comple
    //conv_transp_rows_stack
    dim3 grid(batchSize * inputChannels, videoHeight * videoWidth);
    dim3 threads(videoDur);
-    
+
 HERE

    int shared_size=0;
@@ -230,7 +230,7 @@ HERE

        CNDA_THREAD_SYNC;
        cudaError_t sts = cudaGetLastError();
-        if (cudaSuccess == sts) 
+        if (cudaSuccess == sts)
        {
            work_complete = true;
 	    if (verbose>1) printf("threads.x=%%i, threads.y=%%i, grid.x=%%i, grid.y=%%i, shared_size=%%i, nb_threads=%%i\\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y);
@@ -243,7 +243,7 @@ HERE
            PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for GpuConvTransp3D! (%%s)",
                    cudaGetErrorString(sts));
            %(fail)s
-        }      
+        }


 }
@@ -258,8 +258,8 @@ if(!work_complete){

 }}}}}} // for fail
 			///////////// < /code generated by GpuConvTransp3D >
-		""" 
-		return renderString(codeSource,locals()) 
+		"""
+		return renderString(codeSource,locals())

    def c_support_code_apply(self, node, nodename):
        # This code is not sensitive to the ignore_border flag.
@@ -358,9 +358,9 @@ def computeR(W,b,d,H,Rshape = None):
 		assert len(W.shape) == 5
 		assert len(H.shape) == 5
 		assert len(b.shape) == 1
-		assert len(d) == 3		
+		assert len(d) == 3
+

-	
 		outputChannels, inputChannels, filterHeight, filterWidth, filterDur = W.shape
 		batchSize, outputChannelsAgain, outputHeight, outputWidth, outputDur = H.shape
 		assert outputChannelsAgain == outputChannels
@@ -382,18 +382,18 @@ def computeR(W,b,d,H,Rshape = None):
 			assert Rshape[1] >= videoWidth
 			assert Rshape[2] >= videoDur

-			#print "setting video size to Rshape = "+str(Rshape)	
+			#print "setting video size to Rshape = "+str(Rshape)

 			videoHeight, videoWidth, videoDur = Rshape
 		#else:
 		#	print "No Rshape passed in"
-		
-		#print "video size: "+str((videoHeight, videoWidth, videoDur))	
-	
+
+		#print "video size: "+str((videoHeight, videoWidth, videoDur))
+
 		R =  N.zeros( (batchSize, inputChannels, videoHeight,
 			videoWidth, videoDur ) , dtype=H.dtype)

-		#R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc] 
+		#R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
 		for i in xrange(0,batchSize):
 			#print '\texample '+str(i+1)+'/'+str(batchSize)
 			for j in xrange(0,inputChannels):
@@ -403,7 +403,7 @@ def computeR(W,b,d,H,Rshape = None):
 					for c in xrange(0,videoWidth):
 						for t in xrange(0,videoDur):
 							R[i,j,r,c,t] = b[j]
-	
+
 							ftc = max([0, int(N.ceil(float(t-filterDur +1  )/float(dt))) ])
 							fcc = max([0, int(N.ceil(float(c-filterWidth +1)/float(dc))) ])

@@ -424,7 +424,7 @@ def computeR(W,b,d,H,Rshape = None):
 										tk = t - tc * dt
 										if tk < 0:
 											break
-										
+
 										R[i,j,r,c,t] += N.dot(W[:,j,rk,ck,tk], H[i,:,rc,cc,tc] )

 										tc += 1
@@ -439,7 +439,7 @@ def computeR(W,b,d,H,Rshape = None):
 				"" #close loop over r
 			"" #close loop over j
 		"" #close loop over i
-	
+
 		return R



--- a/theano/tensor/nnet/Conv3D.py
+++ b/theano/tensor/nnet/Conv3D.py
@@ -27,15 +27,15 @@ from theano.tensor.blas import ldflags

 #partial C / partial b_j =  sum_i sum_k sum_r sum_c sum_t (partial C / partial H[i,r,c,t,k] ) * ( partial H[i,r,c,t,k] / partial b_j )
 # =  sum_i sum_k sum_r sum_c sum_t (partial C / partial H[i,r,c,t,k] )  * delta(k = j)
-# =  sum_i sum_r sum_c sum_t (partial C / partial H[i,r,c,t,j] ) 
+# =  sum_i sum_r sum_c sum_t (partial C / partial H[i,r,c,t,j] )


 #partial C / partial W[j,k,l,m,z] = sum_i sum_n sum_p sum_q sum_r (partial C /partial H[i,p,q,r,n] ) * (partial H[i,p,q,r,n] / partial W[j,k,l,m,z])
-# = partial C / partial W[j,k,l,m,z] = sum_i sum_n sum_p sum_q sum_r (partial C /partial H[i,p,q,r,n] ) * 
+# = partial C / partial W[j,k,l,m,z] = sum_i sum_n sum_p sum_q sum_r (partial C /partial H[i,p,q,r,n] ) *
 # (partial sum_s sum_u sum_v sum_a  W[n,a, s,u,v] V[i, dr*p+s,dc*q+u,dt*r+v, a] ) / partial W[j,k,l,m,z])
-# = partial C / partial W[j,k,l,m,z] = sum_i sum_p sum_q sum_r (partial C /partial H[i,p,q,r,j] ) * 
+# = partial C / partial W[j,k,l,m,z] = sum_i sum_p sum_q sum_r (partial C /partial H[i,p,q,r,j] ) *
 # (partial sum_s sum_u sum_v sum_a W[j,a, s,u,v] V[i,dr*p+s,dc*q+u,dt*r+v,a] ) / partial W[j,k,l,m,z])
-# = partial C / partial W[j,k,l,m,z] = sum_i sum_p sum_q sum_r (partial C /partial H[i,p,q,r,j] ) *  V[i,dr*p+k,dc*q+l,dt*r+m,z] 
+# = partial C / partial W[j,k,l,m,z] = sum_i sum_p sum_q sum_r (partial C /partial H[i,p,q,r,j] ) *  V[i,dr*p+k,dc*q+l,dt*r+m,z]

 #derivatives wrt V unimplemented for now. derivatives wrt dr, dc, dt are undefined since dr, dc, dt are natural numbers.

@@ -76,7 +76,7 @@ class Conv3D(theano.Op):
        #print "dCdH.broadcastable"
        #quit(-1)
        #dCdH = printing.Print("dCdH = ",["shape"])
-        
+
        dCdV = ConvTransp3D.convTransp3D(W, T.zeros_like(V[0,0,0,0,:]), d, dCdH, V.shape[1:4] )
        WShape = W.shape
        dCdW = ConvGrad3D.convGrad3D(V,d,WShape,dCdH)
@@ -92,7 +92,7 @@ class Conv3D(theano.Op):
    def infer_shape(self, node, input_shapes):
        V,W,b,d = node.inputs
        V_shape, W_shape, b_shape, d_shape = input_shapes
-        
+
        dr = d[0]
        dc = d[1]
        dt = d[2]
@@ -103,12 +103,12 @@ class Conv3D(theano.Op):
        vidWidth = V_shape[2]
        filterWidth = W_shape[2]
        vidDur = V_shape[3]
-        filterDur = W_shape[3]        
+        filterDur = W_shape[3]

        output_height = T.floor( (vidHeight - filterHeight) / dr )+1
        output_width = T.floor( (vidWidth - filterWidth) / dc )+1
        output_dur = T.floor( (vidDur - filterDur) / dt ) +1
-       
+

        return [(batch_size,  output_height, output_width, output_dur, output_channels )]

@@ -133,9 +133,9 @@ class Conv3D(theano.Op):
        fail = sub['fail']

        H = outputs[0]
-        

-        codeSource =  """          
+
+        codeSource =  """
            ///////////// < code generated by Conv3D >

            //printf("\t\t\t\tConv3D c code\\n");
@@ -143,14 +143,14 @@ class Conv3D(theano.Op):
            //Check dimensionality of inputs
            if (%(W)s->nd != 5)
            {
-                PyErr_Format(PyExc_ValueError, "Conv3D: W must be a 5 dimensional tensor"); 
+                PyErr_Format(PyExc_ValueError, "Conv3D: W must be a 5 dimensional tensor");
                            %(fail)s

            }

            if (%(V)s->nd != 5)
            {
-                PyErr_Format(PyExc_ValueError, "Conv3D: V must be a 5 dimensional tensor"); 
+                PyErr_Format(PyExc_ValueError, "Conv3D: V must be a 5 dimensional tensor");
                            %(fail)s
            }

@@ -180,13 +180,13 @@ class Conv3D(theano.Op):

            if (%(W)s->dimensions[4] != inputChannels)
            {
-                PyErr_Format(PyExc_ValueError, "Conv3D: W operates on a %%li channel image but the image has %%i channels.",%(W)s->dimensions[4],inputChannels); 
+                PyErr_Format(PyExc_ValueError, "Conv3D: W operates on a %%li channel image but the image has %%i channels.",%(W)s->dimensions[4],inputChannels);
                %(fail)s
            }

            if (%(b)s->dimensions[0] != outputChannels)
            {
-                PyErr_Format(PyExc_ValueError, "Conv3D: b adds to a(n) %%li channel output image but the output has %%i channels",%(b)s->dimensions[0],outputChannels); 
+                PyErr_Format(PyExc_ValueError, "Conv3D: b adds to a(n) %%li channel output image but the output has %%i channels",%(b)s->dimensions[0],outputChannels);
                %(fail)s
            }

@@ -221,7 +221,7 @@ class Conv3D(theano.Op):
            }

 { // extra scope so fail works
-            
+
            //Read and check stride arguments
            const int dr = *(dtype_%(d)s*) PyArray_GETPTR1(%(d)s,0);
            const int dc = *(dtype_%(d)s*) PyArray_GETPTR1(%(d)s,1);
@@ -249,10 +249,10 @@ class Conv3D(theano.Op):



-            if(!(%(H)s) || %(H)s->dimensions[0]!=dims[0] || 
-            %(H)s->dimensions[1]!=dims[1] || 
-            %(H)s->dimensions[2]!=dims[2] || 
-            %(H)s->dimensions[3]!=dims[3] || 
+            if(!(%(H)s) || %(H)s->dimensions[0]!=dims[0] ||
+            %(H)s->dimensions[1]!=dims[1] ||
+            %(H)s->dimensions[2]!=dims[2] ||
+            %(H)s->dimensions[3]!=dims[3] ||
            %(H)s->dimensions[4]!=dims[4]){
                Py_XDECREF(%(H)s);
                %(H)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, %(V)s->descr->type_num);
@@ -287,11 +287,11 @@ class Conv3D(theano.Op):
            // ex: filterDur == 1 && batchSize == 1 && dt = 1  (for SFA)
            // ex: inputChannels == 1 """

-        



-            
+
+


        #if the data types are not mixed, we can insert special case optimizations based on BLAS
@@ -326,7 +326,7 @@ class Conv3D(theano.Op):
                    long long Hposi = Hpos;
                    long long Vposi = Vpos;

- 
+
                    for (int r = 0;  r < outputHeight; r++) {
                      long long Hposr = Hpos;
                      long long Vposr = Vpos;
@@ -357,7 +357,7 @@ class Conv3D(theano.Op):

                            dtype_%(H)s * writePos = & ELEM_AT(%(H)s,Hpos);

-                            
+
                            for (int k =0; k < filterHeight; k++) {
                                  int Wposk = Wpos;
                                  long long Vposk = Vpos;
@@ -368,7 +368,7 @@ class Conv3D(theano.Op):

                                      //H[i,r,c,t,:] += N.dot(W[:,k,l,m,:],V[i,dr*r+k,dc*c+l,dt*t+m,:])

-                                  
+
                                      //note: changing the weights so that outputChannels and inputChannels were the last two rather than
                                      //the first and last elements did not speed this up, even for extremely large input sizes

@@ -395,11 +395,11 @@ class Conv3D(theano.Op):
                       Hpos = Hposr + %(H)s->strides[1];
                       Vpos = Vposr + %(V)s->strides[1] * dr;
                   } //closes r
-                   Hpos = Hposi + %(H)s->strides[0]; 
+                   Hpos = Hposi + %(H)s->strides[0];
                   Vpos = Vposi + %(V)s->strides[0];
              } //closes i

-                
+
            } //closes "lots of channels" special case code
            else
 """
@@ -414,7 +414,7 @@ class Conv3D(theano.Op):
                    long long Hposi = Hpos;
                    long long Vposi = Vpos;

- 
+
                    for (int r = 0;  r < outputHeight; r++) {
                      long long Hposr = Hpos;
                      long long Vposr = Vpos;
@@ -441,14 +441,14 @@ class Conv3D(theano.Op):
                                long long Hposj = Hpos;
                                long long Vposj = Vpos;
                                int Wposj = Wpos;
-                           
+
                                // H[i,r,c,t,j] = b[j]

                                dtype_%(H)s & writePos = ELEM_AT(%(H)s,Hpos);

-                            
+
                                writePos = ELEM_AT(%(b)s,bPos);
-                           
+

                                for (int k =0; k < filterHeight; k++) {
                                  int Wposk = Wpos;
@@ -462,9 +462,9 @@ class Conv3D(theano.Op):
                                      for (int z = 0; z < inputChannels; z++) {
                                        //H[i,r,c,t,j] += W[j,z,k,l,m] * V[i,dr*r+k, dc*c+l, dt*t+m,z]

-                                       
+
                                        writePos += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(V)s,Vpos);
-                      
+
                                        Wpos += ws4;
                                        Vpos += vs4;
                                      } // close z
@@ -478,7 +478,7 @@ class Conv3D(theano.Op):
                                  Vpos = Vposk + %(V)s->strides[1];
                                } //close k

-                                
+
                              bPos += bs;
                              Wpos = Wposj + ws0;
                              Hpos = Hposj +  hs4;
@@ -495,15 +495,15 @@ class Conv3D(theano.Op):
                       Hpos = Hposr + %(H)s->strides[1];
                       Vpos = Vposr + %(V)s->strides[1] * dr;
                   } //closes r
-                   Hpos = Hposi + %(H)s->strides[0]; 
+                   Hpos = Hposi + %(H)s->strides[0];
                   Vpos = Vposi + %(V)s->strides[0];
              } //closes i
            } //closes general case code
-}}}}}}} //extra scope so error handler jumps don't cross declarations  
+}}}}}}} //extra scope so error handler jumps don't cross declarations
            ///////////// < /code generated by Conv3D >
-        """ 
+        """

-        return strutil.renderString(codeSource,locals()) 
+        return strutil.renderString(codeSource,locals())

 global conv3D
 conv3D = Conv3D()
@@ -514,7 +514,7 @@ def computeH(V,W,b,d):
        if len(b.shape) != 1:
            print b.shape
            assert False
-        assert len(d) == 3        
+        assert len(d) == 3

        batchSize = V.shape[0]
        outputChannels = W.shape[0]
@@ -539,7 +539,7 @@ def computeH(V,W,b,d):
        outputWidth = int( (vidWidth - filterWidth) / dy )+1
        outputDur = int( (vidDur - filterDur) / dt ) +1

-        
+
        H =  N.zeros( (batchSize,  outputHeight,
            outputWidth, outputDur, outputChannels ), dtype=V.dtype )

@@ -563,7 +563,7 @@ def computeH(V,W,b,d):
                                            v = V[i,d[0]*x+k, d[1]*y+l, d[2]*t+m,z]
                                            #if i == 0 and x == 0 and y == 0 and t == 0 and j == 0:
                                            #    print 'setting H[0] += '+str(w*v)+'   W['+str((j,z,k,l,m))+']='+str(w)+'   V['+str((i,d[0]*x+k,d[1]*y+l,d[2]*t+m,z))+']='+str(v)
-                                            H[i,x,y,t,j] += w * v  
+                                            H[i,x,y,t,j] += w * v
        return H



--- a/theano/tensor/nnet/ConvGrad3D.py
+++ b/theano/tensor/nnet/ConvGrad3D.py
@@ -33,14 +33,14 @@ class ConvGrad3D(theano.Op):
        dLdd = None #not differentiable, since d is not continuous
        dLdWShape = None #not differentiable, since d is not continuous
        dLdB = conv3D( C, dLdA, T.zeros_like(B[0,0,0,0,:]), d)
-        
+
        return [ dLdC, dLdd, dLdWShape, dLdB ]

    def perform(self, node, inputs, output_storage):
        V, d, WShape, dCdH = inputs
        print "ConvGradW3D python code"

-        #partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) *  V[i,z,dr*p+k,dc*q+l,dt*r+m] 
+        #partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) *  V[i,z,dr*p+k,dc*q+l,dt*r+m]

        batchSize = dCdH.shape[0]
        outputFilters = dCdH.shape[4]
@@ -82,7 +82,7 @@ class ConvGrad3D(theano.Op):

        dCdW = outputs[0]

-        codeSource =  """          
+        codeSource =  """
            ///////////// < code generated by ConvGradW3D >

            //printf("\t\t\t\tConvGradW3D c code\\n");
@@ -90,13 +90,13 @@ class ConvGrad3D(theano.Op):
            //Check dimensionality of inputs
            if (%(dCdH)s->nd != 5)
            {
-                PyErr_Format(PyExc_ValueError, "ConvGrad3D: dCdH must be a 5 dimensional tensor"); 
+                PyErr_Format(PyExc_ValueError, "ConvGrad3D: dCdH must be a 5 dimensional tensor");
                            %(fail)s
            }

            if (%(V)s->nd != 5)
            {
-                PyErr_Format(PyExc_ValueError, "ConvGrad3D: V must be a 5 dimensional tensor"); 
+                PyErr_Format(PyExc_ValueError, "ConvGrad3D: V must be a 5 dimensional tensor");
                %(fail)s
            }

@@ -131,16 +131,16 @@ class ConvGrad3D(theano.Op):
                PyErr_Format(PyExc_ValueError,"ConvGrad3D: WShape must be contiguous");
                %(fail)s
            }
-  
+
 { //extra scope so that fail will not jump over declarations
            dtype_%(WShape)s * WShape = (dtype_%(WShape)s *) %(WShape)s->data;
            const int outputChannels =  WShape[0];
            const int inputChannels = %(V)s->dimensions[4];
            if (WShape[4] != inputChannels)
            {
-                PyErr_Format(PyExc_ValueError, "ConvGrad3D: W operates on a %%i channel image but the image has %%i channels",(int) WShape[1],inputChannels); 
+                PyErr_Format(PyExc_ValueError, "ConvGrad3D: W operates on a %%i channel image but the image has %%i channels",(int) WShape[1],inputChannels);
                %(fail)s
-              
+
            }
 { //extra scope so fail works
            const int filterHeight = WShape[1];
@@ -184,7 +184,7 @@ class ConvGrad3D(theano.Op):



-            if (%(dCdH)s->dimensions[0] != batchSize || 
+            if (%(dCdH)s->dimensions[0] != batchSize ||
                %(dCdH)s->dimensions[4] != outputChannels ||
                %(dCdH)s->dimensions[1] != outputHeight ||
                %(dCdH)s->dimensions[2] != outputWidth ||
@@ -202,10 +202,10 @@ class ConvGrad3D(theano.Op):
            dims[2] = filterWidth;
            dims[3] = filterDur;

-            if(!(%(dCdW)s)  || %(dCdW)s->dimensions[0]!=dims[0] || 
-                  %(dCdW)s->dimensions[1]!=dims[1] || 
-                  %(dCdW)s->dimensions[2]!=dims[2] || 
-                  %(dCdW)s->dimensions[3]!=dims[3] || 
+            if(!(%(dCdW)s)  || %(dCdW)s->dimensions[0]!=dims[0] ||
+                  %(dCdW)s->dimensions[1]!=dims[1] ||
+                  %(dCdW)s->dimensions[2]!=dims[2] ||
+                  %(dCdW)s->dimensions[3]!=dims[3] ||
                  %(dCdW)s->dimensions[4]!=dims[4] ){
               Py_XDECREF(%(dCdW)s);
               %(dCdW)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, %(V)s->descr->type_num);
@@ -241,9 +241,9 @@ class ConvGrad3D(theano.Op):
                                    for (int p = 0; p < outputHeight; p++) {
                                        for (int q = 0; q < outputWidth; q++) {
                                            int Hpos = i * %(dCdH)s->strides[0] + j * %(dCdH)s->strides[4] + p * %(dCdH)s->strides[1] + q * %(dCdH)s->strides[2] ;
-                                            int Vpos = i * %(V)s->strides[0] + z * %(V)s->strides[4] +  (dr * p+k) * %(V)s->strides[1] +  (dc*q+l) * %(V)s->strides[2] + m * %(V)s->strides[3]; 
+                                            int Vpos = i * %(V)s->strides[0] + z * %(V)s->strides[4] +  (dr * p+k) * %(V)s->strides[1] +  (dc*q+l) * %(V)s->strides[2] + m * %(V)s->strides[3];

-                                            for (int r = 0; r < outputDur; r++) {    
+                                            for (int r = 0; r < outputDur; r++) {
                                                writePos += ELEM5(%(dCdH)s,i,p,q,r,j) * ELEM5(%(V)s,i,dr*p+k,dc*q+l,dt*r+m,z);
                                                //writePos += ELEM_AT(%(dCdH)s,Hpos) * ELEM_AT(%(V)s,Vpos);
                                                Hpos += dhs3;
@@ -258,11 +258,11 @@ class ConvGrad3D(theano.Op):
                }
            }

-}}}}}}} // extra scope for fail            
+}}}}}}} // extra scope for fail
            ///////////// < /code generated by ConvGradW3D >
-        """ 
+        """

-        return strutil.renderString(codeSource,locals()) 
+        return strutil.renderString(codeSource,locals())


 convGrad3D = ConvGrad3D()

--- a/theano/tensor/nnet/ConvTransp3D.py
+++ b/theano/tensor/nnet/ConvTransp3D.py
@@ -8,10 +8,10 @@ class ConvTransp3D(theano.Op):
    """ "Transpose" of Conv3D (Conv3D implements multiplication by an implicitly defined matrix W. This implements multiplication by its transpose) """
    def __eq__(self,other):
 	    return type(self) == type(other)
-    
+
    def __hash__(self):
 	    return hash(type(self))
-    
+
    def make_node(self, W, b, d, H, RShape = None):
        """
        :param W: Weights, filter
@@ -27,7 +27,7 @@ class ConvTransp3D(theano.Op):
            RShape_ = T.as_tensor_variable(RShape)
        else:
            RShape_ = T.as_tensor_variable([-1,-1,-1])
-            
+
        return theano.Apply(self, inputs=[W_,b_,d_,H_, RShape_], outputs = [ T.TensorType(H_.dtype, (False,False,False,False,False))() ] )

    def c_compile_args(self):
@@ -62,7 +62,7 @@ class ConvTransp3D(theano.Op):

        R = outputs[0]

-        codeSource = """          
+        codeSource = """
 		    ///////////// < code generated by ConvTransp3D >

 		    //printf("\t\t\t\tConvTransp3D c code\\n");
@@ -70,25 +70,25 @@ class ConvTransp3D(theano.Op):
 		    //Check dimensionality of inputs
 		    if (%(H)s->nd != 5)
 		    {
-                        PyErr_Format(PyExc_ValueError, "H must be a 5-D tensor but it is %%i-D",%(H)s->nd); 
+                        PyErr_Format(PyExc_ValueError, "H must be a 5-D tensor but it is %%i-D",%(H)s->nd);
                        %(fail)s
 		    }

 		    if (%(W)s->nd != 5)
 		    {
-		         PyErr_Format(PyExc_ValueError, "ConvTransp3D: W must be a 5-D tensor"); 
+		         PyErr_Format(PyExc_ValueError, "ConvTransp3D: W must be a 5-D tensor");
                %(fail)s
 		    }

 		    if (%(b)s->nd != 1)
 		    {
-                         PyErr_Format(PyExc_ValueError, "ConvTransp3D: b must be a vector"); 
+                         PyErr_Format(PyExc_ValueError, "ConvTransp3D: b must be a vector");
                         %(fail)s
 		    }
-			
+
 		    if (%(d)s->nd != 1)
 		    {
-		         PyErr_Format(PyExc_ValueError, "ConvTransp3D: d must be a vector"); 
+		         PyErr_Format(PyExc_ValueError, "ConvTransp3D: d must be a vector");
                         %(fail)s
 		    }

@@ -103,14 +103,14 @@ class ConvTransp3D(theano.Op):
                         int dr = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,0);
                         int dc = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,1);
                         int dt = *(dtype_%(d)s*)PyArray_GETPTR1(%(d)s,2);
-                               
+
                         if (dr <= 0 || dc <= 0 || dt <= 0)
                         {
-                             PyErr_Format(PyExc_ValueError, "ConvTransp3D: Strides must all be positive but are %%i, %%i, %%i",dr,dc,dt); 
+                             PyErr_Format(PyExc_ValueError, "ConvTransp3D: Strides must all be positive but are %%i, %%i, %%i",dr,dc,dt);
                             %(fail)s
 	                  }
-                           
-                         
+
+
 			 //Read and check sizes of inputs

                        { // for fail 2
@@ -119,7 +119,7 @@ class ConvTransp3D(theano.Op):

 			    if (%(H)s->dimensions[4] != outputChannels)
 			    {
-                                PyErr_Format(PyExc_ValueError, "W produces a %%i channel image but the image has %%li channels. W.shape: (%%li, %%li, %%li,%%li, %%li) H.shape: (%%li, %%li, %%li, %%li, %%li)",outputChannels,%(H)s->dimensions[4], %(W)s->dimensions[0], %(W)s->dimensions[1], %(W)s->dimensions[2], %(W)s->dimensions[3], %(W)s->dimensions[4], %(H)s->dimensions[0], %(H)s->dimensions[1], %(H)s->dimensions[2], %(H)s->dimensions[3], %(H)s->dimensions[4]); 
+                                PyErr_Format(PyExc_ValueError, "W produces a %%i channel image but the image has %%li channels. W.shape: (%%li, %%li, %%li,%%li, %%li) H.shape: (%%li, %%li, %%li, %%li, %%li)",outputChannels,%(H)s->dimensions[4], %(W)s->dimensions[0], %(W)s->dimensions[1], %(W)s->dimensions[2], %(W)s->dimensions[3], %(W)s->dimensions[4], %(H)s->dimensions[0], %(H)s->dimensions[1], %(H)s->dimensions[2], %(H)s->dimensions[3], %(H)s->dimensions[4]);
                                %(fail)s
 			    }

@@ -150,16 +150,16 @@ class ConvTransp3D(theano.Op):
 	                        {
 	                            if (%(RShape)s->nd != 1)
 		                    {
-                                        PyErr_Format(PyExc_ValueError, "ConvTransp3D: RShape must be a vector"); 
+                                        PyErr_Format(PyExc_ValueError, "ConvTransp3D: RShape must be a vector");
                                        %(fail)s
 		                    }
- 
+
 		                    if (%(RShape)s->dimensions[0] != 3)
 		                    {
                                        PyErr_Format(PyExc_ValueError, "RShape must specify a 3D shape ( [height,width,duration] )");
                                        %(fail)s
 		                    }
-	
+
                                    dtype_%(RShape)s RShape0 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,0);
                                    dtype_%(RShape)s RShape1 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,1);
                                    dtype_%(RShape)s RShape2 = *(dtype_%(RShape)s*)PyArray_GETPTR1(%(RShape)s,2);
@@ -188,16 +188,16 @@ class ConvTransp3D(theano.Op):
 		                   dims[2] = videoWidth;
 		                   dims[3] = videoDur;

-                                   if(!(%(R)s) || %(R)s->dimensions[0]!=dims[0] || 
-                                    %(R)s->dimensions[1]!=dims[1] || 
-                                    %(R)s->dimensions[2]!=dims[2] || 
-                                    %(R)s->dimensions[3]!=dims[3] || 
+                                   if(!(%(R)s) || %(R)s->dimensions[0]!=dims[0] ||
+                                    %(R)s->dimensions[1]!=dims[1] ||
+                                    %(R)s->dimensions[2]!=dims[2] ||
+                                    %(R)s->dimensions[3]!=dims[3] ||
                                    %(R)s->dimensions[4]!=dims[4])
                                   {
 		                       Py_XDECREF(%(R)s);
 			               %(R)s = (PyArrayObject *) PyArray_SimpleNew(5, dims, %(H)s->descr->type_num);
 			               if (!(%(R)s)) {
-				           PyErr_Format(PyExc_MemoryError, "ConvTransp3D: could not allocate R"); 
+				           PyErr_Format(PyExc_MemoryError, "ConvTransp3D: could not allocate R");
                                           %(fail)s
 			               }
                                   }
@@ -205,25 +205,25 @@ class ConvTransp3D(theano.Op):
                                   for (int i = 0; i < 3; i++)
                                       if (%(R)s->strides[i] < %(R)s->strides[4])
                                       {
-                                           PyErr_Format(PyExc_ValueError, "ConvTransp3D: R must have the smallest stride in its last index, but it doesn't (if this is a problem, the only part of ConvTransp3D that depends on this conditions is the memset, so this is probably easy to fix)"); 
+                                           PyErr_Format(PyExc_ValueError, "ConvTransp3D: R must have the smallest stride in its last index, but it doesn't (if this is a problem, the only part of ConvTransp3D that depends on this conditions is the memset, so this is probably easy to fix)");
                                           %(fail)s
                                       }
-                                      
+
                                   { // for fail 6
-                                               
-                                        
+
+
 		                       memset(%(R)s->data, 0,  (batchSize-1) * %(R)s->strides[0]+ inputChannels * %(R)s->strides[4] +
                                          (videoHeight-1) * %(R)s->strides[1] +
                                          (videoWidth-1)  * %(R)s->strides[2] +
                                          (videoDur-1)    * %(R)s->strides[3]);
-                                             
-                                                  
-                                                    
+
+
+
 	 	                       #define ELEM5(x, i,j,k,l,m) * ( dtype_ ## x *) ( x->data + (i)*x->strides[0]+(j)*x->strides[1]+(k)*x->strides[2]+(l)*x->strides[3]+(m)*x->strides[4] )
 		                       #define ELEM_AT(x, i) * ( dtype_ ## x *) ( x->data + (i) )
-                                        
-                                      
-                                       
+
+
+
 		                       dtype_%(b)s * b = (dtype_%(b)s *) %(b)s->data;

                                       int rs4 = %(R)s->strides[4];
@@ -232,60 +232,60 @@ class ConvTransp3D(theano.Op):
                                       int hs4 = %(H)s->strides[4];

 		                       // Compute R
-		                       // R[i,r,c,t,j] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, rk, ck, tk,j] * H[i,rc,cc,tc,k] 
-			              
+		                       // R[i,r,c,t,j] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, rk, ck, tk,j] * H[i,rc,cc,tc,k]
+
 		                       for (int i = 0; i < batchSize; i++) {
 		                        for (int r = 0; r < videoHeight; r++) {
                                         const int frc = std::max(0.0, ceil(float(r-filterHeight+1)/float(dr)));
 			                 for (int c = 0; c < videoWidth; c++) {
 				          const int fcc = std::max(0.0, ceil(float(c-filterWidth +1)/float(dc)));
 			                  for (int t = 0; t < videoDur; t++) {
-				           const int ftc = std::max(0.0, ceil(float(t-filterDur +1)  /float(dt))); 
-                                          
+				           const int ftc = std::max(0.0, ceil(float(t-filterDur +1)  /float(dt)));
+
                                           long long Rpost = i * %(R)s->strides[0] + r * %(R)s->strides[1] + c * %(R)s->strides[2] + t * %(R)s->strides[3];
-                                           
+
                                           long long Rpos = Rpost;
                                           for (int j = 0; j < inputChannels; j++)
-                                           {  
+                                           {
                                            //ELEM5(%(R)s, i,r,c,t,j) = b[j];
                                            ELEM_AT(%(R)s,Rpos) = b[j];
                                            Rpos += rs4;
                                           }
-                                                
-                                           
+
+
 				           for (int rc = frc; rc < outputHeight; rc++) {
 				            const int rk = r - rc * dr;
 		                            if (rk < 0) break;
-                                                 
+
 				            for (int cc = fcc; cc < outputWidth; cc++) {
 			                     const int ck = c - cc * dc;
 			                     if (ck < 0) break;
-                                                 
+
 		                             for (int tc = ftc; tc < outputDur; tc++)
                                             {
 			                      const int tk = t - tc * dt;
 			                      if (tk < 0) break;
-                                           
+
 	                                      int Wpos = rk * %(W)s->strides[1] +  ck * %(W)s->strides[2] + tk * %(W)s->strides[3];
-		                              int Hpostc = i * %(H)s->strides[0] + 	rc * %(H)s->strides[1] +  cc * %(H)s->strides[2] + tc * %(H)s->strides[3];                            
+		                              int Hpostc = i * %(H)s->strides[0] + 	rc * %(H)s->strides[1] +  cc * %(H)s->strides[2] + tc * %(H)s->strides[3];
                                              Rpos = Rpost;
                                              for (int j = 0; j < inputChannels; j++)
                                              {
-                                               int Wposj = Wpos;    
+                                               int Wposj = Wpos;
                                               dtype_%(R)s & writePos = ELEM_AT(%(R)s,Rpos);
-                                               
+
                                               int Hpos = Hpostc;
-                                               
-                                               for (int k = 0; k < outputChannels; k++) {	
+
+                                               for (int k = 0; k < outputChannels; k++) {
 		                                //TODO-- it's probably bad in terms of cache that our inner loop is over the largest stride of W.... maybe OK since it's the smallest stride of H
                                                //writePos += ELEM5(%(W)s,k,rk,ck,tk,j) * ELEM5(%(H)s,i,rc,cc,tc,k);
 			                        //writePos += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(H)s,Hpos);
-                                               
-                                                writePos  += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(H)s,Hpos);	                                    
-                                              
+
+                                                writePos  += ELEM_AT(%(W)s,Wpos) * ELEM_AT(%(H)s,Hpos);
+
                                                Wpos += ws0;
                                                Hpos += hs4;
-			                             
+
 			                       } //close the k loop
                                               Rpos += rs4;
                                               Wpos = Wposj +  ws4;
@@ -304,9 +304,9 @@ class ConvTransp3D(theano.Op):
                   } //for fail 2
               } // for fail 1
 	       ///////////// < /code generated by ConvTransp3D >
-                     """ 
+                     """

-        return strutil.renderString(codeSource,locals()) 
+        return strutil.renderString(codeSource,locals())


 convTransp3D = ConvTransp3D()
@@ -316,14 +316,14 @@ def computeR(W,b,d,H,Rshape = None):
 		assert len(W.shape) == 5
 		assert len(H.shape) == 5
 		assert len(b.shape) == 1
-		assert len(d) == 3		
+		assert len(d) == 3
+

-	
 		outputChannels,  filterHeight, filterWidth, filterDur, inputChannels = W.shape
 		batchSize, outputHeight, outputWidth, outputDur, outputChannelsAgain = H.shape
 		assert outputChannelsAgain == outputChannels
 		assert b.shape[0] == inputChannels
-        
+

 		dr,dc,dt = d
 		assert dr > 0
@@ -341,18 +341,18 @@ def computeR(W,b,d,H,Rshape = None):
 			assert Rshape[1] >= videoWidth
 			assert Rshape[2] >= videoDur

-			#print "setting video size to Rshape = "+str(Rshape)	
+			#print "setting video size to Rshape = "+str(Rshape)

 			videoHeight, videoWidth, videoDur = Rshape
 		#else:
 		#	print "No Rshape passed in"
-		
-		#print "video size: "+str((videoHeight, videoWidth, videoDur))	
-	
+
+		#print "video size: "+str((videoHeight, videoWidth, videoDur))
+
 		R =  N.zeros( (batchSize, videoHeight,
 			videoWidth, videoDur, inputChannels ) , dtype=H.dtype)

-		#R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc] 
+		#R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
 		for i in xrange(0,batchSize):
 			#print '\texample '+str(i+1)+'/'+str(batchSize)
 			for j in xrange(0,inputChannels):
@@ -362,7 +362,7 @@ def computeR(W,b,d,H,Rshape = None):
 					for c in xrange(0,videoWidth):
 						for t in xrange(0,videoDur):
 							R[i,r,c,t,j] = b[j]
-	
+
 							ftc = max([0, int(N.ceil(float(t-filterDur +1  )/float(dt))) ])
 							fcc = max([0, int(N.ceil(float(c-filterWidth +1)/float(dc))) ])

@@ -383,7 +383,7 @@ def computeR(W,b,d,H,Rshape = None):
 										tk = t - tc * dt
 										if tk < 0:
 											break
-										
+
 										R[i,r,c,t,j] += N.dot(W[:,rk,ck,tk,j], H[i,rc,cc,tc,:] )

 										tc += 1
@@ -398,7 +398,7 @@ def computeR(W,b,d,H,Rshape = None):
 				"" #close loop over r
 			"" #close loop over j
 		"" #close loop over i
-	
+
 		return R