//template c_contiguous: if true, the img and kern are column and row contiguous; else we use the stride values from the param. The image needs to be c_contiguous in the nbatch and nstack dimensions.
for (; col < max_col; col++, img_col++) {//loop over col
sum+=idx_in[col]*idx_kern[col];
}
}
//Needed as not all threads finish the loop at the same time
//and we don't want to overwrite the shared memory.
__syncthreads();
}
out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*blockIdx.y+//the output image
out_row*out_wid+out_col] = sum;
}
/**
* As conv_patch_stack, but used for the full convolution by padding the image in shared memory.
* I keep it separated from conv_patch as we take 19-20 registers, which is more than the 10/16 max for each thread and thus this could lower the occupancy.
* Implementation of the valid convolution that keep the full image and the full kernel in shared memory
* each thread computes only one value for the output if split is true; otherwise it computes ceil((float)out_len/N) pixels.
* thread block size=out_wid, nb_rows (optimized value is ceil(out_len/N))
* grid block size=batch_id, nkern
* dynamic shared memory: full mem: (img_len+2*kern_len-2)*(img_wid+2*kern_wid-2)+kern_len*kern_wid
* nkern: the number of kernel, used to compute the output image to store the result
* nstack: the size of the stack, used to compute the image to load.
* template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
* template img_contiguous: if true, the img is column and row contiguous
* template preload_full_kern: works only when split is true. We don't load the full kernel at once; instead we load ceil_intdiv(kern_len/nb_split) kernel rows at a time
*/
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool preload_full_kern>
out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image
out_row*out_wid+out_col] = sum;
}
/**
* Implementation of 'valid' mode convolution that uses one block per output pixel, and uses a sum-reduce within each block to compute the
* kernel-image inner-product in parallel.
*
* This implementation uses shared memory for the reduce, so it is limited by the product of stacklen x kern_len
*
* template stack_loop: if true, we accept that blockDim.x < nstack and we add a loop for this (uses 3 more registers, so lower occupancy when true, but accepts nstack*kern_len > 512)
* TODO: explain parameters, preconditions
*/
template<bool stack_loop>
__global__ void
conv_valid_row_reduce(int nB, int nK, int stacklen,
int img_len, int img_wid,
int kern_len, int kern_wid,
int out_len, int out_wid, //physical
float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
int subsample_rows, int subsample_cols,
const int initial_reduce_boundary)
{
const int outsize = nB * nK * out_len * out_wid;
extern __shared__ float reducebuf[];
for (int i = blockIdx.x; i < /*physical*/outsize; i += gridDim.x)
{
//figure out what output element we're in charge of computing
int ii = i;
int iB = ii % nB; // output batch index
ii = ii / nB;
int iK = ii % nK; // output kernel index
ii = ii / nK;
int iR_physical = ii % out_len; //output kernel row
int iC_physical = ii / out_len; // output kernel column
int iR_logical = iR_physical * subsample_rows;
int iC_logical = iC_physical * subsample_cols;
int ss = threadIdx.x;
int rr = threadIdx.y;
int img_rr = iR_logical + kern_len - 1 - rr;
int reduceIdx = threadIdx.x * blockDim.y + threadIdx.y;