for (int col=0; col < kern_wid; col++) {//loop over col
sum+=data[col]*kern[col];
}
}
/**
 * Cooperatively fill the first N elements of dst with `value`.
 *
 * Each participating thread passes its own index `thread_id` in
 * [0, nb_thread) and the total participant count `nb_thread`; thread
 * `thread_id` writes elements thread_id, thread_id + nb_thread, ...
 * No synchronization is performed here; callers that need the buffer
 * visible to other threads must barrier afterwards.
 */
__device__ void fill(float * dst, int N, float value, int thread_id, int nb_thread){
    int idx = thread_id;
    while (idx < N) {
        dst[idx] = value;
        idx += nb_thread;
    }
}
/**
 * Integer division of a by b, rounded up (ceiling) for non-negative
 * operands.  Used to size grid/block splits.  Assumes b != 0; negative
 * operands follow C++ truncating-division rules and are not expected here.
 *
 * Fix: the original read `a %% b`, which is not valid C/C++.  `%%` is the
 * escaped form of `%` inside a Python `%`-template string (this file is
 * generated code); the emitted source must contain a single `%`.
 */
template <typename T>
static T ceil_intdiv(T a, T b)
{
    return (a / b) + ((a % b) ? 1 : 0);
}
/**
* As conv_patch_stack, but used for the full convolution by padding the image in shared memory.
* I keep it separated from conv_patch as we take 19-20 registers, which is more than the 10/16 max for each thread, and thus this could lower the occupancy.
* Implementation of the valid convolution that keep the full image and the full kernel in shared memory
* each thread compute only one value for the output if split is true. Otherwise compute ceil((float)out_len/N) pixel.
* thread block size=out_wid, nb_rows (optimized value is ceil(out_len/N))
* grid block size=batch_id, nkern
* dynamic shared memory: full mem: (img_len+2*kern_len-2)*(img_wid+2*kern_wid-2)+kern_len*kern_wid
const int len_to_load=min(%(kern_len)s+nb_rows,%(img_len)s-out_row_iter*nb_rows);//nb rows to load, min(nb_rows for this iter, nb rows left in the image)
const int empty_row = max(%(kern_len)s-1-out_row_iter*nb_rows,0);//number of empty row at the start
//we need to reload some rows because, when we move to the next out_row, we lose the last load due to the stack.
const int previous_row = min(out_row_iter*nb_rows,%(kern_len)s-1);//number of row from last out_row iteration to reload
ifFalseandself.subsample==(1,1)andself.border_mode=='full'andself.versionin[3,4,5,-1]andout_dim_3<=512and((self.logical_kern_hw[0]+2*self.logical_kern_hw[0]-2)*img_wid_padded*4+self.logical_kern_hw[0]*self.logical_kern_hw[1]*4<(16*1024-128))andout_.dtype=='float32'andkern_.dtype=='float32'andimg_.dtype=='float32':#-128 as this is the number of shared memory used statically
int version = %(version)s;
return"""
int verbose = %(verbose)s;
int dx = %(dx)s;
int dy = %(dy)s;
CudaNdarray* img = %(img)s;
int mode;
CudaNdarray* kern = %(kern)s;
if (strcmp(mode_str, "full") == 0)
CudaNdarray* out_ = %(out)s;
CudaNdarray* out = out;
int version = %(version)s;
const int verbose = %(verbose)s;
if (!img || img->nd != 4)
{
{
PyErr_SetString(PyExc_ValueError, "required img of 4D");
mode = ConvMode_FULL;
return -1;
}
}
if (! kern || kern->nd != 4)
else if (strcmp(mode_str, "valid") == 0)
{
{
PyErr_SetString(PyExc_ValueError, "required kern of 4D");
mode = ConvMode_VALID;
return -1;
}
int out_dim[4]={CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(kern)[0],
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
//Max of 16k of shared memory
if(version==5)
while ((((%(kern_len)s+ceil_intdiv(out_len,nb_split)-1)+2*%(kern_len)s-2)*%(img_wid_padded)s*sizeof(float) + kern_size_byte)>16*1024) nb_split++;
//327 as we use 25 register
//version 5 will have only 1 block running at a time, so we can use 32 registers per thread, but there is some other stuff that forces the limit to be lower than 512.
int max_thread = (version!=5?327:450);
while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++;
if(version==-1 && out_size>512)version=4;
if(version==-1)version=3;
if(version==-1 && nb_split>1) version=4;
else if(version==-1) version=3;
else if(version==3 && nb_split!=1) version=4;//we force version 4 when we need more then 1 split as to be always execute.
PyObject * CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray * out, const int mode, const int subsample_rows, const int subsample_cols, const int version, const int verbose);
//if we remove the restriction img_size_byte+kern_size_byte>8*1024, we can enter in condition where we will lower the occupency due to shared memory and/or registers.
if ((version == -1) && (out_size<64 || img_size_byte+kern_size_byte>8*1024) && out_size<=256){
//condition for exec
if(!subsample &&
out_contiguous &&
out_size<512 &&//Maximum of 512 theads by block
(img_size_byte+2*kern_wid*sizeof(float)+out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
!work_complete)
version = 7; //conv_patch_stack_reduce, switch to version 8/13 automatically if needed.
}
if (!subsample && c_contiguous &&
(version==0||version==2||version==-1) &&
out_wid<512 &&//Maximum of 512 theads by block
nstack == 1 &&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_patch
{
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if(version==2 && out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>512) nb_split++;
case 1: CONV_PATCH_SPECIAL(1); break;//test_conv.py:test_valid
case 2: CONV_PATCH_SPECIAL(2); break;//test_conv.py:test_valid
case 3: CONV_PATCH_SPECIAL(3); break;//test_conv.py:test_valid
case 4: CONV_PATCH_SPECIAL(4); break;
case 5: CONV_PATCH_SPECIAL(5); break;
case 6: CONV_PATCH_SPECIAL(6); break;
case 7: CONV_PATCH_SPECIAL(7); break;
case 10: CONV_PATCH_SPECIAL(10); break;
#endif
default:
if(!msgdisplayed_conv_patch__kern_width) {
printf("OPTIMISATION WARNING: conv_patch template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
case 12: CONV_PATCH_STACK_SPECIAL(12); break;//on cifar10
case 21: CONV_PATCH_STACK_SPECIAL(21); break;//on cifar10
case 23: CONV_PATCH_STACK_SPECIAL(23); break;//test_nnet.py:test_lenet_64
case 24: CONV_PATCH_STACK_SPECIAL(24); break;//on cifar10
case 25: CONV_PATCH_STACK_SPECIAL(25); break;//on cifar10
case 28: CONV_PATCH_STACK_SPECIAL(28); break;
case 32: CONV_PATCH_STACK_SPECIAL(32); break;// Alex speed example
case 45: CONV_PATCH_STACK_SPECIAL(45); break;//used by test_nnet.py:test_lenet_108
#endif
//////// default case
default:
if(!msgdisplayed_conv_patch_stack__kern_width) {
printf("OPTIMISATION HINT: conv_patch_stack template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
if (verbose) printf("INFO: impl 'conv_patch_stack' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
if (!subsample && out_contiguous &&
(version==4||version==-1) &&
out_wid<512 &&//Maximum of 512 threads by block
nstack == 1 &&// don't implement the stack in the kernel.
kern_len*img_wid*sizeof(float)+kern_size_byte<shared_avail &&//their is only 16k of shared memory
!work_complete) //conv_rows
{
dim3 threads(out_wid);
dim3 grid(out_len, nbatch*nkern);
int shared_size=(kern_len*img_wid + kern_size)*sizeof(float);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
#define CONV_ROWS_SPECIAL(kern_wid) \
if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows<kern_wid, false>;\
else f = conv_rows<kern_wid, true>;\
switch(kern_wid){
#ifdef UNROLL_LOOP
case 1: CONV_ROWS_SPECIAL(1); break;//test_conv.py:test_valid
case 2: CONV_ROWS_SPECIAL(2); break;//test_conv.py:test_valid
case 3: CONV_ROWS_SPECIAL(3); break;//test_conv.py:test_valid
case 4: CONV_ROWS_SPECIAL(4); break;//test_conv.py:test_valid
case 5: CONV_ROWS_SPECIAL(5); break;//test_conv.py:test_valid
// case 6: CONV_ROWS_SPECIAL(6); break;
case 7: CONV_ROWS_SPECIAL(7); break;//used by test_nnet.py:test_lenet_108
// case 8: CONV_ROWS_SPECIAL(8); break;
case 9: CONV_ROWS_SPECIAL(9); break;//used by test_nnet.py:test_lenet_256
case 10: CONV_ROWS_SPECIAL(10); break;//test_conv.py:test_valid
//////// Special cases
case 28: CONV_ROWS_SPECIAL(28); break;
#endif
//////// default case
default:
if(!msgdisplayed_conv_rows__kern_width){
printf("OPTIMISATION HINT: conv_rows template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
int shared_size=((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
#define CONV_ROWS_STACK_SPECIAL(kern_wid) \
if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_stack<kern_wid, false>;\
else f = conv_rows_stack<kern_wid, true>;\
switch(kern_wid){
#ifdef UNROLL_LOOP
case 1: CONV_ROWS_STACK_SPECIAL(1); break;//test_conv.py:test_valid
case 2: CONV_ROWS_STACK_SPECIAL(2); break;
case 3: CONV_ROWS_STACK_SPECIAL(3); break;//test_conv.py:test_valid
case 4: CONV_ROWS_STACK_SPECIAL(4); break;//test_conv.py:test_valid
case 5: CONV_ROWS_STACK_SPECIAL(5); break;//test_conv.py:test_valid
case 6: CONV_ROWS_STACK_SPECIAL(6); break;//test_conv.py:test_valid
case 7: CONV_ROWS_STACK_SPECIAL(7); break;//test_nnet.py:test_lenet_108
case 8: CONV_ROWS_STACK_SPECIAL(8); break;//test_conv.py:test_valid
case 9: CONV_ROWS_STACK_SPECIAL(9); break;//test_nnet.py:test_lenet_256
case 10: CONV_ROWS_STACK_SPECIAL(10); break;//test_conv.py:test_valid
//////// Special cases
case 23: CONV_ROWS_STACK_SPECIAL(23); break;//test_conv.py:test_valid
case 24: CONV_ROWS_STACK_SPECIAL(24); break;//test_conv.py:test_valid
case 28: CONV_ROWS_STACK_SPECIAL(28); break;//test_conv.py:test_valid
case 45: CONV_ROWS_STACK_SPECIAL(45); break;//test_nnet.py:test_lenet_64
case 102: CONV_ROWS_STACK_SPECIAL(102); break;//test_nnet.py:test_lenet_108
#endif
//////// default case
default:
if(!msgdisplayed_conv_rows_stack__kern_width){
printf("OPTIMISATION HINT: conv_rows_stack template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
if (verbose) printf("INFO: used 'conv_rows_stack2' version %s with %d row(s).\n",(version==9?"'load full kernel'":"'load 1 kern row at a time'"),nb_row);
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i version=%d\n",
if (verbose) printf("INFO: impl 'conv_rows_stack2' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
//version 8 is the same but we force the split. The split is need in case we have too much threads. This happen frequently if the kernel length is big. Big kernel is frequent in the gradient.
//version 8 need a minimum of kernel length as we force the split.
//version 8 is needed to test more easily this kernel template parameter.
(version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split.
(version!=13||kern_len>1) && //version 13 need a minimal kernel length as big as the split.
(img_size_byte+2*kern_wid*sizeof(float)+out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
!work_complete) //conv_patch_stack_reduce
{
int nb_split=1;
int full_kern=true;
if(version==8||version==13) nb_split++;//force the split.
if(version==13)full_kern=false;
while(ceil_intdiv(kern_len,nb_split)>64)nb_split++;//device 1.3 have a max of 64 thread in z
printf("OPTIMISATION HINT: conv_patch_stack_reduce template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray * out, int subsample_rows, int subsample_cols, int version = -1, int verbose=0)
{
const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file.
int work_complete = 0;
if (img->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required img of 4D");
return -1;
}
if (kern->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required kern of 4D");
return -1;
}
if (out->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required out of 4D");
return -1;
}
if (0)
{
//TODO: rethink these to use physical / logical dimensions, subsampling, offsets, etc.
bool img_batch_stack_contiguous = (img_stride_stack==img_stride_row*img_len) && (img_stride_batch==img_stride_stack*nstack);//don't support stride for nbatch and nstack
//if the lower 2 dims are c_contiguous but flipped, unflipping the stride and not flipping the kernel in shared memroy
//allow to use a version that use less registers(so is faster)
//the unflipped version of variable have the original value when we don't need to unflip it, but have the new value when we unflip it.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
//Max of 16k of shared memory
if(version==5)
while ((((kern_len+ceil_intdiv(out_len,nb_split)-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte)>shared_avail) nb_split++;
//327 as we use 25 register
//version 5 will have only 1 block running at a time, so we can use 32 registers per threads, but their is some other stuff that for the limit to bu lower then 512.
int max_thread = (version!=5?327:450);
while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++;
if(version==-1 && out_size>512)version=4;
if(version==-1)version=3;
if(version==-1 && nb_split>1) version=4;
else if(version==-1) version=3;
else if(version==3 && nb_split!=1) version=4;//we force version 4 when we need more then 1 split as to be always execute.
printf("OPTIMISATION HINT: conv_full_patch_stack_padded template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
if (verbose) printf("INFO: impl 'conv_full_patch' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
if (false && !subsample && //disabled as test fail for this kernel
(version==1||version==-1) &&
out_size<512 &&//Maximum of 512 theads by block
(nbatch > 20 || version==1) && // we only launch nbatch blocks, so make sure there is enough to be worth it, but if we specify the version, this check should not be done to allow testing.
nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_full_load_everything
{
dim3 threads(out_wid, out_len);
dim3 grid(nbatch);
int shared_size=(img_size + kern_size)*nstack*sizeof(float);
//TODO assert c_continious for img, kern and out in the 2 inner dimensions.
//case 10: f = conv_full_load_everything<10>; break;
//case 30: f = conv_full_load_everything<30>; break; //This is actually slower than the general version??
#endif
default:
printf("OPTIMISATION HINT: conv_full_load_everything template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
f = conv_full_load_everything<0>;
};
f<<< grid, threads, shared_size>>>
(img->devdata,
kern->devdata,
out->devdata,
img_len, img_wid,
kern_len, kern_wid,
nkern, nstack,
CudaNdarray_HOST_STRIDES(img)[3],
CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(img)[0],
CudaNdarray_HOST_STRIDES(kern)[3],
CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[0]
);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
if (verbose) printf("INFO: used 'conv_full_load_everything' version\n");
{"dot", CudaNdarray_Dot, METH_VARARGS, "Returns the matrix product of two CudaNdarray arguments."},
{"dot", CudaNdarray_Dot, METH_VARARGS, "Returns the matrix product of two CudaNdarray arguments."},
{"conv", (PyCFunction)CudaNdarray_Conv_VARARGS, METH_VARARGS|METH_KEYWORDS, "Returns the 2D convolution of one CudaNdarray argument with another. WRITEME"},
{"gpu_init", CudaNdarray_gpu_init, METH_VARARGS, "Allow to select the gpu card to use."},
{"gpu_init", CudaNdarray_gpu_init, METH_VARARGS, "Allow to select the gpu card to use."},
{"filter", filter, METH_VARARGS, "no doc!"},
{"filter", filter, METH_VARARGS, "no doc!"},
{NULL, NULL, NULL, NULL} /* Sentinel */
{NULL, NULL, NULL, NULL} /* Sentinel */
...
@@ -2379,1273 +2283,3 @@ CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pattern
...
@@ -2379,1273 +2283,3 @@ CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pattern
//if we remove the restriction img_size_byte+kern_size_byte>8*1024, we can enter in condition where we will lower the occupency due to shared memory and/or registers.
if ((version == -1) && (out_size<64 || img_size_byte+kern_size_byte>8*1024) && out_size<=256){
//condition for exec
if(!subsample &&
out_contiguous &&
out_size<512 &&//Maximum of 512 theads by block
(img_size_byte+2*kern_wid*sizeof(float)+out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
!work_complete)
version = 7; //conv_patch_stack_reduce, switch to version 8/13 automatically if needed.
}
if (!subsample && c_contiguous &&
(version==0||version==2||version==-1) &&
out_wid<512 &&//Maximum of 512 theads by block
nstack == 1 &&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_patch
{
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if(version==2 && out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>512) nb_split++;
case 1: CONV_PATCH_SPECIAL(1); break;//test_conv.py:test_valid
case 2: CONV_PATCH_SPECIAL(2); break;//test_conv.py:test_valid
case 3: CONV_PATCH_SPECIAL(3); break;//test_conv.py:test_valid
case 4: CONV_PATCH_SPECIAL(4); break;
case 5: CONV_PATCH_SPECIAL(5); break;
case 6: CONV_PATCH_SPECIAL(6); break;
case 7: CONV_PATCH_SPECIAL(7); break;
case 10: CONV_PATCH_SPECIAL(10); break;
#endif
default:
if(!msgdisplayed_conv_patch__kern_width) {
printf("OPTIMISATION WARNING: conv_patch template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
case 12: CONV_PATCH_STACK_SPECIAL(12); break;//on cifar10
case 21: CONV_PATCH_STACK_SPECIAL(21); break;//on cifar10
case 23: CONV_PATCH_STACK_SPECIAL(23); break;//test_nnet.py:test_lenet_64
case 24: CONV_PATCH_STACK_SPECIAL(24); break;//on cifar10
case 25: CONV_PATCH_STACK_SPECIAL(25); break;//on cifar10
case 28: CONV_PATCH_STACK_SPECIAL(28); break;
case 32: CONV_PATCH_STACK_SPECIAL(32); break;// Alex speed example
case 45: CONV_PATCH_STACK_SPECIAL(45); break;//used by test_nnet.py:test_lenet_108
#endif
//////// default case
default:
if(!msgdisplayed_conv_patch_stack__kern_width) {
printf("OPTIMISATION HINT: conv_patch_stack template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
if (verbose) printf("INFO: impl 'conv_patch_stack' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
if (!subsample && out_contiguous &&
(version==4||version==-1) &&
out_wid<512 &&//Maximum of 512 threads by block
nstack == 1 &&// don't implement the stack in the kernel.
kern_len*img_wid*sizeof(float)+kern_size_byte<shared_avail &&//their is only 16k of shared memory
!work_complete) //conv_rows
{
dim3 threads(out_wid);
dim3 grid(out_len, nbatch*nkern);
int shared_size=(kern_len*img_wid + kern_size)*sizeof(float);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
#define CONV_ROWS_SPECIAL(kern_wid) \
if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows<kern_wid, false>;\
else f = conv_rows<kern_wid, true>;\
switch(kern_wid){
#ifdef UNROLL_LOOP
case 1: CONV_ROWS_SPECIAL(1); break;//test_conv.py:test_valid
case 2: CONV_ROWS_SPECIAL(2); break;//test_conv.py:test_valid
case 3: CONV_ROWS_SPECIAL(3); break;//test_conv.py:test_valid
case 4: CONV_ROWS_SPECIAL(4); break;//test_conv.py:test_valid
case 5: CONV_ROWS_SPECIAL(5); break;//test_conv.py:test_valid
// case 6: CONV_ROWS_SPECIAL(6); break;
case 7: CONV_ROWS_SPECIAL(7); break;//used by test_nnet.py:test_lenet_108
// case 8: CONV_ROWS_SPECIAL(8); break;
case 9: CONV_ROWS_SPECIAL(9); break;//used by test_nnet.py:test_lenet_256
case 10: CONV_ROWS_SPECIAL(10); break;//test_conv.py:test_valid
//////// Special cases
case 28: CONV_ROWS_SPECIAL(28); break;
#endif
//////// default case
default:
if(!msgdisplayed_conv_rows__kern_width){
printf("OPTIMISATION HINT: conv_rows template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
int shared_size=((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
#define CONV_ROWS_STACK_SPECIAL(kern_wid) \
if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_stack<kern_wid, false>;\
else f = conv_rows_stack<kern_wid, true>;\
switch(kern_wid){
#ifdef UNROLL_LOOP
case 1: CONV_ROWS_STACK_SPECIAL(1); break;//test_conv.py:test_valid
case 2: CONV_ROWS_STACK_SPECIAL(2); break;
case 3: CONV_ROWS_STACK_SPECIAL(3); break;//test_conv.py:test_valid
case 4: CONV_ROWS_STACK_SPECIAL(4); break;//test_conv.py:test_valid
case 5: CONV_ROWS_STACK_SPECIAL(5); break;//test_conv.py:test_valid
case 6: CONV_ROWS_STACK_SPECIAL(6); break;//test_conv.py:test_valid
case 7: CONV_ROWS_STACK_SPECIAL(7); break;//test_nnet.py:test_lenet_108
case 8: CONV_ROWS_STACK_SPECIAL(8); break;//test_conv.py:test_valid
case 9: CONV_ROWS_STACK_SPECIAL(9); break;//test_nnet.py:test_lenet_256
case 10: CONV_ROWS_STACK_SPECIAL(10); break;//test_conv.py:test_valid
//////// Special cases
case 23: CONV_ROWS_STACK_SPECIAL(23); break;//test_conv.py:test_valid
case 24: CONV_ROWS_STACK_SPECIAL(24); break;//test_conv.py:test_valid
case 28: CONV_ROWS_STACK_SPECIAL(28); break;//test_conv.py:test_valid
case 45: CONV_ROWS_STACK_SPECIAL(45); break;//test_nnet.py:test_lenet_64
case 102: CONV_ROWS_STACK_SPECIAL(102); break;//test_nnet.py:test_lenet_108
#endif
//////// default case
default:
if(!msgdisplayed_conv_rows_stack__kern_width){
printf("OPTIMISATION HINT: conv_rows_stack template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
if (verbose) printf("INFO: used 'conv_rows_stack2' version %s with %d row(s).\n",(version==9?"'load full kernel'":"'load 1 kern row at a time'"),nb_row);
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i version=%d\n",
if (verbose) printf("INFO: impl 'conv_rows_stack2' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
//version 8 is the same but we force the split. The split is need in case we have too much threads. This happen frequently if the kernel length is big. Big kernel is frequent in the gradient.
//version 8 need a minimum of kernel length as we force the split.
//version 8 is needed to test more easily this kernel template parameter.
(version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split.
(version!=13||kern_len>1) && //version 13 need a minimal kernel length as big as the split.
(img_size_byte+2*kern_wid*sizeof(float)+out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
!work_complete) //conv_patch_stack_reduce
{
int nb_split=1;
int full_kern=true;
if(version==8||version==13) nb_split++;//force the split.
if(version==13)full_kern=false;
while(ceil_intdiv(kern_len,nb_split)>64)nb_split++;//device 1.3 have a max of 64 thread in z
printf("OPTIMISATION HINT: conv_patch_stack_reduce template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray * out, int subsample_rows, int subsample_cols, int version = -1, int verbose=0)
{
const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file.
int work_complete = 0;
if (img->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required img of 4D");
return -1;
}
if (kern->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required kern of 4D");
return -1;
}
if (out->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required out of 4D");
return -1;
}
if (0)
{
//TODO: rethink these to use physical / logical dimensions, subsampling, offsets, etc.
bool img_batch_stack_contiguous = (img_stride_stack==img_stride_row*img_len) && (img_stride_batch==img_stride_stack*nstack);//don't support stride for nbatch and nstack
//if the lower 2 dims are c_contiguous but flipped, unflipping the stride and not flipping the kernel in shared memroy
//allow to use a version that use less registers(so is faster)
//the unflipped version of variable have the original value when we don't need to unflip it, but have the new value when we unflip it.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
//Max of 16k of shared memory
if(version==5)
while ((((kern_len+ceil_intdiv(out_len,nb_split)-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte)>shared_avail) nb_split++;
//327 as we use 25 register
//version 5 will have only 1 block running at a time, so we can use 32 registers per threads, but their is some other stuff that for the limit to bu lower then 512.
int max_thread = (version!=5?327:450);
while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++;
if(version==-1 && out_size>512)version=4;
if(version==-1)version=3;
if(version==-1 && nb_split>1) version=4;
else if(version==-1) version=3;
else if(version==3 && nb_split!=1) version=4;//we force version 4 when we need more then 1 split as to be always execute.
printf("OPTIMISATION HINT: conv_full_patch_stack_padded template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
if (verbose) printf("INFO: impl 'conv_full_patch' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
if (false && !subsample && //disabled as test fail for this kernel
(version==1||version==-1) &&
out_size<512 &&//Maximum of 512 theads by block
(nbatch > 20 || version==1) && // we only launch nbatch blocks, so make sure there is enough to be worth it, but if we specify the version, this check should not be done to allow testing.
nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_full_load_everything
{
dim3 threads(out_wid, out_len);
dim3 grid(nbatch);
int shared_size=(img_size + kern_size)*nstack*sizeof(float);
//TODO assert c_continious for img, kern and out in the 2 inner dimensions.
//case 10: f = conv_full_load_everything<10>; break;
//case 30: f = conv_full_load_everything<30>; break; //This is actually slower than the general version??
#endif
default:
printf("OPTIMISATION HINT: conv_full_load_everything template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
f = conv_full_load_everything<0>;
};
f<<< grid, threads, shared_size>>>
(img->devdata,
kern->devdata,
out->devdata,
img_len, img_wid,
kern_len, kern_wid,
nkern, nstack,
CudaNdarray_HOST_STRIDES(img)[3],
CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(img)[0],
CudaNdarray_HOST_STRIDES(kern)[3],
CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[0]
);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
if (verbose) printf("INFO: used 'conv_full_load_everything' version\n");