for (int col=0; col < kern_wid; col++) {//loop over col
sum+=data[col]*kern[col];
}
}
/**
 * Cooperatively fill the first N floats of dst with `value`.
 * The work is strided across nb_thread threads: the thread identified by
 * thread_id writes elements thread_id, thread_id+nb_thread, thread_id+2*nb_thread, ...
 * Callers must ensure every thread_id in [0, nb_thread) participates so the
 * whole range gets covered.
 */
__device__ void fill(float * dst, int N, float value, int thread_id, int nb_thread){
    int idx = thread_id;
    while (idx < N) {
        dst[idx] = value;
        idx += nb_thread;
    }
}
// Ceiling integer division: returns ceil(a / b) for integral T without
// going through floating point (adds 1 when the division has a remainder).
// NOTE(review): 'a %% b' is a doubled percent sign. That is only correct if
// this text is later substituted through a %%-style format string (as the
// surrounding %(kern_len)s placeholders suggest), where '%%' collapses to '%'.
// If this is raw CUDA source it should read 'a % b' -- confirm against the
// enclosing file before changing.
template <typename T>
static T ceil_intdiv(T a, T b)
{
return (a/b) + ((a %% b) ? 1: 0);
}
/**
* As conv_patch_stack, but used for the full convolution by padding the image in shared memory.
 * I keep it separated from conv_patch as we take 19-20 registers, which is more than the 10/16 max for each thread, and thus this could lower the occupancy.
* Implementation of the valid convolution that keep the full image and the full kernel in shared memory
* each thread compute only one value for the output if split is true. Otherwise compute ceil((float)out_len/N) pixel.
* thread block size=out_wid, nb_rows (optimized value is ceil(out_len/N))
* grid block size=batch_id, nkern
* dynamic shared memory: full mem: (img_len+2*kern_len-2)*(img_wid+2*kern_wid-2)+kern_len*kern_wid
const int len_to_load=min(%(kern_len)s+nb_rows,%(img_len)s-out_row_iter*nb_rows);//nb rows to load, min(nb_rows for this iter, nb rows left in the image)
const int empty_row = max(%(kern_len)s-1-out_row_iter*nb_rows,0);//number of empty row at the start
//we need to reload some rows, as when we move to the next out_row we lose the last load due to the stack.
const int previous_row = min(out_row_iter*nb_rows,%(kern_len)s-1);//number of row from last out_row iteration to reload
if True and self.subsample == (1, 1) and self.border_mode == 'full' and self.version in [3, 4, 5, -1] and out_dim_3 <= 512 and ((self.logical_kern_hw[0] + 2 * self.logical_kern_hw[0] - 2) * img_wid_padded * 4 + self.logical_kern_hw[0] * self.logical_kern_hw[1] * 4 < (16 * 1024 - 128)) and out_.dtype == 'float32' and kern_.dtype == 'float32' and img_.dtype == 'float32':  # -128 as this is the amount of shared memory used statically
return"""
CudaNdarray* img = %(img)s;
CudaNdarray* kern = %(kern)s;
CudaNdarray* out_ = %(out)s;
CudaNdarray* out = out_;
int version = %(version)s;
const int verbose = %(verbose)s;
if (!img || img->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required img of 4D");
return -1;
}
if (! kern || kern->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required kern of 4D");
return -1;
}
int out_dim[4]={CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(kern)[0],
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
//Max of 16k of shared memory
if(version==5)
while ((((%(kern_len)s+ceil_intdiv(out_len,nb_split)-1)+2*%(kern_len)s-2)*%(img_wid_padded)s*sizeof(float) + kern_size_byte)>16*1024) nb_split++;
//327 as we use 25 register
//version 5 will have only 1 block running at a time, so we can use 32 registers per thread, but there is some other stuff that forces the limit to be lower than 512.
int max_thread = (version!=5?327:450);
while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++;
if(version==-1 && out_size>512)version=4;
if(version==-1)version=3;
if(version==-1 && nb_split>1) version=4;
else if(version==-1) version=3;
else if(version==3 && nb_split!=1) version=4;//we force version 4 when we need more than 1 split, so that it is always executed.