change tab to space

7b68154f · Frederic · 7bec4ff8 · 7b68154f · 7b68154f · 7b68154f
--- a/theano/sandbox/cuda/conv.cu
+++ b/theano/sandbox/cuda/conv.cu
@@ -9,8 +9,8 @@ PyObject * CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray *
 */
 int
 CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
-		       CudaNdarray * out, int subsample_rows, int subsample_cols,
-		       int version = -1, int verbose=0)
+                       CudaNdarray * out, int subsample_rows, int subsample_cols,
+                       int version = -1, int verbose=0)
 {
    int work_complete = 0;
    const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file.
@@ -33,28 +33,28 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
    if (verbose>1)
    {
        fprintf(stderr,
-		"INFO: Running conv_valid version=%d,"
-		" MACRO kern_width=%d with inputs:\n",
-		version, THEANO_KERN_WID);
+                "INFO: Running conv_valid version=%d,"
+                " MACRO kern_width=%d with inputs:\n",
+                version, THEANO_KERN_WID);
        fprintf(stderr,
-		"INFO:   img  dim: %i %i %i %i  img  stride: %i %i %i %i\n",
+                "INFO:   img  dim: %i %i %i %i  img  stride: %i %i %i %i\n",
                CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],
-		CudaNdarray_HOST_DIMS(img)[2],CudaNdarray_HOST_DIMS(img)[3],
+                CudaNdarray_HOST_DIMS(img)[2],CudaNdarray_HOST_DIMS(img)[3],
                CudaNdarray_HOST_STRIDES(img)[0],
-		CudaNdarray_HOST_STRIDES(img)[1],
-		CudaNdarray_HOST_STRIDES(img)[2],
-		CudaNdarray_HOST_STRIDES(img)[3]);
+                CudaNdarray_HOST_STRIDES(img)[1],
+                CudaNdarray_HOST_STRIDES(img)[2],
+                CudaNdarray_HOST_STRIDES(img)[3]);
        fprintf(stderr,
-		"INFO:   kern dim: %i %i %i %i  kern stride: %i %i %i %i\n",
+                "INFO:   kern dim: %i %i %i %i  kern stride: %i %i %i %i\n",
                CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],
-		CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
+                CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
                CudaNdarray_HOST_STRIDES(kern)[0],
-		CudaNdarray_HOST_STRIDES(kern)[1],
-		CudaNdarray_HOST_STRIDES(kern)[2],
-		CudaNdarray_HOST_STRIDES(kern)[3]);
+                CudaNdarray_HOST_STRIDES(kern)[1],
+                CudaNdarray_HOST_STRIDES(kern)[2],
+                CudaNdarray_HOST_STRIDES(kern)[3]);
        fprintf(stderr,
-		"INFO:   subsample_rows=%d, subsample_cols=%d\n",
-		subsample_rows, subsample_cols);
+                "INFO:   subsample_rows=%d, subsample_cols=%d\n",
+                subsample_rows, subsample_cols);
    }

    //Check the output size is valid
@@ -99,8 +99,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
    const int out_size_byte = out_size*sizeof(float);
    if (!((THEANO_KERN_WID == CudaNdarray_HOST_DIMS(kern)[3]) || (THEANO_KERN_WID==0))){
      PyErr_Format(PyExc_ValueError, "ERROR: This GpuConv code was compiled for"
-		   " %d kernel columns, but the kernel we received had %d columns!",
-		   THEANO_KERN_WID, CudaNdarray_HOST_DIMS(kern)[3]);
+                   " %d kernel columns, but the kernel we received had %d columns!",
+                   THEANO_KERN_WID, CudaNdarray_HOST_DIMS(kern)[3]);
      return -1;
    }

@@ -136,222 +136,222 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
    //img_size_byte+kern_size_byte>8*1024, we can enter in condition where
    //we will lower the occupency due to shared memory and/or registers.
    if ((version == -1) &&
-	(out_size<64 || img_size_byte+kern_size_byte>8*1024) &&
-	out_size<=256){
+        (out_size<64 || img_size_byte+kern_size_byte>8*1024) &&
+        out_size<=256){
      //condition for exec 
      if(!subsample &&
-	out_contiguous &&
-	out_size<512 &&//Maximum of 512 theads by block
-	 std::max(int(img_size_byte+2*kern_wid*sizeof(float)), out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
-	!work_complete)
-	version = 7; //conv_patch_stack_reduce, switch to version 8/13 automatically if needed.
+        out_contiguous &&
+        out_size<512 &&//Maximum of 512 theads by block
+         std::max(int(img_size_byte+2*kern_wid*sizeof(float)), out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
+        !work_complete)
+        version = 7; //conv_patch_stack_reduce, switch to version 8/13 automatically if needed.
    }

    if (!subsample && c_contiguous &&
-	(version==0||version==2||version==-1) &&
-	out_wid<512 &&//Maximum of 512 theads by block
-	nstack == 1 &&// don't implement the stack in the kernel.
-	img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
-	!work_complete) //conv_patch
+        (version==0||version==2||version==-1) &&
+        out_wid<512 &&//Maximum of 512 theads by block
+        nstack == 1 &&// don't implement the stack in the kernel.
+        img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
+        !work_complete) //conv_patch
    {
        int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
-	if(version==2 && out_len>1)nb_split++;//to force the use of split=true when testing.
-	//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
-	while (ceil_intdiv(out_len,nb_split)*out_wid>512) nb_split++;
+        if(version==2 && out_len>1)nb_split++;//to force the use of split=true when testing.
+        //we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
+        while (ceil_intdiv(out_len,nb_split)*out_wid>512) nb_split++;
        dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));

        dim3 grid(nbatch, nkern);
        int shared_size=(img_size + kern_size)*sizeof(float);
-	void (*f)(float*, float*, float*,
-		  int, int, int, int,
-		  int, int);
+        void (*f)(float*, float*, float*,
+                  int, int, int, int,
+                  int, int);

 #define CONV_PATCH_SPECIAL(kern_wid) \
            if(threads.y==out_len) f=conv_patch<true,kern_wid,false>;\
            else f=conv_patch<true,kern_wid,true>;

-	CONV_PATCH_SPECIAL(THEANO_KERN_WID);
+        CONV_PATCH_SPECIAL(THEANO_KERN_WID);

-	 f<<< grid, threads, shared_size>>>
-	     (img->devdata, kern->devdata, out->devdata,
-	      img_len, img_wid, kern_len, kern_wid, nkern, nstack);
+         f<<< grid, threads, shared_size>>>
+             (img->devdata, kern->devdata, out->devdata,
+              img_len, img_wid, kern_len, kern_wid, nkern, nstack);
        CNDA_THREAD_SYNC;
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
            if (verbose)
-	      fprintf(stderr,
-		      "INFO: used 'conv_patch' version %s nb_split=%d\n",
-		      threads.y==out_len ? "no split": "split", nb_split);
+              fprintf(stderr,
+                      "INFO: used 'conv_patch' version %s nb_split=%d\n",
+                      threads.y==out_len ? "no split": "split", nb_split);
            work_complete = true;
        }
        else
        {
            if (verbose)
-	      fprintf(stderr,
-		      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
-		      " shared_size=%i, nb_threads=%i, nb_split=%i\n",
-		      threads.x, threads.y, grid.x, grid.y,
-		      shared_size, threads.x * threads.y, nb_split);
+              fprintf(stderr,
+                      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
+                      " shared_size=%i, nb_threads=%i, nb_split=%i\n",
+                      threads.x, threads.y, grid.x, grid.y,
+                      shared_size, threads.x * threads.y, nb_split);
            if (verbose)
-	      fprintf(stderr,
-		      "INFO: impl 'conv_patch' failed (%s),"
-		      " trying next implementation\n",
-		      cudaGetErrorString(sts));
+              fprintf(stderr,
+                      "INFO: impl 'conv_patch' failed (%s),"
+                      " trying next implementation\n",
+                      cudaGetErrorString(sts));
        }
    }
    if (out_contiguous &&
-	(version==1||version==3||version==11||version==12||version==-1) &&
-	(version!=1 || out_size<512) &&//Maximum of 512 theads by block
-	out_wid<512 &&//Maximum of 512 theads by block
-	img_size_byte+kern_wid*sizeof(float)<shared_avail && //their is only 16k of shared memory
-	!work_complete) //conv_patch_stack
+        (version==1||version==3||version==11||version==12||version==-1) &&
+        (version!=1 || out_size<512) &&//Maximum of 512 theads by block
+        out_wid<512 &&//Maximum of 512 theads by block
+        img_size_byte+kern_wid*sizeof(float)<shared_avail && //their is only 16k of shared memory
+        !work_complete) //conv_patch_stack
    {
      //version 1 is without split and preload the full kernel
      //version 3 is with split and preload the full kernel
      //version 11 is without split and load only 1 kernel row at a time.
      //version 12 is with split and load only 1 kernel row at a time.
        int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
-	if((version==3||version==12) && out_len>1)nb_split++;//to force the use of split=true when testing.
-	//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
-	while (ceil_intdiv(out_len,nb_split)*out_wid>512) nb_split++;
+        if((version==3||version==12) && out_len>1)nb_split++;//to force the use of split=true when testing.
+        //we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
+        while (ceil_intdiv(out_len,nb_split)*out_wid>512) nb_split++;
        dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));

-	bool preload_full_kernel = (img_size_byte + kern_size_byte) <shared_avail;
-	if(version==11 || version==12) preload_full_kernel=false;
+        bool preload_full_kernel = (img_size_byte + kern_size_byte) <shared_avail;
+        if(version==11 || version==12) preload_full_kernel=false;
        dim3 grid(nbatch,nkern);
        int shared_size=(img_size + (preload_full_kernel?kern_size:kern_wid))*sizeof(float);

-	void (*f)(float*, float*, float*,
-		  int, int, int, int,
-		  int, int, int, int,
-		  int, int, int, int,
-		  int, int, int, int,
-		  int, int);
+        void (*f)(float*, float*, float*,
+                  int, int, int, int,
+                  int, int, int, int,
+                  int, int, int, int,
+                  int, int, int, int,
+                  int, int);

 #define CONV_PATCH_STACK_SPECIAL(kern_wid) \
-	if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true,true>;} \
-	else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true,true>;} \
-	else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true,true>;}\
-	else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true,true>;}\
-	else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true,true>;}\
-	else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true,true>;}\
-	else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true,true>;}\
-	else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true,true>;}\
-	else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false,true>;}\
-	else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false,true>;}\
-	else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false,true>;}\
-	else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false,true>;}\
-	else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false,true>;} \
-	else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false,true>;} \
-	else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false,true>;} \
-	else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false,true>;} \
-	else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true,false>;} \
-	else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true,false>;} \
-	else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true,false>;}\
-	else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true,false>;}\
-	else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true,false>;}\
-	else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true,false>;}\
-	else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true,false>;}\
-	else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true,false>;}\
-	else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false,false>;}\
-	else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false,false>;}\
-	else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false,false>;}\
-	else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false,false>;}\
-	else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false,false>;} \
-	else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false,false>;} \
-	else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false,false>;} \
-	else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false,false>;}
-
-	CONV_PATCH_STACK_SPECIAL(THEANO_KERN_WID);
-	f<<< grid, threads, shared_size>>>
-	     (img->devdata, kern->devdata, out->devdata,
-	      img_len, img_wid, kern_len, kern_wid, 
-	      out_len, out_wid, nkern, nstack,
-	      img_stride_col, img_stride_row, img_stride_stack,
-	      img_stride_batch, kern_stride_col, kern_stride_row,
-	      kern_stride_stack, kern_stride_nkern, subsample_rows, subsample_cols);
+        if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true,true>;} \
+        else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true,true>;} \
+        else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true,true>;}\
+        else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true,true>;}\
+        else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true,true>;}\
+        else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true,true>;}\
+        else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true,true>;}\
+        else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true,true>;}\
+        else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false,true>;}\
+        else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false,true>;}\
+        else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false,true>;}\
+        else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false,true>;}\
+        else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false,true>;} \
+        else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false,true>;} \
+        else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false,true>;} \
+        else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false,true>;} \
+        else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true,false>;} \
+        else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true,false>;} \
+        else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true,false>;}\
+        else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true,false>;}\
+        else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true,false>;}\
+        else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true,false>;}\
+        else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true,false>;}\
+        else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true,false>;}\
+        else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false,false>;}\
+        else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false,false>;}\
+        else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false,false>;}\
+        else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false,false>;}\
+        else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false,false>;} \
+        else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false,false>;} \
+        else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false,false>;} \
+        else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false,false>;}
+
+        CONV_PATCH_STACK_SPECIAL(THEANO_KERN_WID);
+        f<<< grid, threads, shared_size>>>
+             (img->devdata, kern->devdata, out->devdata,
+              img_len, img_wid, kern_len, kern_wid, 
+              out_len, out_wid, nkern, nstack,
+              img_stride_col, img_stride_row, img_stride_stack,
+              img_stride_batch, kern_stride_col, kern_stride_row,
+              kern_stride_stack, kern_stride_nkern, subsample_rows, subsample_cols);

        CNDA_THREAD_SYNC;
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
            if (verbose>1)
-	      fprintf(stderr,
-		      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
-		      " shared_size=%i, nb_threads=%i,"
-		      " kern_flipped=true, accumulate=false, kern_width=%i,"
-		      " img_c_contiguous_2d=%i,"
-		      " kern_c_contiguous_2d=%i, nb_split=%i,"
-		      " preload_full_kernel=%i,",
-		      " subsample_rows=%i, subsample_cols=%i\n",
-		      threads.x, threads.y, grid.x, grid.y,
-		      shared_size, threads.x * threads.y,
-		      THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
-		      nb_split, preload_full_kernel,
-		      subsample_rows, subsample_cols);
+              fprintf(stderr,
+                      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
+                      " shared_size=%i, nb_threads=%i,"
+                      " kern_flipped=true, accumulate=false, kern_width=%i,"
+                      " img_c_contiguous_2d=%i,"
+                      " kern_c_contiguous_2d=%i, nb_split=%i,"
+                      " preload_full_kernel=%i,",
+                      " subsample_rows=%i, subsample_cols=%i\n",
+                      threads.x, threads.y, grid.x, grid.y,
+                      shared_size, threads.x * threads.y,
+                      THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
+                      nb_split, preload_full_kernel,
+                      subsample_rows, subsample_cols);
            if (verbose)
-	      fprintf(stderr,
-		      "INFO: used 'conv_patch_stack' version with nb_split=%i"
-		      " and preload_full_kernel=%i,"
-		      " subsample_rows=%i, subsample_cols=%i\n",
-		      nb_split, preload_full_kernel,
-		      subsample_rows, subsample_cols);
+              fprintf(stderr,
+                      "INFO: used 'conv_patch_stack' version with nb_split=%i"
+                      " and preload_full_kernel=%i,"
+                      " subsample_rows=%i, subsample_cols=%i\n",
+                      nb_split, preload_full_kernel,
+                      subsample_rows, subsample_cols);
            work_complete = true;
        }
        else
        {
            if (verbose)
-	      fprintf(stderr,
-		      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
-		      " shared_size=%i, nb_threads=%i,"
-		      " kern_flipped=true, accumulate=false,"
-		      " kern_width=%i, img_c_contiguous_2d=%i,"
-		      " kern_c_contiguous_2d=%i, nb_split=%i,"
-		      " preload_full_kernel=%i,"
-		      " subsample_rows=%i, subsample_cols=%i\n",
-		      threads.x, threads.y, grid.x, grid.y,
-		      shared_size, threads.x * threads.y,
-		      THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
-		      nb_split, preload_full_kernel,
-		      subsample_rows, subsample_cols);
+              fprintf(stderr,
+                      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
+                      " shared_size=%i, nb_threads=%i,"
+                      " kern_flipped=true, accumulate=false,"
+                      " kern_width=%i, img_c_contiguous_2d=%i,"
+                      " kern_c_contiguous_2d=%i, nb_split=%i,"
+                      " preload_full_kernel=%i,"
+                      " subsample_rows=%i, subsample_cols=%i\n",
+                      threads.x, threads.y, grid.x, grid.y,
+                      shared_size, threads.x * threads.y,
+                      THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
+                      nb_split, preload_full_kernel,
+                      subsample_rows, subsample_cols);
            if (verbose)
-	      fprintf(stderr,
-		      "INFO: impl 'conv_patch_stack' failed (%s),"
-		      " trying next implementation\n",
-		      cudaGetErrorString(sts));
+              fprintf(stderr,
+                      "INFO: impl 'conv_patch_stack' failed (%s),"
+                      " trying next implementation\n",
+                      cudaGetErrorString(sts));
        }
    }

    if (!subsample && out_contiguous &&
-	(version==4||version==-1) &&
-	out_wid<512 &&//Maximum of 512 threads by block
-	nstack == 1 &&// don't implement the stack in the kernel.
-	kern_len*img_wid*sizeof(float)+kern_size_byte<shared_avail &&//their is only 16k of shared memory
-	!work_complete) //conv_rows
+        (version==4||version==-1) &&
+        out_wid<512 &&//Maximum of 512 threads by block
+        nstack == 1 &&// don't implement the stack in the kernel.
+        kern_len*img_wid*sizeof(float)+kern_size_byte<shared_avail &&//their is only 16k of shared memory
+        !work_complete) //conv_rows

    {
        dim3 threads(out_wid);
        dim3 grid(out_len, nbatch*nkern);
        int shared_size=(kern_len*img_wid + kern_size)*sizeof(float);
-	void (*f)(float*, float*, float*,
-		  int, int, int, int,
-		  int, int, int, int,
-		  int, int, int, int,
-		  int, int);
+        void (*f)(float*, float*, float*,
+                  int, int, int, int,
+                  int, int, int, int,
+                  int, int, int, int,
+                  int, int);

 #define CONV_ROWS_SPECIAL(kern_wid) \
-	if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows<kern_wid, false>;\
-	else f = conv_rows<kern_wid, true>;\
-
-	CONV_ROWS_SPECIAL(THEANO_KERN_WID);
-	f<<< grid, threads, shared_size >>>
-	  (img->devdata, kern->devdata, out->devdata,
-	   img_len, img_wid, kern_len, kern_wid, nkern, nstack,
-	   img_stride_col, img_stride_row,
-	   img_stride_stack,img_stride_batch,
-	   kern_stride_col, kern_stride_row,
-	   kern_stride_stack, kern_stride_nkern);
+        if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows<kern_wid, false>;\
+        else f = conv_rows<kern_wid, true>;\
+
+        CONV_ROWS_SPECIAL(THEANO_KERN_WID);
+        f<<< grid, threads, shared_size >>>
+          (img->devdata, kern->devdata, out->devdata,
+           img_len, img_wid, kern_len, kern_wid, nkern, nstack,
+           img_stride_col, img_stride_row,
+           img_stride_stack,img_stride_batch,
+           kern_stride_col, kern_stride_row,
+           kern_stride_stack, kern_stride_nkern);

        CNDA_THREAD_SYNC;
        cudaError_t sts = cudaGetLastError();
@@ -359,58 +359,58 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        {
            work_complete = true;
            if (verbose)
-	      fprintf(stderr, "INFO: used 'conv_rows' version\n");
+              fprintf(stderr, "INFO: used 'conv_rows' version\n");
        }
        else
        {
            if (verbose)
-	      fprintf(stderr,
-		      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
-		      " shared_size=%i, nb_threads=%i\n",
-		      threads.x, threads.y, grid.x, grid.y,
-		      shared_size, threads.x * threads.y);
+              fprintf(stderr,
+                      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
+                      " shared_size=%i, nb_threads=%i\n",
+                      threads.x, threads.y, grid.x, grid.y,
+                      shared_size, threads.x * threads.y);
            if (verbose)
-	      fprintf(stderr,
-		      "INFO: impl 'conv_rows' failed (%s),"
-		      " trying next implementation\n",
-		      cudaGetErrorString(sts));
+              fprintf(stderr,
+                      "INFO: impl 'conv_rows' failed (%s),"
+                      " trying next implementation\n",
+                      cudaGetErrorString(sts));
        }
    }
    if (!subsample && out_contiguous &&
-	(version==5||version==-1) &&
-	out_wid<512 &&//Maximum of 512 theads by block
-	img_wid*kern_len*sizeof(float)+kern_size_byte<shared_avail && //their is only 16k of shared memory
-	!work_complete) //conv_rows_stack
+        (version==5||version==-1) &&
+        out_wid<512 &&//Maximum of 512 theads by block
+        img_wid*kern_len*sizeof(float)+kern_size_byte<shared_avail && //their is only 16k of shared memory
+        !work_complete) //conv_rows_stack

    {
-	int nb_row=1;
-	int max_threads=512;
-	//TODO:if not c_contiguous, lower max_thread as we use 22
-	//registers by thread and we won't execute 2 block in one MP.
-	for(int i=2;i<=out_len;i++){
-	  if((i)*out_wid<max_threads && ((kern_len+i)*img_wid + kern_size)*sizeof(float)<shared_avail)
-	    nb_row=i;
-	}
+        int nb_row=1;
+        int max_threads=512;
+        //TODO:if not c_contiguous, lower max_thread as we use 22
+        //registers by thread and we won't execute 2 block in one MP.
+        for(int i=2;i<=out_len;i++){
+          if((i)*out_wid<max_threads && ((kern_len+i)*img_wid + kern_size)*sizeof(float)<shared_avail)
+            nb_row=i;
+        }

        dim3 threads(out_wid,nb_row);
        dim3 grid(ceil_intdiv(out_len,nb_row), nbatch*nkern);

        int shared_size=((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);

-	void (*f)(float*, float*, float*,
-		  int, int, int, int,
-		  int, int, int, int,
-		  int, int, int, int,
-		  int, int);
+        void (*f)(float*, float*, float*,
+                  int, int, int, int,
+                  int, int, int, int,
+                  int, int, int, int,
+                  int, int);

        if (0)
          fprintf(stderr,
-		  "IMG CONTIG %i KERN_CONTIG %i (%i %i %i) (%i %i %i)\n",
-		  img_contiguous_2d, kern_contiguous_2d,
-		  threads.x, threads.y, threads.z,
-		  grid.x, grid.y, grid.z);
+                  "IMG CONTIG %i KERN_CONTIG %i (%i %i %i) (%i %i %i)\n",
+                  img_contiguous_2d, kern_contiguous_2d,
+                  threads.x, threads.y, threads.z,
+                  grid.x, grid.y, grid.z);

-	if(!img_contiguous_2d || !kern_contiguous_2d) {
+        if(!img_contiguous_2d || !kern_contiguous_2d) {
            //fprintf(stderr, "using false version\n");
            f = conv_rows_stack<THEANO_KERN_WID, false>;
        } else {
@@ -418,136 +418,136 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
            f = conv_rows_stack<THEANO_KERN_WID, true>;
        }

-	f<<< grid, threads, shared_size >>>
-	  (img->devdata,
-	   kern->devdata,
-	   out->devdata,
-	   img_len, img_wid, kern_len, kern_wid, nkern, nstack,
-	   img_stride_col, img_stride_row,
-	   img_stride_stack,img_stride_batch,
-	   kern_stride_col, kern_stride_row,
-	   kern_stride_stack, kern_stride_nkern);
+        f<<< grid, threads, shared_size >>>
+          (img->devdata,
+           kern->devdata,
+           out->devdata,
+           img_len, img_wid, kern_len, kern_wid, nkern, nstack,
+           img_stride_col, img_stride_row,
+           img_stride_stack,img_stride_batch,
+           kern_stride_col, kern_stride_row,
+           kern_stride_stack, kern_stride_nkern);

        CNDA_THREAD_SYNC;
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
            work_complete = true;
-	    if (verbose>1)
-	      fprintf(stderr,
-		      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
-		      " shared_size=%i, nb_threads=%i\n",
-		      threads.x, threads.y, grid.x, grid.y,
-		      shared_size, threads.x * threads.y);
+            if (verbose>1)
+              fprintf(stderr,
+                      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
+                      " shared_size=%i, nb_threads=%i\n",
+                      threads.x, threads.y, grid.x, grid.y,
+                      shared_size, threads.x * threads.y);
            if (verbose)
-	      fprintf(stderr, "INFO: used 'conv_rows_stack' version\n");
+              fprintf(stderr, "INFO: used 'conv_rows_stack' version\n");
        }
        else
        {
            if (verbose)
-	      fprintf(stderr,
-		      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
-		      " shared_size=%i, nb_threads=%i\n",
-		      threads.x, threads.y, grid.x, grid.y,
-		      shared_size, threads.x * threads.y);
+              fprintf(stderr,
+                      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
+                      " shared_size=%i, nb_threads=%i\n",
+                      threads.x, threads.y, grid.x, grid.y,
+                      shared_size, threads.x * threads.y);
            if (verbose)
-	      fprintf(stderr,
-		      "INFO: impl 'conv_rows_stack' failed (%s),"
-		      " trying next implementation\n",
-		      cudaGetErrorString(sts));
+              fprintf(stderr,
+                      "INFO: impl 'conv_rows_stack' failed (%s),"
+                      " trying next implementation\n",
+                      cudaGetErrorString(sts));
        }
    }

    if (!subsample && out_contiguous &&
-	(version==9||version==10||version==-1) &&
-	out_wid<512 &&//Maximum of 512 threads by block
-	(img_wid+kern_wid)*sizeof(float)<shared_avail && //their is only 16k of shared memory
-	(version != 9 || (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail) && //version 9 use more memory
-	!work_complete) //conv_rows_stack2
+        (version==9||version==10||version==-1) &&
+        out_wid<512 &&//Maximum of 512 threads by block
+        (img_wid+kern_wid)*sizeof(float)<shared_avail && //their is only 16k of shared memory
+        (version != 9 || (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail) && //version 9 use more memory
+        !work_complete) //conv_rows_stack2

    {
      // version 9:we preload the full kernel
      // version 10: load only a few row at a time.
-	int nb_row=1;
-	int max_threads=512;
-	int version_back = version;
-	//TODO:if not c_contiguous, lower max_thread as we use 22 registers by thread and we won't execute 2 block in one MP.
-	if(version==-1 && (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail)
-	  version = 9;
-	else if(version==-1)version = 10;
-
-	int k_size = kern_size;
-	if(version==10)
-	  k_size=kern_wid;
-
-	for(int i=2;i<=out_len;i++){
-	  if(i*out_wid<max_threads && (i*img_wid + k_size)*sizeof(float)<shared_avail)
-	    nb_row=i;
-	}
-
-	//to test the case when we don't have a thread by output pixel.
-	if((version_back!=-1)&& nb_row>1) nb_row--;
+        int nb_row=1;
+        int max_threads=512;
+        int version_back = version;
+        //TODO:if not c_contiguous, lower max_thread as we use 22 registers by thread and we won't execute 2 block in one MP.
+        if(version==-1 && (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail)
+          version = 9;
+        else if(version==-1)version = 10;
+
+        int k_size = kern_size;
+        if(version==10)
+          k_size=kern_wid;
+
+        for(int i=2;i<=out_len;i++){
+          if(i*out_wid<max_threads && (i*img_wid + k_size)*sizeof(float)<shared_avail)
+            nb_row=i;
+        }
+
+        //to test the case when we don't have a thread by output pixel.
+        if((version_back!=-1)&& nb_row>1) nb_row--;

        dim3 threads(out_wid,nb_row);
        dim3 grid(ceil_intdiv(out_len,nb_row), nbatch*nkern);
-	  
+          
        int shared_size=(threads.y*img_wid + k_size)*sizeof(float);

-	void (*f)(float*, float*, float*,
-		  int, int, int, int,
-		  int, int, int, int,
-		  int, int, int, int,
-		  int, int);
+        void (*f)(float*, float*, float*,
+                  int, int, int, int,
+                  int, int, int, int,
+                  int, int, int, int,
+                  int, int);

 #define CONV_ROWS_STACK2_SPECIAL(kern_wid) \
-	if((!img_contiguous_2d || !kern_contiguous_2d)&&version==9) f = conv_rows_stack2<kern_wid, false,true>;\
-	else if(version==9) f = conv_rows_stack2<kern_wid, true,true>;\
-	else if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_stack2<kern_wid, false, false>;\
-	else f = conv_rows_stack2<kern_wid, true, false>;
-
-	CONV_ROWS_STACK2_SPECIAL(THEANO_KERN_WID);
-
-	f<<< grid, threads, shared_size >>>
-	  (img->devdata,
-	   kern->devdata,
-	   out->devdata,
-	   img_len, img_wid, kern_len, kern_wid, nkern, nstack,
-	   img_stride_col, img_stride_row,
-	   img_stride_stack,img_stride_batch,
-	   kern_stride_col, kern_stride_row,
-	   kern_stride_stack, kern_stride_nkern);
+        if((!img_contiguous_2d || !kern_contiguous_2d)&&version==9) f = conv_rows_stack2<kern_wid, false,true>;\
+        else if(version==9) f = conv_rows_stack2<kern_wid, true,true>;\
+        else if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_stack2<kern_wid, false, false>;\
+        else f = conv_rows_stack2<kern_wid, true, false>;
+
+        CONV_ROWS_STACK2_SPECIAL(THEANO_KERN_WID);
+
+        f<<< grid, threads, shared_size >>>
+          (img->devdata,
+           kern->devdata,
+           out->devdata,
+           img_len, img_wid, kern_len, kern_wid, nkern, nstack,
+           img_stride_col, img_stride_row,
+           img_stride_stack,img_stride_batch,
+           kern_stride_col, kern_stride_row,
+           kern_stride_stack, kern_stride_nkern);

        CNDA_THREAD_SYNC;
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts) 
        {
            work_complete = true;
-	    if (verbose>1)
-	      fprintf(stderr,
-		      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
-		      " shared_size=%i, nb_threads=%i\n",
-		      threads.x, threads.y, grid.x, grid.y,
-		      shared_size, threads.x * threads.y);
+            if (verbose>1)
+              fprintf(stderr,
+                      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
+                      " shared_size=%i, nb_threads=%i\n",
+                      threads.x, threads.y, grid.x, grid.y,
+                      shared_size, threads.x * threads.y);
            if (verbose)
-	      fprintf(stderr,
-		      "INFO: used 'conv_rows_stack2' version %s with"
-		      " %d row(s).\n",
-		      (version==9?"'load full kernel'":
-		       "'load 1 kern row at a time'"),nb_row);
+              fprintf(stderr,
+                      "INFO: used 'conv_rows_stack2' version %s with"
+                      " %d row(s).\n",
+                      (version==9?"'load full kernel'":
+                       "'load 1 kern row at a time'"),nb_row);
        }
        else
        {
            if (verbose)
-	      fprintf(stderr,
-		      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
-		      " shared_size=%i, nb_threads=%i version=%d\n",
-		      threads.x, threads.y, grid.x, grid.y,
-		      shared_size, threads.x * threads.y,(version==9?2:3));
+              fprintf(stderr,
+                      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
+                      " shared_size=%i, nb_threads=%i version=%d\n",
+                      threads.x, threads.y, grid.x, grid.y,
+                      shared_size, threads.x * threads.y,(version==9?2:3));
            if (verbose)
-	      fprintf(stderr,
-		      "INFO: impl 'conv_rows_stack2' failed (%s),"
-		      " trying next implementation\n",
-		      cudaGetErrorString(sts));
+              fprintf(stderr,
+                      "INFO: impl 'conv_rows_stack2' failed (%s),"
+                      " trying next implementation\n",
+                      cudaGetErrorString(sts));
        }
    }

@@ -559,24 +559,24 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
    //version 8 is needed to test more easily this kernel template parameter.
    //version 13 load only 1 kernel row at a time.
    if (!subsample &&
-	out_contiguous &&
-	out_size<512 &&//Maximum of 512 theads by block
-	(version==7||version==8||version==13||version==-1) &&
-	(version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split.
+        out_contiguous &&
+        out_size<512 &&//Maximum of 512 theads by block
+        (version==7||version==8||version==13||version==-1) &&
+        (version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split.
        //version 13 need a minimal kernel length as big as the split.
-	(version!=13||kern_len>1) &&
-	!work_complete) //conv_patch_stack_reduce
+        (version!=13||kern_len>1) &&
+        !work_complete) //conv_patch_stack_reduce
    {
-	int nb_split=1;
-	int full_kern=true;
+        int nb_split=1;
+        int full_kern=true;

-	if(version==8||version==13) nb_split++;//force the split.
-	if(version==13)full_kern=false;
+        if(version==8||version==13) nb_split++;//force the split.
+        if(version==13)full_kern=false;

-	//check if we can fit the full kernel in the shared memory
-	if(sizeof(float)*std::max(img_size + kern_size, out_size*2) > shared_avail){
-	  full_kern = false;
-	}
+        //check if we can fit the full kernel in the shared memory
+        if(sizeof(float)*std::max(img_size + kern_size, out_size*2) > shared_avail){
+          full_kern = false;
+        }

        //thread_z is going to be ceil_intdiv(kern_len, nb_split)
        // we need enough splits so that
@@ -586,24 +586,24 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,

        // constraint (a)
        // device 1.3 have a max of 64 thread in z
-	while(ceil_intdiv(kern_len,nb_split)>64) nb_split++;
+        while(ceil_intdiv(kern_len,nb_split)>64) nb_split++;

        // constraint (b)
        //  (TODO: read the number of threads per block from the device)
-	while(out_size*ceil_intdiv(kern_len,nb_split)>512) nb_split++;
+        while(out_size*ceil_intdiv(kern_len,nb_split)>512) nb_split++;

        // tentative estimates (prior to contraint c)
-	int thread_z=ceil_intdiv(kern_len,nb_split);
-	int shared_size = sizeof(float)*(full_kern
+        int thread_z=ceil_intdiv(kern_len,nb_split);
+        int shared_size = sizeof(float)*(full_kern
                ? std::max(img_size + kern_size, out_size*thread_z)
                : std::max(img_size + thread_z*kern_wid, out_size*thread_z));

        // constraint (c)
        while ((shared_size >= shared_avail) && (nb_split <= kern_len)){
-	    //if we can't fit the kernel in shared memory, we must split it more.
+            //if we can't fit the kernel in shared memory, we must split it more.
            nb_split++;
            thread_z=ceil_intdiv(kern_len,nb_split);
-	    shared_size = sizeof(float)*(full_kern
+            shared_size = sizeof(float)*(full_kern
                ? std::max(img_size + kern_size, out_size*thread_z)
                : std::max(img_size + thread_z*kern_wid, out_size*thread_z));
        }
@@ -647,17 +647,17 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
            CONV_PATCH_STACK_REDUCE_SPECIAL(THEANO_KERN_WID);

            if (verbose)
-	      fprintf(stderr,
-		      "INFO: using 'conv_patch_stack_reduce' version"
-		      " kern_flipped=%i ccontig=%i nb_split=%d,"
-		      " preload_full_kern=%d\n",
-		      kern_flipped, ccontig, nb_split, full_kern);
+              fprintf(stderr,
+                      "INFO: using 'conv_patch_stack_reduce' version"
+                      " kern_flipped=%i ccontig=%i nb_split=%d,"
+                      " preload_full_kern=%d\n",
+                      kern_flipped, ccontig, nb_split, full_kern);
            if (verbose>1)
-	      fprintf(stderr,
-		      "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i,"
-		      " grid.y=%i, shared_size=%i, nb_threads=%i\n",
-		      threads.x, threads.y, threads.z, grid.x, grid.y,
-		      shared_size, threads.x * threads.y * threads.z);
+              fprintf(stderr,
+                      "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i,"
+                      " grid.y=%i, shared_size=%i, nb_threads=%i\n",
+                      threads.x, threads.y, threads.z, grid.x, grid.y,
+                      shared_size, threads.x * threads.y * threads.z);
            f<<< grid, threads, shared_size>>>(img->devdata, kern_data_unflipped, out->devdata,
                                               img_len, img_wid, kern_len, kern_wid,
                                               nkern, nstack,
@@ -673,35 +673,35 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
            else
            {
                if (verbose)
-		  fprintf(stderr,
-			  "threads.x=%i, threads.y=%i, threads.z=%i,"
-			  " grid.x=%i, grid.y=%i,shared_size=%i,"
-			  " nb_threads=%i\n",
-			  threads.x, threads.y, threads.z,
-			  grid.x, grid.y, shared_size,
-			  threads.x * threads.y * threads.z);
+                  fprintf(stderr,
+                          "threads.x=%i, threads.y=%i, threads.z=%i,"
+                          " grid.x=%i, grid.y=%i,shared_size=%i,"
+                          " nb_threads=%i\n",
+                          threads.x, threads.y, threads.z,
+                          grid.x, grid.y, shared_size,
+                          threads.x * threads.y * threads.z);
                if (verbose)
-		  fprintf(stderr,
-			  "INFO: impl 'conv_patch_stack_reduce' failed (%s),"
-			  " trying next implementation\n",
-			  cudaGetErrorString(sts));
+                  fprintf(stderr,
+                          "INFO: impl 'conv_patch_stack_reduce' failed (%s),"
+                          " trying next implementation\n",
+                          cudaGetErrorString(sts));
            }
        } // else no good nb_splits was found
    }

    if (1 && (version==6||version==-1) &&
-	kern_len<=320 &&
-	!work_complete) //conv_valid_row_reduce
+        kern_len<=320 &&
+        !work_complete) //conv_valid_row_reduce
    {
        int outsize = CudaNdarray_SIZE(out);
        int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS);

-	int block_nstack=nstack;
-	//Max of 512 threads per blocks.
-	//On old hardware, we have a max of 356 threads as we have only 
-	//8k registers and the kernel use 23 register
-	//TODO: check if we have 8k or 16k of register...
-	while(block_nstack*kern_len>320)block_nstack--;
+        int block_nstack=nstack;
+        //Max of 512 threads per blocks.
+        //On old hardware, we have a max of 356 threads as we have only 
+        //8k registers and the kernel use 23 register
+        //TODO: check if we have 8k or 16k of register...
+        while(block_nstack*kern_len>320)block_nstack--;
        dim3 n_threads(block_nstack, kern_len, 1);

        int n_reduce_buf = block_nstack * kern_len * sizeof(float);
@@ -722,21 +722,21 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        }


-	void (*f)(int, int, int, int,
-		  int, int, int, int, int,
-		  float*, int, int, int, int,
-		  float*, int, int, int, int,
-		  float*, int, int, int, int,
-		  int, int, int);
+        void (*f)(int, int, int, int,
+                  int, int, int, int, int,
+                  float*, int, int, int, int,
+                  float*, int, int, int, int,
+                  float*, int, int, int, int,
+                  int, int, int);

        //std::cerr << "initial_reduce_boundary " << initial_reduce_boundary << "\n";
        //std::cerr << "kerns " << nstack << " " << kern_len << "\n";
        //std::cerr << "n_reduce_buf/sizeof(float) " << n_reduce_buf / sizeof(float) << "\n";
-	if(block_nstack==nstack)
-	  f=conv_valid_row_reduce<false>;
-	else
-	  f=conv_valid_row_reduce<true>;
-	f<<<n_blocks, n_threads, n_reduce_buf>>>(
+        if(block_nstack==nstack)
+          f=conv_valid_row_reduce<false>;
+        else
+          f=conv_valid_row_reduce<true>;
+        f<<<n_blocks, n_threads, n_reduce_buf>>>(
                nbatch, nkern, CudaNdarray_HOST_DIMS(img)[1],
                img_len, img_wid,
                kern_len, kern_wid,
@@ -759,21 +759,21 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        {
            work_complete = true;
            if (verbose)
-	      fprintf(stderr, "INFO: used 'conv_valid_row_reduce' version\n");
+              fprintf(stderr, "INFO: used 'conv_valid_row_reduce' version\n");
        }
        else
        {
            if (verbose)
-	      fprintf(stderr,
-		      "threads.x=%i, threads.y=%i, grid.x=%i,"
-		      " shared_size=%i, nb_threads=%i\n",
-		      n_threads.x, n_threads.y, n_blocks,
-		      n_reduce_buf, n_threads.x * n_threads.y);
+              fprintf(stderr,
+                      "threads.x=%i, threads.y=%i, grid.x=%i,"
+                      " shared_size=%i, nb_threads=%i\n",
+                      n_threads.x, n_threads.y, n_blocks,
+                      n_reduce_buf, n_threads.x * n_threads.y);
            if (verbose)
-	      fprintf(stderr,
-		      "INFO: impl 'conv_valid_row_reduce' failed (%s),"
-		      " trying next implementation\n",
-		      cudaGetErrorString(sts));
+              fprintf(stderr,
+                      "INFO: impl 'conv_valid_row_reduce' failed (%s),"
+                      " trying next implementation\n",
+                      cudaGetErrorString(sts));
        }
    }

@@ -782,47 +782,47 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        int outsize = CudaNdarray_SIZE(out);
        int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS);
        int n_threads = std::min(ceil_intdiv(outsize, n_blocks),
-				 NUM_VECTOR_OP_THREADS_PER_BLOCK);
+                                 NUM_VECTOR_OP_THREADS_PER_BLOCK);
        if (1)
        {
            if (verbose)
-	      fprintf(stderr, "INFO: launching conv_reference_valid\n");
+              fprintf(stderr, "INFO: launching conv_reference_valid\n");
            if (verbose>1)
-	      fprintf(stderr, "      img : %i %i %i %i %p  %i %i %i %i\n",
-		      nbatch, CudaNdarray_HOST_DIMS(img)[1], img_len, img_wid,
-		      img->devdata,
-		      CudaNdarray_HOST_STRIDES(img)[0],
-		      CudaNdarray_HOST_STRIDES(img)[1],
-		      CudaNdarray_HOST_STRIDES(img)[2],
-		      CudaNdarray_HOST_STRIDES(img)[3]);
+              fprintf(stderr, "      img : %i %i %i %i %p  %i %i %i %i\n",
+                      nbatch, CudaNdarray_HOST_DIMS(img)[1], img_len, img_wid,
+                      img->devdata,
+                      CudaNdarray_HOST_STRIDES(img)[0],
+                      CudaNdarray_HOST_STRIDES(img)[1],
+                      CudaNdarray_HOST_STRIDES(img)[2],
+                      CudaNdarray_HOST_STRIDES(img)[3]);
            if (verbose>1)
-	      fprintf(stderr, "      kern: %i %i %i %i %p  %i %i %i %i\n",
-		      nkern, nstack, kern_len, kern_wid,
-		      kern->devdata,
-		      CudaNdarray_HOST_STRIDES(kern)[0],
-		      CudaNdarray_HOST_STRIDES(kern)[1],
-		      CudaNdarray_HOST_STRIDES(kern)[2],
-		      CudaNdarray_HOST_STRIDES(kern)[3]);
+              fprintf(stderr, "      kern: %i %i %i %i %p  %i %i %i %i\n",
+                      nkern, nstack, kern_len, kern_wid,
+                      kern->devdata,
+                      CudaNdarray_HOST_STRIDES(kern)[0],
+                      CudaNdarray_HOST_STRIDES(kern)[1],
+                      CudaNdarray_HOST_STRIDES(kern)[2],
+                      CudaNdarray_HOST_STRIDES(kern)[3]);
            if (verbose>1)
-	      fprintf(stderr, "      out : %i %i %i %i %p  %i %i %i %i\n",
-		      CudaNdarray_HOST_DIMS(out)[0],
-		      CudaNdarray_HOST_DIMS(out)[1], out_len, out_wid,
-		      out->devdata,
-		      CudaNdarray_HOST_STRIDES(out)[0],
-		      CudaNdarray_HOST_STRIDES(out)[1],
-		      CudaNdarray_HOST_STRIDES(out)[2],
-		      CudaNdarray_HOST_STRIDES(out)[3]);
+              fprintf(stderr, "      out : %i %i %i %i %p  %i %i %i %i\n",
+                      CudaNdarray_HOST_DIMS(out)[0],
+                      CudaNdarray_HOST_DIMS(out)[1], out_len, out_wid,
+                      out->devdata,
+                      CudaNdarray_HOST_STRIDES(out)[0],
+                      CudaNdarray_HOST_STRIDES(out)[1],
+                      CudaNdarray_HOST_STRIDES(out)[2],
+                      CudaNdarray_HOST_STRIDES(out)[3]);
            if (verbose>1)
-	      fprintf(stderr, "   launch params: %i %i %i\n",
-		      outsize, n_blocks, n_threads);
+              fprintf(stderr, "   launch params: %i %i %i\n",
+                      outsize, n_blocks, n_threads);
        }
        conv_reference_valid<<<n_blocks, n_threads>>>(nbatch, nkern,
-		CudaNdarray_HOST_DIMS(img)[1],
+                CudaNdarray_HOST_DIMS(img)[1],
                img_len, img_wid,
                kern_len, kern_wid,
                out_len, out_wid,
                img->devdata,
-		CudaNdarray_HOST_STRIDES(img)[0],
+                CudaNdarray_HOST_STRIDES(img)[0],
                CudaNdarray_HOST_STRIDES(img)[1],
                CudaNdarray_HOST_STRIDES(img)[2],
                CudaNdarray_HOST_STRIDES(img)[3],
@@ -834,7 +834,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
                out->devdata,
                CudaNdarray_HOST_STRIDES(out)[0],
                CudaNdarray_HOST_STRIDES(out)[1],
-		CudaNdarray_HOST_STRIDES(out)[2],
+                CudaNdarray_HOST_STRIDES(out)[2],
                CudaNdarray_HOST_STRIDES(out)[3],
                subsample_rows, subsample_cols);
        CNDA_THREAD_SYNC;
@@ -844,33 +844,33 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
        {
            work_complete = true;
            if (verbose)
-	      fprintf(stderr, "INFO: used 'conv_reference_valid' version\n");
+              fprintf(stderr, "INFO: used 'conv_reference_valid' version\n");
        }
        else
        {
            PyErr_Format(PyExc_RuntimeError,
-			 "ERROR: all implementations failed for"
-			 " CudaNdarray_conv_valid! (%s)",
-			 cudaGetErrorString(sts));
+                         "ERROR: all implementations failed for"
+                         " CudaNdarray_conv_valid! (%s)",
+                         cudaGetErrorString(sts));
            return -1;
        }
    }
    if (!work_complete)
    {
      PyErr_Format(PyExc_RuntimeError,
-		   "ERROR: no implementation(s) worked for"
-		   " CudaNdarray_conv_valid!"
-		   " Version asked(%d) (-1 mean use an heuristic)",
-		   version);
-	return -1;
+                   "ERROR: no implementation(s) worked for"
+                   " CudaNdarray_conv_valid!"
+                   " Version asked(%d) (-1 mean use an heuristic)",
+                   version);
+        return -1;
    }
    return 0;
 }

 int
 CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
-		      CudaNdarray * out, int subsample_rows,
-		      int subsample_cols, int version = -1, int verbose=0)
+                      CudaNdarray * out, int subsample_rows,
+                      int subsample_cols, int version = -1, int verbose=0)
 {
  //144 is the biggest static shared size used with compiling this file.
    const int shared_avail = SHARED_SIZE - 150;
@@ -932,12 +932,12 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
    //const int out_size_byte = out_size*sizeof(float); // unused 

    if (!((THEANO_KERN_WID == CudaNdarray_HOST_DIMS(kern)[3]) ||
-	  (THEANO_KERN_WID == 0))){
+          (THEANO_KERN_WID == 0))){
      PyErr_Format(PyExc_ValueError,
-		   "ERROR: This GpuConv code was compiled for"
-		   " %d kernel columns, but the kernel we received"
-		   " had %d columns!",
-		   THEANO_KERN_WID, CudaNdarray_HOST_DIMS(kern)[3]);
+                   "ERROR: This GpuConv code was compiled for"
+                   " %d kernel columns, but the kernel we received"
+                   " had %d columns!",
+                   THEANO_KERN_WID, CudaNdarray_HOST_DIMS(kern)[3]);
      return -1;
    }
    bool subsample = subsample_rows!=1 || subsample_cols!=1;
@@ -974,163 +974,163 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
    if (verbose>1)
    {
        printf("INFO: Running conv_full version=%d,"
-	       " MACRO kern_width=%d with inputs:\n", version, THEANO_KERN_WID);
+               " MACRO kern_width=%d with inputs:\n", version, THEANO_KERN_WID);
        printf("INFO:   img  dim: %i %i %i %i  img  stride: %i %i %i %i\n", 
-	       CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],
-	       CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3],
-	       CudaNdarray_HOST_STRIDES(img)[0],
-	       CudaNdarray_HOST_STRIDES(img)[1],
-	       CudaNdarray_HOST_STRIDES(img)[2],
-	       CudaNdarray_HOST_STRIDES(img)[3]);
+               CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],
+               CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3],
+               CudaNdarray_HOST_STRIDES(img)[0],
+               CudaNdarray_HOST_STRIDES(img)[1],
+               CudaNdarray_HOST_STRIDES(img)[2],
+               CudaNdarray_HOST_STRIDES(img)[3]);
        printf("INFO:   kern dim: %i %i %i %i  kern stride: %i %i %i %i\n",
-	       CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],
-	       CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
-	       CudaNdarray_HOST_STRIDES(kern)[0],
-	       CudaNdarray_HOST_STRIDES(kern)[1],
-	       CudaNdarray_HOST_STRIDES(kern)[2],
-	       CudaNdarray_HOST_STRIDES(kern)[3]);
+               CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],
+               CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
+               CudaNdarray_HOST_STRIDES(kern)[0],
+               CudaNdarray_HOST_STRIDES(kern)[1],
+               CudaNdarray_HOST_STRIDES(kern)[2],
+               CudaNdarray_HOST_STRIDES(kern)[3]);
    }

    if (!subsample &&
-	out_contiguous &&
-	(version==3||version==4||version==5||version==-1) &&
-	out_wid<512 &&//Maximum of 512 threads by block
-	(kern_len+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte<shared_avail && //their is only 16k of shared memory
-	!work_complete) //conv_full_patch_stack_padded
+        out_contiguous &&
+        (version==3||version==4||version==5||version==-1) &&
+        out_wid<512 &&//Maximum of 512 threads by block
+        (kern_len+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte<shared_avail && //their is only 16k of shared memory
+        !work_complete) //conv_full_patch_stack_padded
    {
      //version 3 without split
      //version 4 with split (more registers)
      //version 5 with split (more registers) low mem version(some restriction and still more register)
        int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
-	if((version==4 || version==5) && out_len>1) nb_split++;//to force the use of split=true when testing.
-	if(kern_len==1 && version==5){
-	  //version 5 don't support kern_len==1 as 1%0 return -1.
-	  version=-1;
-	  if(verbose)fprintf(stderr, "WARNING:conv full: Asking version 5 with kern_len==1. Combination not supported!\n");
-	}
-	if(img_size_padded_byte+kern_size_byte>shared_avail) version=5;
-
-	//we pass by ceil_intdiv in case the out_len is not a multiple
-	//of nb_split, we want nb_split the number of iteration.
-	//Max of 16k of shared memory
-	if(version==5)
-	  while ((((kern_len+ceil_intdiv(out_len,nb_split)-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte)>shared_avail) nb_split++;
-	
-	//327 as we use 25 register
-	//version 5 will have only 1 block running at a time, so we
-	//can use 32 registers per threads, but their is some other stuff that
-	//for the limit to bu lower then 512.
-	int max_thread = (version!=5?327:450);
-	while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++;
-	if(version==-1 && out_size>512)version=4;
-	if(version==-1)version=3;
-
-
-	if(version==-1 && nb_split>1) version=4;
-	else if(version==-1) version=3;
-	//force version 4 when more than 1 split are needed to always execute.
-	else if(version==3 && nb_split!=1) version=4;
-
-	assert(version!=3 || nb_split==1);
-	assert(version!=5 || kern_len>1);
-	assert(version!=-1);
+        if((version==4 || version==5) && out_len>1) nb_split++;//to force the use of split=true when testing.
+        if(kern_len==1 && version==5){
+          //version 5 don't support kern_len==1 as 1%0 return -1.
+          version=-1;
+          if(verbose)fprintf(stderr, "WARNING:conv full: Asking version 5 with kern_len==1. Combination not supported!\n");
+        }
+        if(img_size_padded_byte+kern_size_byte>shared_avail) version=5;
+
+        //we pass by ceil_intdiv in case the out_len is not a multiple
+        //of nb_split, we want nb_split the number of iteration.
+        //Max of 16k of shared memory
+        if(version==5)
+          while ((((kern_len+ceil_intdiv(out_len,nb_split)-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte)>shared_avail) nb_split++;
+        
+        //327 as we use 25 register
+        //version 5 will have only 1 block running at a time, so we
+        //can use 32 registers per threads, but their is some other stuff that
+        //for the limit to bu lower then 512.
+        int max_thread = (version!=5?327:450);
+        while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++;
+        if(version==-1 && out_size>512)version=4;
+        if(version==-1)version=3;
+
+
+        if(version==-1 && nb_split>1) version=4;
+        else if(version==-1) version=3;
+        //force version 4 when more than 1 split are needed to always execute.
+        else if(version==3 && nb_split!=1) version=4;
+
+        assert(version!=3 || nb_split==1);
+        assert(version!=5 || kern_len>1);
+        assert(version!=-1);

        dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
        dim3 grid(nbatch,nkern);

-	int shared_size=img_size_padded_byte + kern_size_byte;
-	if(version==5)
-	  shared_size=((kern_len+threads.y-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte;
-	void (*f)(float*, float*, float*,
-		  int, int, int, int,
-		  int, int, int, int,
-		  int, int, int, int,
-		  int, int);
+        int shared_size=img_size_padded_byte + kern_size_byte;
+        if(version==5)
+          shared_size=((kern_len+threads.y-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte;
+        void (*f)(float*, float*, float*,
+                  int, int, int, int,
+                  int, int, int, int,
+                  int, int, int, int,
+                  int, int);

 #define CONV_FULL_PATCH_STACK_PADDED_SPECIAL(kern_wid) \
             if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,false,false>;\
-	else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,true,false>;\
-	else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,false,true>;\
-	else if(version==3 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,false,false,false>;\
-	else if(version==4 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,true,false>;\
-	else if(version==5 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,false,true>;\
-	else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3) f=conv_full_patch_stack_padded<false,kern_wid,true,false,false>;\
-	else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4) f=conv_full_patch_stack_padded<false,kern_wid,true,true,false>;\
-	else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5) f=conv_full_patch_stack_padded<false,kern_wid,true,false,true>;\
-	else if(version==3) f=conv_full_patch_stack_padded<false,kern_wid,false,false,false>;\
-	else if(version==4) f=conv_full_patch_stack_padded<false,kern_wid,false,true,false>;\
-	else if(version==5) f=conv_full_patch_stack_padded<false,kern_wid,false,false,true>;\
-	else assert(false);
-
-	CONV_FULL_PATCH_STACK_PADDED_SPECIAL(THEANO_KERN_WID);
-
-	f<<< grid, threads, shared_size>>>
-	     (img->devdata, kern_data_unflipped, out->devdata,
-	      img_len, img_wid, kern_len, kern_wid, nkern, nstack,
-	      img_stride_col, img_stride_row, img_stride_stack,
-	      img_stride_batch, kern_stride_col_unflipped, kern_stride_row_unflipped,
-	      kern_stride_stack, kern_stride_nkern);
+        else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,true,false>;\
+        else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,false,true>;\
+        else if(version==3 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,false,false,false>;\
+        else if(version==4 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,true,false>;\
+        else if(version==5 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,false,true>;\
+        else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3) f=conv_full_patch_stack_padded<false,kern_wid,true,false,false>;\
+        else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4) f=conv_full_patch_stack_padded<false,kern_wid,true,true,false>;\
+        else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5) f=conv_full_patch_stack_padded<false,kern_wid,true,false,true>;\
+        else if(version==3) f=conv_full_patch_stack_padded<false,kern_wid,false,false,false>;\
+        else if(version==4) f=conv_full_patch_stack_padded<false,kern_wid,false,true,false>;\
+        else if(version==5) f=conv_full_patch_stack_padded<false,kern_wid,false,false,true>;\
+        else assert(false);
+
+        CONV_FULL_PATCH_STACK_PADDED_SPECIAL(THEANO_KERN_WID);
+
+        f<<< grid, threads, shared_size>>>
+             (img->devdata, kern_data_unflipped, out->devdata,
+              img_len, img_wid, kern_len, kern_wid, nkern, nstack,
+              img_stride_col, img_stride_row, img_stride_stack,
+              img_stride_batch, kern_stride_col_unflipped, kern_stride_row_unflipped,
+              kern_stride_stack, kern_stride_nkern);

        CNDA_THREAD_SYNC;
        cudaError_t sts = cudaGetLastError();
        if (cudaSuccess == sts)
        {
-	  if (verbose>1)
-	    fprintf(stderr,
-		    "threads.x=%i, threads.y=%i, threads.z=%i,"
-		    " grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i,"
-		    " out_len=%i, nb_split=%i, version=%i\n",
-		    threads.x, threads.y, threads.z,
-		    grid.x, grid.y, shared_size,
-		    threads.x * threads.y * threads.z,
-		    out_len, nb_split, version);
+          if (verbose>1)
+            fprintf(stderr,
+                    "threads.x=%i, threads.y=%i, threads.z=%i,"
+                    " grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i,"
+                    " out_len=%i, nb_split=%i, version=%i\n",
+                    threads.x, threads.y, threads.z,
+                    grid.x, grid.y, shared_size,
+                    threads.x * threads.y * threads.z,
+                    out_len, nb_split, version);
            if (verbose)
-	      fprintf(stderr,
-		      "INFO: used 'conv_full_patch_stack_padded'"
-		      " nb_split=%d low_mem=%s\n",
-		      nb_split, (version==5?"true":"false"));
+              fprintf(stderr,
+                      "INFO: used 'conv_full_patch_stack_padded'"
+                      " nb_split=%d low_mem=%s\n",
+                      nb_split, (version==5?"true":"false"));
            work_complete = true;
        }
        else
        {
-	  if (verbose)
-	    fprintf(stderr,
-		    "threads.x=%i, threads.y=%i, threads.z=%i,"
-		    " grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i,"
-		    " out_len=%i, nb_split=%i, version=%i\n",
-		    threads.x, threads.y, threads.z,
-		    grid.x, grid.y, shared_size,
-		    threads.x * threads.y * threads.z,
-		    out_len, nb_split, version);
-	  if (verbose)
-	    fprintf(stderr,
-		    "INFO: impl 'conv_full_patch_stack_padded' %s %s"
-		    " failed (%s), trying next implementation\n",
-		    version==3?"no split": "split",
-		    (version==5?"low_mem":"not_low_mem"),
-		    cudaGetErrorString(sts));
+          if (verbose)
+            fprintf(stderr,
+                    "threads.x=%i, threads.y=%i, threads.z=%i,"
+                    " grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i,"
+                    " out_len=%i, nb_split=%i, version=%i\n",
+                    threads.x, threads.y, threads.z,
+                    grid.x, grid.y, shared_size,
+                    threads.x * threads.y * threads.z,
+                    out_len, nb_split, version);
+          if (verbose)
+            fprintf(stderr,
+                    "INFO: impl 'conv_full_patch_stack_padded' %s %s"
+                    " failed (%s), trying next implementation\n",
+                    version==3?"no split": "split",
+                    (version==5?"low_mem":"not_low_mem"),
+                    cudaGetErrorString(sts));
        }                         
    }

    if (!subsample && c_contiguous &&
-	(version==0||version==-1) &&
-	out_size<512 &&//Maximum of 512 theads by block
-	nstack == 1 &&// don't implement the stack in the kernel.
-	img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
-	!work_complete) //conv_full_patch
+        (version==0||version==-1) &&
+        out_size<512 &&//Maximum of 512 theads by block
+        nstack == 1 &&// don't implement the stack in the kernel.
+        img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
+        !work_complete) //conv_full_patch
    {
        dim3 threads(out_wid, out_len);
        dim3 grid(nbatch,nkern);
        int shared_size=(img_size + kern_size)*sizeof(float);
-	//TODO assert c_continious for img, kern and out in the 2 inner dimensions.
+        //TODO assert c_continious for img, kern and out in the 2 inner dimensions.

-	conv_full_patch<<< grid, threads, shared_size>>>
-	  (img->devdata,
-	   kern->devdata,
-	   out->devdata,
-	   img_len, img_wid,
-	   kern_len, kern_wid,
-	   nkern, nstack);
+        conv_full_patch<<< grid, threads, shared_size>>>
+          (img->devdata,
+           kern->devdata,
+           out->devdata,
+           img_len, img_wid,
+           kern_len, kern_wid,
+           nkern, nstack);

        CNDA_THREAD_SYNC;
        cudaError_t sts = cudaGetLastError();
@@ -1142,44 +1142,44 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
        else
        {
            if (verbose)
-	      fprintf(stderr,
-		      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
-		      " shared_size=%i, nb_threads=%i\n",
-		      threads.x, threads.y, grid.x, grid.y, shared_size,
-		      threads.x * threads.y);
+              fprintf(stderr,
+                      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
+                      " shared_size=%i, nb_threads=%i\n",
+                      threads.x, threads.y, grid.x, grid.y, shared_size,
+                      threads.x * threads.y);
            if (verbose)
-	      fprintf(stderr,
-		      "INFO: impl 'conv_full_patch' failed (%s),"
-		      " trying next implementation\n",
-		      cudaGetErrorString(sts));
+              fprintf(stderr,
+                      "INFO: impl 'conv_full_patch' failed (%s),"
+                      " trying next implementation\n",
+                      cudaGetErrorString(sts));
        }                         
    }
    if (false && !subsample && //disabled as test fail for this kernel
-	(version==1||version==-1) &&
-	out_size<512 &&//Maximum of 512 theads by block
+        (version==1||version==-1) &&
+        out_size<512 &&//Maximum of 512 theads by block
        (nbatch > 20 || version==1) &&  // we only launch nbatch blocks, so make sure there is enough to be worth it, but if we specify the version, this check should not be done to allow testing.
-	nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory
-	!work_complete) //conv_full_load_everything
+        nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory
+        !work_complete) //conv_full_load_everything
    {
        dim3 threads(out_wid, out_len);
        dim3 grid(nbatch);
        int shared_size=(img_size + kern_size)*nstack*sizeof(float);
-	//TODO assert c_continious for img, kern and out in the 2 inner dimensions.
+        //TODO assert c_continious for img, kern and out in the 2 inner dimensions.

        //typeof(conv_full_load_everything<0>) f = ;
-	void (*f)(float*, float*, float*,
-		  int, int, int, int, int, int,
-		  int, int, int, int, int, int, int, int) = conv_full_load_everything<0>;
-
-	f = conv_full_load_everything<THEANO_KERN_WID>;
-
-	f<<< grid, threads, shared_size>>>
-	  (img->devdata,
-	   kern->devdata,
-	   out->devdata,
-	   img_len, img_wid, 
-	   kern_len, kern_wid,
-	   nkern, nstack,
+        void (*f)(float*, float*, float*,
+                  int, int, int, int, int, int,
+                  int, int, int, int, int, int, int, int) = conv_full_load_everything<0>;
+
+        f = conv_full_load_everything<THEANO_KERN_WID>;
+
+        f<<< grid, threads, shared_size>>>
+          (img->devdata,
+           kern->devdata,
+           out->devdata,
+           img_len, img_wid, 
+           kern_len, kern_wid,
+           nkern, nstack,
           CudaNdarray_HOST_STRIDES(img)[3],
           CudaNdarray_HOST_STRIDES(img)[2],
           CudaNdarray_HOST_STRIDES(img)[1],
@@ -1200,34 +1200,34 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
        else
        {
            if (verbose)
-	      fprintf(stderr,
-		      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
-		      " shared_size=%i, nb_threads=%i\n",
-		      threads.x, threads.y, grid.x, grid.y, shared_size,
-		      threads.x * threads.y);
+              fprintf(stderr,
+                      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
+                      " shared_size=%i, nb_threads=%i\n",
+                      threads.x, threads.y, grid.x, grid.y, shared_size,
+                      threads.x * threads.y);
            if (verbose)
-	      fprintf(stderr, "INFO: impl 'conv_full_load_everything'"
-		      " failed (%s), trying next implementation\n",
-		      cudaGetErrorString(sts));
+              fprintf(stderr, "INFO: impl 'conv_full_load_everything'"
+                      " failed (%s), trying next implementation\n",
+                      cudaGetErrorString(sts));
        }
    }

    if (!subsample &&
-	img_batch_stack_contiguous &&
-	out_contiguous &&
-	(version==2||version==-1) &&
-	out_size<512 &&//Maximum of 512 theads by block
-	img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
-	!work_complete) //conv_full_patch_stack
+        img_batch_stack_contiguous &&
+        out_contiguous &&
+        (version==2||version==-1) &&
+        out_size<512 &&//Maximum of 512 theads by block
+        img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
+        !work_complete) //conv_full_patch_stack
    {
        dim3 threads(out_wid, out_len);
        dim3 grid(nbatch,nkern);
        int shared_size=(img_size + kern_size)*sizeof(float);

-	void (*f)(float*, float*, float*,
-		  int, int, int, int,
-		  int, int, int, int,
-		  int, int, int, int);
+        void (*f)(float*, float*, float*,
+                  int, int, int, int,
+                  int, int, int, int,
+                  int, int, int, int);

        if(img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack<true,true>;\
        else if(img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack<true,false>;\
@@ -1248,20 +1248,20 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
        if (cudaSuccess == sts) 
        {
            if (verbose)
-	      fprintf(stderr, "INFO: used 'conv_full_patch_stack' version\n");
+              fprintf(stderr, "INFO: used 'conv_full_patch_stack' version\n");
            work_complete = true;
        }
        else
        {
            if (verbose)
-	      fprintf(stderr,
-		      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
-		      " shared_size=%i, nb_threads=%i\n",
-		      threads.x, threads.y, grid.x, grid.y,
-		      shared_size, threads.x * threads.y);
+              fprintf(stderr,
+                      "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
+                      " shared_size=%i, nb_threads=%i\n",
+                      threads.x, threads.y, grid.x, grid.y,
+                      shared_size, threads.x * threads.y);
            if (verbose)
-	      fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n",
-		      cudaGetErrorString(sts));
+              fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n",
+                      cudaGetErrorString(sts));
        }                         
    }
    if (1 && !work_complete) //conv_reference_full
@@ -1271,60 +1271,60 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
        int outsize = CudaNdarray_SIZE(out);
        int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS);
        int n_threads = std::min(ceil_intdiv(outsize, n_blocks),
-				 NUM_VECTOR_OP_THREADS_PER_BLOCK);
+                                 NUM_VECTOR_OP_THREADS_PER_BLOCK);
        if (0)
        {
            if (verbose)
-	      fprintf(stderr, "INFO: launching conv_reference_valid\n");
+              fprintf(stderr, "INFO: launching conv_reference_valid\n");
            if (verbose)
-	      fprintf(stderr, "      img : %i %i %i %i %p  %i %i %i %i\n",
-		      CudaNdarray_HOST_DIMS(img)[0],
-		      CudaNdarray_HOST_DIMS(img)[1],
-		      CudaNdarray_HOST_DIMS(img)[2],
-		      CudaNdarray_HOST_DIMS(img)[3],
-		      img->devdata,
-		      CudaNdarray_HOST_STRIDES(img)[0],
-		      CudaNdarray_HOST_STRIDES(img)[1],
-		      CudaNdarray_HOST_STRIDES(img)[2],
-		      CudaNdarray_HOST_STRIDES(img)[3]);
+              fprintf(stderr, "      img : %i %i %i %i %p  %i %i %i %i\n",
+                      CudaNdarray_HOST_DIMS(img)[0],
+                      CudaNdarray_HOST_DIMS(img)[1],
+                      CudaNdarray_HOST_DIMS(img)[2],
+                      CudaNdarray_HOST_DIMS(img)[3],
+                      img->devdata,
+                      CudaNdarray_HOST_STRIDES(img)[0],
+                      CudaNdarray_HOST_STRIDES(img)[1],
+                      CudaNdarray_HOST_STRIDES(img)[2],
+                      CudaNdarray_HOST_STRIDES(img)[3]);
            if (verbose)
-	      fprintf(stderr, "      kern: %i %i %i %i %p  %i %i %i %i\n",
-		      CudaNdarray_HOST_DIMS(kern)[0],
-		      CudaNdarray_HOST_DIMS(kern)[1],
-		      CudaNdarray_HOST_DIMS(kern)[2],
-		      CudaNdarray_HOST_DIMS(kern)[3],
-		      kern->devdata,
-		      CudaNdarray_HOST_STRIDES(kern)[0],
-		      CudaNdarray_HOST_STRIDES(kern)[1],
-		      CudaNdarray_HOST_STRIDES(kern)[2],
-		      CudaNdarray_HOST_STRIDES(kern)[3]
+              fprintf(stderr, "      kern: %i %i %i %i %p  %i %i %i %i\n",
+                      CudaNdarray_HOST_DIMS(kern)[0],
+                      CudaNdarray_HOST_DIMS(kern)[1],
+                      CudaNdarray_HOST_DIMS(kern)[2],
+                      CudaNdarray_HOST_DIMS(kern)[3],
+                      kern->devdata,
+                      CudaNdarray_HOST_STRIDES(kern)[0],
+                      CudaNdarray_HOST_STRIDES(kern)[1],
+                      CudaNdarray_HOST_STRIDES(kern)[2],
+                      CudaNdarray_HOST_STRIDES(kern)[3]
                        );
            if (verbose)
-	      fprintf(stderr, "      out : %i %i %i %i %p  %i %i %i %i\n",
-		      CudaNdarray_HOST_DIMS(out)[0],
-		      CudaNdarray_HOST_DIMS(out)[1],
-		      CudaNdarray_HOST_DIMS(out)[2],
-		      CudaNdarray_HOST_DIMS(out)[3],
-		      out->devdata,
-		      CudaNdarray_HOST_STRIDES(out)[0],
-		      CudaNdarray_HOST_STRIDES(out)[1],
-		      CudaNdarray_HOST_STRIDES(out)[2],
-		      CudaNdarray_HOST_STRIDES(out)[3]);
+              fprintf(stderr, "      out : %i %i %i %i %p  %i %i %i %i\n",
+                      CudaNdarray_HOST_DIMS(out)[0],
+                      CudaNdarray_HOST_DIMS(out)[1],
+                      CudaNdarray_HOST_DIMS(out)[2],
+                      CudaNdarray_HOST_DIMS(out)[3],
+                      out->devdata,
+                      CudaNdarray_HOST_STRIDES(out)[0],
+                      CudaNdarray_HOST_STRIDES(out)[1],
+                      CudaNdarray_HOST_STRIDES(out)[2],
+                      CudaNdarray_HOST_STRIDES(out)[3]);
            if (verbose)
-	      fprintf(stderr, "   launch params: %i %i %i\n",
-		      outsize, n_blocks, n_threads);
+              fprintf(stderr, "   launch params: %i %i %i\n",
+                      outsize, n_blocks, n_threads);
            if (verbose)
-	      fprintf(stderr, "   subsample params: %i %i\n",
-		      subsample_rows, subsample_cols);
+              fprintf(stderr, "   subsample params: %i %i\n",
+                      subsample_rows, subsample_cols);
        }
        conv_reference_full<<<n_blocks, n_threads>>>(
                CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(kern)[0],
-		CudaNdarray_HOST_DIMS(img)[1],
+                CudaNdarray_HOST_DIMS(img)[1],
                CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3],
                CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
                CudaNdarray_HOST_DIMS(out)[2], CudaNdarray_HOST_DIMS(out)[3],
                img->devdata, CudaNdarray_HOST_STRIDES(img)[0],
-		CudaNdarray_HOST_STRIDES(img)[1],
+                CudaNdarray_HOST_STRIDES(img)[1],
                CudaNdarray_HOST_STRIDES(img)[2],
                CudaNdarray_HOST_STRIDES(img)[3],
                kern->devdata, CudaNdarray_HOST_STRIDES(kern)[0],
@@ -1342,28 +1342,28 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
        if (cudaSuccess == sts) 
        {
            if (verbose)
-	      fprintf(stderr, "INFO: used 'conv_reference_full' version"
-		      " ishp(%d, %d) kshp(%d, %d) oshp(%d, %d) nbatch=%d"
-		      " nkern=%d nstack=%d subsample=%d\n",
-		      img_len,img_wid, kern_len, kern_wid,
-		      out_len, out_wid, nbatch, nkern, nstack, subsample);
+              fprintf(stderr, "INFO: used 'conv_reference_full' version"
+                      " ishp(%d, %d) kshp(%d, %d) oshp(%d, %d) nbatch=%d"
+                      " nkern=%d nstack=%d subsample=%d\n",
+                      img_len,img_wid, kern_len, kern_wid,
+                      out_len, out_wid, nbatch, nkern, nstack, subsample);
            work_complete = true;
        }
        else
        {
-	  if (verbose)
-	    fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
-		    " shared_size=%i, nb_threads=%i\n",
-		    n_threads, 1, n_blocks, 1, 0, n_threads);
-	  if (verbose)
-	    fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s),"
-		    " trying next implementation\n",
-		    cudaGetErrorString(sts));
-	  PyErr_Format(PyExc_RuntimeError,
-		       "ERROR: all implementations failed for"
-		       " CudaNdarray_conv_full! (%s)",
-		       cudaGetErrorString(sts));
-	  return -1;
+          if (verbose)
+            fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
+                    " shared_size=%i, nb_threads=%i\n",
+                    n_threads, 1, n_blocks, 1, 0, n_threads);
+          if (verbose)
+            fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s),"
+                    " trying next implementation\n",
+                    cudaGetErrorString(sts));
+          PyErr_Format(PyExc_RuntimeError,
+                       "ERROR: all implementations failed for"
+                       " CudaNdarray_conv_full! (%s)",
+                       cudaGetErrorString(sts));
+          return -1;
        }
    }
    return 0;
@@ -1371,9 +1371,9 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,

 PyObject *
 CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
-		 CudaNdarray * out, const int mode,
-		 const int subsample_rows, const int subsample_cols,
-		 const int version, const int verbose)
+                 CudaNdarray * out, const int mode,
+                 const int subsample_rows, const int subsample_cols,
+                 const int version, const int verbose)
 {
    // Re-use the out object if possible.  If the out object it not used, then its refcount is not modified.
    //  If the out object is re-used then it is returned, and its refcount is incremented by 1.
@@ -1411,10 +1411,10 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
    if ( out
         && out->nd==4
         && CudaNdarray_is_c_contiguous(out)
-	 && CudaNdarray_HOST_DIMS(out)[0]==out_dim[0]
-	 && CudaNdarray_HOST_DIMS(out)[1]==out_dim[1]
-	 && CudaNdarray_HOST_DIMS(out)[2]==out_dim[2]
-	 && CudaNdarray_HOST_DIMS(out)[3]==out_dim[3])
+         && CudaNdarray_HOST_DIMS(out)[0]==out_dim[0]
+         && CudaNdarray_HOST_DIMS(out)[1]==out_dim[1]
+         && CudaNdarray_HOST_DIMS(out)[2]==out_dim[2]
+         && CudaNdarray_HOST_DIMS(out)[3]==out_dim[3])
    {
      rval = out;
      Py_INCREF(rval);
@@ -1422,9 +1422,9 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
    else
    {
      if (out && verbose)
-	fprintf(stderr,
-		"INFO: Conv is ignoring 'out' argument with wrong"
-		" structure.\n");
+        fprintf(stderr,
+                "INFO: Conv is ignoring 'out' argument with wrong"
+                " structure.\n");
      rval = (CudaNdarray*)CudaNdarray_NewDims(4,out_dim);
      //rval might be null
    }

--- a/theano/sandbox/cuda/conv_full_kernel.cu
+++ b/theano/sandbox/cuda/conv_full_kernel.cu
@@ -37,20 +37,20 @@ conv_full_patch_split( float* img, float* kern, float* out, int img_len, int img
      int img_row = out_row;

      for (int row=0; row < kern_len; row++) {//loop over row
-	int inverse_row = (img_row-row);
-	if(inverse_row<0 ||inverse_row>=(img_len))continue;//row outside the image
-
-	const float* idx_in=&d_img[inverse_row*img_wid];
-	const float* idx_kern=&d_kern[row*kern_wid];
-	int img_col = out_col;
-	int col=0,last=0;
-	for (col=0,last=img_col; col < kern_wid; col++,last--) {//loop over col
-	  if(last<0 ||last>=(img_wid))continue;//col outside the image	
-	  sum+=idx_in[last]*idx_kern[col];
-	}
+        int inverse_row = (img_row-row);
+        if(inverse_row<0 ||inverse_row>=(img_len))continue;//row outside the image
+
+        const float* idx_in=&d_img[inverse_row*img_wid];
+        const float* idx_kern=&d_kern[row*kern_wid];
+        int img_col = out_col;
+        int col=0,last=0;
+        for (col=0,last=img_col; col < kern_wid; col++,last--) {//loop over col
+          if(last<0 ||last>=(img_wid))continue;//col outside the image        
+          sum+=idx_in[last]*idx_kern[col];
+        }
      }
      out[batch_id*out_len*out_wid+//the output image
-	  out_row*out_wid+out_col] = sum;
+          out_row*out_wid+out_col] = sum;
    }
 }

@@ -61,8 +61,8 @@ conv_full_patch_split( float* img, float* kern, float* out, int img_len, int img
 //dynamic shared memory: img_len*img_wid+kern_len*kern_wid
 __global__ void
 conv_full_patch( float* img, float* kern, float* out,
-		 int img_len, int img_wid,
-		 int kern_len, int kern_wid, int nkern, int nstack)
+                 int img_len, int img_wid,
+                 int kern_len, int kern_wid, int nkern, int nstack)
 {
  int __shared__ out_len, out_wid, nb_thread_id;
  out_len = img_len + kern_len - 1;
@@ -105,12 +105,12 @@ conv_full_patch( float* img, float* kern, float* out,
      
      if(img_col<0){col=-img_col;img_col+=col;}
      for (; col < max_col; col++, img_col++) {//loop over col
-	sum+=idx_in[col]*idx_kern[col];
+        sum+=idx_in[col]*idx_kern[col];
      }
    }
    out[batch_id*out_wid*out_len*nkern+//the good batch
-	out_wid*out_len*blockIdx.y+//the output image
-	out_row*out_wid+out_col] = sum;
+        out_wid*out_len*blockIdx.y+//the output image
+        out_row*out_wid+out_col] = sum;
 }

 //we store the full image and the full kernel in the shared memory
@@ -123,11 +123,11 @@ conv_full_patch( float* img, float* kern, float* out,
 template<bool img_c_contiguous_2d, bool kern_c_contiguous_2d>
 __global__ void
 conv_full_patch_stack( float* img, float* kern, float* out,
-		       int img_len, int img_wid,
-		       int kern_len, int kern_wid, int nkern, int nstack,
-		       int img_stride_col, int img_stride_row,
-		       int kern_stride_col, int kern_stride_row, 
-		       int kern_stride_stack, int kern_stride_nkern)
+                       int img_len, int img_wid,
+                       int kern_len, int kern_wid, int nkern, int nstack,
+                       int img_stride_col, int img_stride_row,
+                       int kern_stride_col, int kern_stride_row, 
+                       int kern_stride_stack, int kern_stride_nkern)
 {
  int __shared__ out_len, out_wid, nb_thread_id;
  out_len = img_len + kern_len - 1;
@@ -159,26 +159,26 @@ conv_full_patch_stack( float* img, float* kern, float* out,


      for (int row=0; row < kern_len; row++) {//loop over row
-	if(row+out_row-kern_len+1<0 || row+out_row-kern_len+1>=img_len)continue;
-	const float* idx_in=&d_img[(row+out_row-kern_len+1)*img_wid+out_col-kern_wid+1];
-	const float* idx_kern=&d_kern[row*kern_wid];
-	int col=0;
-	int max_col=kern_wid;
-	int img_col=out_col-kern_wid+1;
-	max_col=min(max_col,img_wid-img_col);
-
-	if(img_col<0){col=-img_col;img_col+=col;}
-	for (; col < max_col; col++, img_col++) {//loop over col
-	  sum+=idx_in[col]*idx_kern[col];
-	}
+        if(row+out_row-kern_len+1<0 || row+out_row-kern_len+1>=img_len)continue;
+        const float* idx_in=&d_img[(row+out_row-kern_len+1)*img_wid+out_col-kern_wid+1];
+        const float* idx_kern=&d_kern[row*kern_wid];
+        int col=0;
+        int max_col=kern_wid;
+        int img_col=out_col-kern_wid+1;
+        max_col=min(max_col,img_wid-img_col);
+
+        if(img_col<0){col=-img_col;img_col+=col;}
+        for (; col < max_col; col++, img_col++) {//loop over col
+          sum+=idx_in[col]*idx_kern[col];
+        }
      }
      //Needed as not all thread finish at the same time the loop
      //And we don't want to overwrite the shared memory.
      __syncthreads();
    }
    out[batch_id*out_wid*out_len*nkern+//the good batch
-	out_wid*out_len*blockIdx.y+//the output image
-	out_row*out_wid+out_col] = sum;
+        out_wid*out_len*blockIdx.y+//the output image
+        out_row*out_wid+out_col] = sum;
 }

 /**
@@ -202,13 +202,13 @@ conv_full_patch_stack( float* img, float* kern, float* out,
 template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool low_mem >
 __global__ void
 conv_full_patch_stack_padded( float* img, float* kern, float* out,
-		  const int img_len, const int img_wid,
-		  const int kern_len, const int kern_wid,
-		  const int nkern, const int nstack,
-		  const int img_stride_col, const int img_stride_row,
-		  const int img_stride_stack, const int img_stride_batch,
-		  const int kern_stride_col, const int kern_stride_row,
-		  const int kern_stride_stack, const int kern_stride_nkern)
+                  const int img_len, const int img_wid,
+                  const int kern_len, const int kern_wid,
+                  const int nkern, const int nstack,
+                  const int img_stride_col, const int img_stride_row,
+                  const int img_stride_stack, const int img_stride_batch,
+                  const int kern_stride_col, const int kern_stride_row,
+                  const int kern_stride_stack, const int kern_stride_nkern)
 {
  int __shared__ out_len, out_wid, nb_thread_id;
  out_len = img_len + kern_len - 1;
@@ -242,26 +242,26 @@ conv_full_patch_stack_padded( float* img, float* kern, float* out,
      const int out_row = ty;//output row
      float sum = 0.0f;
      for (int stack = 0;stack<nstack;stack++,kern+=kern_stride_stack,
-	     img+=img_stride_stack){
-	  __syncthreads();
-	load_padded_col_to_shared(d_img+img_wid_valid*(kern_len-1),img,
-				  thread_id,nb_thread_id,img_wid,img_len,
-				  img_stride_col, img_stride_row, kern_wid-1,
-				  c_contiguous);
-	load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid,kern_len,
-		       kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
-	__syncthreads();
-
-	for (int row=0; row < kern_len; row++) {//loop over row
-	  const float* idx_kern=&d_kern[row*kern_wid];
-	  const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
-	  
-	  convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
-	}
+             img+=img_stride_stack){
+          __syncthreads();
+        load_padded_col_to_shared(d_img+img_wid_valid*(kern_len-1),img,
+                                  thread_id,nb_thread_id,img_wid,img_len,
+                                  img_stride_col, img_stride_row, kern_wid-1,
+                                  c_contiguous);
+        load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid,kern_len,
+                       kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
+        __syncthreads();
+
+        for (int row=0; row < kern_len; row++) {//loop over row
+          const float* idx_kern=&d_kern[row*kern_wid];
+          const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
+          
+          convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
+        }
      }
      out[batch_id*out_wid*out_len*nkern+//the good batch
-	  kern_id*out_wid*out_len+//the output image
-	  out_row*out_wid+out_col] = sum;
+          kern_id*out_wid*out_len+//the output image
+          out_row*out_wid+out_col] = sum;
    }else if(split && !low_mem){
      fill(d_img,img_wid_valid*(img_len+2*kern_len-2), 0, thread_id, nb_thread_id);
      //out_len_max must by higher then out_len as we need all thread when we load the image as the nb_rows is not always a multiple of out_len.
@@ -269,35 +269,35 @@ conv_full_patch_stack_padded( float* img, float* kern, float* out,
      //TODO pass a parameter nb_split
      out_len_max = (out_len/blockDim.y+(out_len%blockDim.y==0?0:1))*blockDim.y;
      for(int out_row = ty;out_row<out_len_max;out_row+=nb_rows){
-	float sum = 0.0f;
-	for (int stack = 0;stack<nstack;stack++){
-	  __syncthreads();
-	  //TODO: load only the part of the image needed or put the partial result in shared memory
-	  load_padded_col_to_shared(d_img+img_wid_valid*(kern_len-1),
-				    img+img_stride_stack*stack,
-				    thread_id,nb_thread_id,img_wid,img_len,
-				    img_stride_col, img_stride_row, kern_wid-1,
-				    c_contiguous);
-	  load_to_shared(d_kern, kern+kern_stride_stack*stack,
-			 thread_id, nb_thread_id, kern_wid,kern_len,
-			 kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
-	  __syncthreads();
-	  //The if is needed as on Fermi as reading out of bound index from shared memory generate an error.
-	  //Not needed on generation before as they worked anyway. Removing the if generate the good code
-	  //as we store the result of only the good thread.
-	  //This was with nvcc 3.0 on an GTX470 card.
-	  if(out_row<out_len)
-	    for (int row=0; row < kern_len; row++) {//loop over row
-	      const float* idx_kern=&d_kern[row*kern_wid];
-	      const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
-	      
-	      convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
-	    }
-	  if(out_row<out_len)
-	    out[batch_id*out_wid*out_len*nkern+//the good batch
-		out_wid*out_len*kern_id+//the output image
-		out_row*out_wid+out_col] = sum;
-	}
+        float sum = 0.0f;
+        for (int stack = 0;stack<nstack;stack++){
+          __syncthreads();
+          //TODO: load only the part of the image needed or put the partial result in shared memory
+          load_padded_col_to_shared(d_img+img_wid_valid*(kern_len-1),
+                                    img+img_stride_stack*stack,
+                                    thread_id,nb_thread_id,img_wid,img_len,
+                                    img_stride_col, img_stride_row, kern_wid-1,
+                                    c_contiguous);
+          load_to_shared(d_kern, kern+kern_stride_stack*stack,
+                         thread_id, nb_thread_id, kern_wid,kern_len,
+                         kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
+          __syncthreads();
+          //The if is needed as on Fermi as reading out of bound index from shared memory generate an error.
+          //Not needed on generation before as they worked anyway. Removing the if generate the good code
+          //as we store the result of only the good thread.
+          //This was with nvcc 3.0 on an GTX470 card.
+          if(out_row<out_len)
+            for (int row=0; row < kern_len; row++) {//loop over row
+              const float* idx_kern=&d_kern[row*kern_wid];
+              const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
+              
+              convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
+            }
+          if(out_row<out_len)
+            out[batch_id*out_wid*out_len*nkern+//the good batch
+                out_wid*out_len*kern_id+//the output image
+                out_row*out_wid+out_col] = sum;
+        }
      }
    }else{//low_mem version
      //don't need to fill the last rows padding as this is done later.
@@ -306,46 +306,46 @@ conv_full_patch_stack_padded( float* img, float* kern, float* out,
      __shared__ int out_len_max;
      //TODO pass a parameter nb_split
      if(thread_id==0)
-	out_len_max = (out_len/nb_rows+(out_len%nb_rows==0?0:1))*nb_rows;
+        out_len_max = (out_len/nb_rows+(out_len%nb_rows==0?0:1))*nb_rows;
      __syncthreads();
      for(int out_row = ty, out_row_iter=0;out_row<out_len_max;
-	  out_row+=nb_rows, out_row_iter++){
-	float sum = 0.0f;
-	for (int stack = 0;stack<nstack;stack++){
-	  __syncthreads();
-	  const int len_to_load=min(kern_len+nb_rows,img_len-out_row_iter*nb_rows);//nb rows to load, min(nb_rows for this iter, nb rows left in the image)
-	  const int empty_row = max(kern_len-1-out_row_iter*nb_rows,0);//number of empty row at the start
-	  //we need to reload some row as when we change of out_row we lost the last load du to the stack.
-	  const int previous_row = min(out_row_iter*nb_rows,kern_len-1);//number of row from last out_row iteration to reload
-	  load_padded_col_to_shared(d_img+(kern_len-1-previous_row)*img_wid_valid,
-				    img+img_stride_stack*stack//the good stack image
-				    +(out_row_iter*nb_rows-previous_row)*img_stride_row,//the good split top row.
-				    thread_id,nb_thread_id,img_wid,
-				    len_to_load+previous_row,
-				    img_stride_col, img_stride_row, kern_wid-1,
-				    c_contiguous);
-	  //TODO: fill the last row padding only when needed.
-	  //We always fill the last rows padding event when not needed.
-	  int row_to_fill = 2*kern_len-2+nb_rows- empty_row - previous_row - len_to_load;
-	  row_to_fill = min(row_to_fill,kern_len-1);
-	  fill(d_img+(kern_len-1+len_to_load)*img_wid_valid,
-	       img_wid_valid*row_to_fill, 0, thread_id, nb_thread_id);
-	  load_to_shared(d_kern, kern+kern_stride_stack*stack,
-			 thread_id, nb_thread_id, kern_wid,kern_len,
-			 kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
-	  __syncthreads();
-
-	  for (int row=0; row < kern_len; row++) {//loop over row
-	    const float* idx_kern=&d_kern[row*kern_wid];
-	    const float* idx_in=&d_img[(row+out_row-out_row_iter*nb_rows)*img_wid_valid+out_col];
-	    
-	    convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
-	  }
-	}
-	if(out_row<out_len)
-	  out[batch_id*out_wid*out_len*nkern+//the good batch
-	      out_wid*out_len*kern_id+//the output image
-	      out_row*out_wid+out_col] = sum;
+          out_row+=nb_rows, out_row_iter++){
+        float sum = 0.0f;
+        for (int stack = 0;stack<nstack;stack++){
+          __syncthreads();
+          const int len_to_load=min(kern_len+nb_rows,img_len-out_row_iter*nb_rows);//nb rows to load, min(nb_rows for this iter, nb rows left in the image)
+          const int empty_row = max(kern_len-1-out_row_iter*nb_rows,0);//number of empty row at the start
+          //we need to reload some row as when we change of out_row we lost the last load du to the stack.
+          const int previous_row = min(out_row_iter*nb_rows,kern_len-1);//number of row from last out_row iteration to reload
+          load_padded_col_to_shared(d_img+(kern_len-1-previous_row)*img_wid_valid,
+                                    img+img_stride_stack*stack//the good stack image
+                                    +(out_row_iter*nb_rows-previous_row)*img_stride_row,//the good split top row.
+                                    thread_id,nb_thread_id,img_wid,
+                                    len_to_load+previous_row,
+                                    img_stride_col, img_stride_row, kern_wid-1,
+                                    c_contiguous);
+          //TODO: fill the last row padding only when needed.
+          //We always fill the last rows padding event when not needed.
+          int row_to_fill = 2*kern_len-2+nb_rows- empty_row - previous_row - len_to_load;
+          row_to_fill = min(row_to_fill,kern_len-1);
+          fill(d_img+(kern_len-1+len_to_load)*img_wid_valid,
+               img_wid_valid*row_to_fill, 0, thread_id, nb_thread_id);
+          load_to_shared(d_kern, kern+kern_stride_stack*stack,
+                         thread_id, nb_thread_id, kern_wid,kern_len,
+                         kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
+          __syncthreads();
+
+          for (int row=0; row < kern_len; row++) {//loop over row
+            const float* idx_kern=&d_kern[row*kern_wid];
+            const float* idx_in=&d_img[(row+out_row-out_row_iter*nb_rows)*img_wid_valid+out_col];
+            
+            convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
+          }
+        }
+        if(out_row<out_len)
+          out[batch_id*out_wid*out_len*nkern+//the good batch
+              out_wid*out_len*kern_id+//the output image
+              out_row*out_wid+out_col] = sum;
      }
    }
 }
@@ -366,8 +366,8 @@ template <> __device__ float everything_dot<1>(const float * x, const int sx, co
 template<int NSTACK>
 __global__ void
 conv_full_load_everything( float* img, float* kern, float* out,
-		 int img_len, int img_wid,
-		 int kern_len, int kern_wid, int nkern, int nstack,
+                 int img_len, int img_wid,
+                 int kern_len, int kern_wid, int nkern, int nstack,
                 int img_stride_col, int img_stride_row,
                 int img_stride_stack, int img_stride_batch,
                 int kern_stride_col, int kern_stride_row, 

--- a/theano/sandbox/cuda/conv_kernel.cu
+++ b/theano/sandbox/cuda/conv_kernel.cu
@@ -20,7 +20,7 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) {
      const %(type)s* idx_kern=&hvals[j*dim_ker[1]];
      int new_n = (pos_n+dim_ker[1]-1);
      for (int k=0,last=new_n; k < dim_ker[1]; k++,last--) {
-	sum+=idx_kern[k]*idx_in[last];
+        sum+=idx_kern[k]*idx_in[last];
      }
    }//for j
    out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum;
@@ -99,9 +99,9 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_
 * We load from global memory to shared memory. The outer if is optimized away at compilation.
 */
 __device__ void load_to_shared(float * dst, const float * src, const int thread_id,
-			       int nb_thread, const int nb_col, const int nb_row,
-			       const int stride_col, const int stride_row,
-			       const bool flipped=false, const bool c_contiguous=true){
+                               int nb_thread, const int nb_col, const int nb_row,
+                               const int stride_col, const int stride_row,
+                               const bool flipped=false, const bool c_contiguous=true){
    if (c_contiguous)
    {
        load_to_shared(dst, src, thread_id, nb_thread, nb_col*nb_row, flipped);
@@ -143,10 +143,10 @@ __device__ void fill(float * dst, int N, float value, int thread_id, int nb_thre
 * We put the image at the center of another one. Usefull to padd an image with 0.
 */
 __device__ void load_padded_col_to_shared(float * dst, const float * src, 
-					  const int thread_id, const int nb_thread,
-					  const int nb_col, const int nb_row, 
-					  const int stride_col, const int stride_row,
-					  const int wid_pad, const bool c_contiguous=true){
+                                          const int thread_id, const int nb_thread,
+                                          const int nb_col, const int nb_row, 
+                                          const int stride_col, const int stride_row,
+                                          const int wid_pad, const bool c_contiguous=true){
  if(c_contiguous){//flipped==false
    for(int i=thread_id;i<nb_col*nb_row;i+=nb_thread){
      int col=i%nb_col;
@@ -165,24 +165,24 @@ __device__ void load_padded_col_to_shared(float * dst, const float * src,
 }

 template<int i> __device__ float convolutionRowNoFlip(const float *data,
-						      const float *kern){
+                                                      const float *kern){
    return convolutionRowNoFlip<i/2>(data, kern)+ convolutionRowNoFlip<(i+1)/2>(data+i/2, kern+i/2) ;
  //return data[i-1] * kern[i-1] + convolutionRowNoFlip<i - 1>(data,kern);
 }

 template<> __device__ float convolutionRowNoFlip<1>(const float *data,
-						    const float *kern){
+                                                    const float *kern){
    return data[0]*kern[0];
 }
 template<> __device__ float convolutionRowNoFlip<0>(const float *data,
-						    const float *kern){
+                                                    const float *kern){
    return 0;
 }

 template<int KERN_WIDTH>
 __device__ void convolutionRowNoFlip(float& sum,
-				     const float *data,
-				     const float *kern, const int kern_wid){
+                                     const float *data,
+                                     const float *kern, const int kern_wid){
  if(KERN_WIDTH>0)
    sum+=convolutionRowNoFlip<KERN_WIDTH>(data,kern);
  else
@@ -219,8 +219,8 @@ __device__ void store_or_accumulate(float& dst,const float value ){
 template<bool flipped_kern, int KERN_WIDTH, bool split>
 __global__ void
 conv_patch( float* img, float* kern, float* out,
-	    int img_len, int img_wid, int kern_len, int kern_wid,
-	    int nkern, int nstack)
+            int img_len, int img_wid, int kern_len, int kern_wid,
+            int nkern, int nstack)
 {
  int __shared__ out_len, out_wid, nb_thread_id;
  out_len = img_len - kern_len + 1;
@@ -255,24 +255,24 @@ conv_patch( float* img, float* kern, float* out,
      int out_row = ty;//output row
      float sum = 0.0f;
      for (int row=0; row < kern_len; row++) {//loop over row
-	const float* idx_kern=&d_kern[row*kern_wid];
-	const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
-	convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
+        const float* idx_kern=&d_kern[row*kern_wid];
+        const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
+        convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
      }
      out[batch_id*out_wid*out_len*nkern+//the good batch
-	  blockIdx.y*out_wid*out_len+//the output image
-	  out_row*out_wid+out_col] = sum;
+          blockIdx.y*out_wid*out_len+//the output image
+          out_row*out_wid+out_col] = sum;
    }else{
      for(int out_row=ty;out_row<out_len;out_row+=blockDim.y){
-	float sum = 0.0f;
-	for (int row=0; row < kern_len; row++) {//loop over row
-	  const float* idx_kern=&d_kern[row*kern_wid];
-	  const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
-	  convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
-	}
-	out[batch_id*out_wid*out_len*nkern+//the good batch
-	    kern_id*out_wid*out_len+//the output image
-	    out_row*out_wid+out_col] = sum;
+        float sum = 0.0f;
+        for (int row=0; row < kern_len; row++) {//loop over row
+          const float* idx_kern=&d_kern[row*kern_wid];
+          const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
+          convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
+        }
+        out[batch_id*out_wid*out_len*nkern+//the good batch
+            kern_id*out_wid*out_len+//the output image
+            out_row*out_wid+out_col] = sum;
      }
    }
 }
@@ -302,12 +302,12 @@ conv_patch( float* img, float* kern, float* out,
 template<bool flipped_kern, bool accumulate, int KERN_WIDTH, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample>
 __global__ void
 conv_patch_stack( float* img, float* kern, float* out,
-		  int img_len, int img_wid, int kern_len, int kern_wid,
-		  int out_len, int out_wid,
-		  int nkern, int nstack, int img_stride_col,int img_stride_row,
-		  int img_stride_stack, int img_stride_batch,
-		  int kern_stride_col, int kern_stride_row,
-		  int kern_stride_stack, int kern_stride_nkern, int dx, int dy)
+                  int img_len, int img_wid, int kern_len, int kern_wid,
+                  int out_len, int out_wid,
+                  int nkern, int nstack, int img_stride_col,int img_stride_row,
+                  int img_stride_stack, int img_stride_batch,
+                  int kern_stride_col, int kern_stride_row,
+                  int kern_stride_stack, int kern_stride_nkern, int dx, int dy)
 {
  int __shared__ nb_thread_id;
  nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
@@ -333,43 +333,43 @@ conv_patch_stack( float* img, float* kern, float* out,
      float sum = 0.0f;
      
      for (int stack = 0;stack<nstack;stack++,kern+=kern_stride_stack,
-	     img+=img_stride_stack){
-	load_to_shared(d_img,img,thread_id,nb_thread_id,img_wid,img_len,
-		       img_stride_col, img_stride_row, false, img_c_contiguous_2d);
-	if(preload_full_kern)
-	  load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid,kern_len,
-			 kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
-
-	__syncthreads();
-	
-	for (int row=0; row < kern_len; row++) {//loop over row
-	  if(!preload_full_kern){
-	    __syncthreads();
-	    int idx2;
-	    if(flipped_kern) idx2=(kern_len-row-1)*kern_stride_row;
-	    else idx2=(row)*kern_stride_row;
-	    load_to_shared(d_kern, kern+idx2, thread_id, nb_thread_id, kern_wid,1,
-			   kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
-	    __syncthreads();	      
-	  }
-
-	  const float* idx_kern;
-	  if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
-	  else idx_kern=d_kern;
-	  const float* idx_in;
-	  if(subsample)
-	    idx_in=&d_img[(row+out_row*dx)*img_wid+out_col*dy];
-	  else
-	    idx_in=&d_img[(row+out_row)*img_wid+out_col];
-	  
-	  convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
-	}
-	__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
+             img+=img_stride_stack){
+        load_to_shared(d_img,img,thread_id,nb_thread_id,img_wid,img_len,
+                       img_stride_col, img_stride_row, false, img_c_contiguous_2d);
+        if(preload_full_kern)
+          load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid,kern_len,
+                         kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
+
+        __syncthreads();
+        
+        for (int row=0; row < kern_len; row++) {//loop over row
+          if(!preload_full_kern){
+            __syncthreads();
+            int idx2;
+            if(flipped_kern) idx2=(kern_len-row-1)*kern_stride_row;
+            else idx2=(row)*kern_stride_row;
+            load_to_shared(d_kern, kern+idx2, thread_id, nb_thread_id, kern_wid,1,
+                           kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
+            __syncthreads();              
+          }
+
+          const float* idx_kern;
+          if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
+          else idx_kern=d_kern;
+          const float* idx_in;
+          if(subsample)
+            idx_in=&d_img[(row+out_row*dx)*img_wid+out_col*dy];
+          else
+            idx_in=&d_img[(row+out_row)*img_wid+out_col];
+          
+          convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
+        }
+        __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
      }
      store_or_accumulate<accumulate>(
-				      out[batch_id*out_wid*out_len*nkern+//the good batch
-					  out_wid*out_len*kern_id+//the output image
-					  out_row*out_wid+out_col],sum);
+                                      out[batch_id*out_wid*out_len*nkern+//the good batch
+                                          out_wid*out_len*kern_id+//the output image
+                                          out_row*out_wid+out_col],sum);
    }else{

      float __shared__ *kern_, *img_;
@@ -383,54 +383,54 @@ conv_patch_stack( float* img, float* kern, float* out,
      //TODO: inverse the out_row and stack loop to don't load the date as frequently!
      //TODO: do this happen elsewhere?
      for(;out_row<out_len_max;out_row+=blockDim.y){
-	float sum = 0.0f;
-	for (int stack = 0;stack<nstack;stack++){
-	  //TODO: load only the part of the image needed or put the partial result in shared memory
-	  int idx1=img_stride_stack*stack;
-	  load_to_shared(d_img,img_+idx1,thread_id,nb_thread_id,img_wid,img_len,
-			 img_stride_col, img_stride_row, false, img_c_contiguous_2d);
-	  if(preload_full_kern){
-	    int idx2=kern_stride_stack*stack;
-	    load_to_shared(d_kern, kern_+idx2, thread_id, nb_thread_id, kern_wid,kern_len,
-			   kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
-	  }
-	  __syncthreads();
-	  
-	  for (int row=0; row < kern_len; row++) {//loop over row
-	    if(!preload_full_kern){
-	      __syncthreads();
-	      int idx2=kern_stride_stack*stack;
-	      if(flipped_kern)
-		idx2+=(kern_len-row-1)*kern_stride_row;
-	      else
-		idx2+=(row)*kern_stride_row;
-	      load_to_shared(d_kern, kern_+idx2, thread_id, nb_thread_id, kern_wid,1,
-			     kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
-	      __syncthreads();	      
-	    }
-	    const float* idx_kern;
-	    if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
-	    else idx_kern=d_kern;
-	    const float* idx_in;
-	    if(subsample)
-	      idx_in=&d_img[(row+out_row*dx)*img_wid+out_col*dy];
-	    else
-	      idx_in=&d_img[(row+out_row)*img_wid+out_col];
-	    
-	    //if needed as on Fermi as reading out of bound index from shared memory generate an error.
-	    //Not needed on generation before as they worked anyway. Removing the if generate the good code
-	    //as we store the result of only the good thread.
-	    //This was with nvcc 3.0 on an GTX470 card.
-	    if(out_row<out_len)
-	      convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
-	  }
-	  __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
-	}
-	if(out_row<out_len)
-	  store_or_accumulate<accumulate>(
-					  out[batch_id*out_wid*out_len*nkern+//the good batch
-					      out_wid*out_len*kern_id+//the output image
-					      out_row*out_wid+out_col],sum);
+        float sum = 0.0f;
+        for (int stack = 0;stack<nstack;stack++){
+          //TODO: load only the part of the image needed or put the partial result in shared memory
+          int idx1=img_stride_stack*stack;
+          load_to_shared(d_img,img_+idx1,thread_id,nb_thread_id,img_wid,img_len,
+                         img_stride_col, img_stride_row, false, img_c_contiguous_2d);
+          if(preload_full_kern){
+            int idx2=kern_stride_stack*stack;
+            load_to_shared(d_kern, kern_+idx2, thread_id, nb_thread_id, kern_wid,kern_len,
+                           kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
+          }
+          __syncthreads();
+          
+          for (int row=0; row < kern_len; row++) {//loop over row
+            if(!preload_full_kern){
+              __syncthreads();
+              int idx2=kern_stride_stack*stack;
+              if(flipped_kern)
+                idx2+=(kern_len-row-1)*kern_stride_row;
+              else
+                idx2+=(row)*kern_stride_row;
+              load_to_shared(d_kern, kern_+idx2, thread_id, nb_thread_id, kern_wid,1,
+                             kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
+              __syncthreads();              
+            }
+            const float* idx_kern;
+            if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
+            else idx_kern=d_kern;
+            const float* idx_in;
+            if(subsample)
+              idx_in=&d_img[(row+out_row*dx)*img_wid+out_col*dy];
+            else
+              idx_in=&d_img[(row+out_row)*img_wid+out_col];
+            
+            //if needed as on Fermi as reading out of bound index from shared memory generate an error.
+            //Not needed on generation before as they worked anyway. Removing the if generate the good code
+            //as we store the result of only the good thread.
+            //This was with nvcc 3.0 on an GTX470 card.
+            if(out_row<out_len)
+              convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
+          }
+          __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
+        }
+        if(out_row<out_len)
+          store_or_accumulate<accumulate>(
+                                          out[batch_id*out_wid*out_len*nkern+//the good batch
+                                              out_wid*out_len*kern_id+//the output image
+                                              out_row*out_wid+out_col],sum);
      }

    }
@@ -454,11 +454,11 @@ conv_patch_stack( float* img, float* kern, float* out,
 template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool preload_full_kern>
 __global__ void
 conv_patch_stack_reduce( float* img, float* kern, float* out,
-		  int img_len, int img_wid, int kern_len, int kern_wid,
-		  int nkern, int nstack, int img_stride_col,int img_stride_row,
-		  int img_stride_stack, int img_stride_batch,
-		  int kern_stride_col, int kern_stride_row,
-		  int kern_stride_stack, int kern_stride_nkern)
+                  int img_len, int img_wid, int kern_len, int kern_wid,
+                  int nkern, int nstack, int img_stride_col,int img_stride_row,
+                  int img_stride_stack, int img_stride_batch,
+                  int kern_stride_col, int kern_stride_row,
+                  int kern_stride_stack, int kern_stride_nkern)
 {
  //int __shared__ out_len, out_wid, nb_thread_id;
  //out_len = img_len - kern_len + 1;
@@ -496,12 +496,12 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
    img+=img_stride_batch*batch_id;//the good batch

    for (int stack = 0;stack<nstack;stack++,kern+=kern_stride_stack,
-	   img+=img_stride_stack){
+           img+=img_stride_stack){
      __syncthreads();
      load_to_shared(d_img, img, thread_id, nb_thread_id, img_wid, img_len,
-		     img_stride_col, img_stride_row, false, c_contiguous);
+                     img_stride_col, img_stride_row, false, c_contiguous);
      if(split && ! preload_full_kern){
-	for(int first_row=0;first_row<kern_len;first_row+=blockDim.z){
+        for(int first_row=0;first_row<kern_len;first_row+=blockDim.z){
            //N.B. - Jan 30, 2011 with CUDA 3.2 I found that without the explicit cast to
            // (int)blockDim.z, idx3 would sometimes be negative. I'm rusty on my signed vs. unsigned
            // details, but that seemed really weird. tricky bug to find too.
@@ -510,36 +510,36 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
              : first_row;
          int len3 = min(blockDim.z, kern_len - first_row);

-	  __syncthreads();
-	  load_to_shared(d_kern, kern+idx3*kern_stride_row, thread_id, nb_thread_id, kern_wid, len3,
-			 kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
-	  __syncthreads();
-	  const float* idx_kern=&d_kern[tz*kern_wid];
-	  const float* idx_in=&d_img[(first_row+tz+out_row)*img_wid+out_col];
-	  float sum2 = 0;
-	  if(tz<len3)
-	    convolutionRowNoFlip<KERN_WIDTH>(sum2,idx_in,idx_kern,kern_wid);
-	  sum+=sum2;
-	}
+          __syncthreads();
+          load_to_shared(d_kern, kern+idx3*kern_stride_row, thread_id, nb_thread_id, kern_wid, len3,
+                         kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
+          __syncthreads();
+          const float* idx_kern=&d_kern[tz*kern_wid];
+          const float* idx_in=&d_img[(first_row+tz+out_row)*img_wid+out_col];
+          float sum2 = 0;
+          if(tz<len3)
+            convolutionRowNoFlip<KERN_WIDTH>(sum2,idx_in,idx_kern,kern_wid);
+          sum+=sum2;
+        }
      }else if(split){
-	load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
-		       kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
+        load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
+                       kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
        __syncthreads();
-	for(int row=tz;row<kern_len;row+=blockDim.z){
-	  const float* idx_kern=&d_kern[row*kern_wid];
-	  const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
-	  convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
-	}
+        for(int row=tz;row<kern_len;row+=blockDim.z){
+          const float* idx_kern=&d_kern[row*kern_wid];
+          const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
+          convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
+        }
      }else{
-	int row = tz;//The row of the kernel.
-	const float* idx_kern=&d_kern[row*kern_wid];
-	const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
-	load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
-		       kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
+        int row = tz;//The row of the kernel.
+        const float* idx_kern=&d_kern[row*kern_wid];
+        const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
+        load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
+                       kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
        __syncthreads();
-	convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
+        convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
      }
-	__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
+        __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
    }

    //reduce no sync because previous loop ends with sync
@@ -548,11 +548,11 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
    if(thread_id<out_len*out_wid){ // blockDim.x==out_wid, blockDim.y==out_len
      //sum=0;
      for(int i=1;i<blockDim.z;i++){
-	sum+=d_reduce[thread_id+i*out_wid*out_len];
+        sum+=d_reduce[thread_id+i*out_wid*out_len];
      }
      out[batch_id*out_wid*out_len*nkern+//the good batch
-	  out_wid*out_len*blockIdx.y+//the output image
-	  out_row*out_wid+out_col] = sum;
+          out_wid*out_len*blockIdx.y+//the output image
+          out_row*out_wid+out_col] = sum;
    }
 }

@@ -570,12 +570,12 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
 template<int KERN_WIDTH, bool c_contiguous>
 __global__ void
 conv_rows( float* img, float* kern, float* out,
-	   int img_len, int img_wid, int kern_len, int kern_wid,
-	   int nkern, int nstack,
-	   int img_stride_col, int img_stride_row,
-	   int img_stride_stack, int img_stride_batch,
-	   int kern_stride_col, int kern_stride_row,
-	   int kern_stride_stack, int kern_stride_nkern)
+           int img_len, int img_wid, int kern_len, int kern_wid,
+           int nkern, int nstack,
+           int img_stride_col, int img_stride_row,
+           int img_stride_stack, int img_stride_batch,
+           int kern_stride_col, int kern_stride_row,
+           int kern_stride_stack, int kern_stride_nkern)
 {
  int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id;
  float __shared__ *d_img, *d_kern;
@@ -599,9 +599,9 @@ conv_rows( float* img, float* kern, float* out,
    kern+=kern_stride_nkern*kern_id;//the good nkern

    load_to_shared(d_img,img,thread_id,nb_thread_id,img_wid,kern_len,
-		   img_stride_col, img_stride_row, false, c_contiguous);
+                   img_stride_col, img_stride_row, false, c_contiguous);
    load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid,kern_len,
-		   kern_stride_col, kern_stride_row, true, c_contiguous);
+                   kern_stride_col, kern_stride_row, true, c_contiguous);

    __syncthreads();
    float sum = 0.0f;
@@ -613,8 +613,8 @@ conv_rows( float* img, float* kern, float* out,
    }

    out[batch_id*out_wid*out_len*nkern+//the good batch
-	kern_id*out_wid*out_len+//the output image
-	out_row*out_wid+out_col] = sum;
+        kern_id*out_wid*out_len+//the output image
+        out_row*out_wid+out_col] = sum;
 }

 /**
@@ -631,12 +631,12 @@ conv_rows( float* img, float* kern, float* out,
 template<int KERN_WIDTH, bool c_contiguous>
 __global__ void
 conv_rows_stack( float* img, float* kern, float* out,
-		 const int img_len, const int img_wid, const int kern_len, const int kern_wid,
-		 const int nkern, const int nstack,
-		 const int img_stride_col, const int img_stride_row,
-		 const int img_stride_stack, const int img_stride_batch,
-		 const int kern_stride_col, const int kern_stride_row,
-		 const int kern_stride_stack, const int kern_stride_nkern)
+                 const int img_len, const int img_wid, const int kern_len, const int kern_wid,
+                 const int nkern, const int nstack,
+                 const int img_stride_col, const int img_stride_row,
+                 const int img_stride_stack, const int img_stride_batch,
+                 const int kern_stride_col, const int kern_stride_row,
+                 const int kern_stride_stack, const int kern_stride_nkern)
 {
  int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows;
  float  __shared__ *d_img, *d_kern;
@@ -698,21 +698,21 @@ conv_rows_stack( float* img, float* kern, float* out,

      offset = kern_stride_nkern * kern_id + kern_stride_stack * stack;
      load_to_shared(d_kern, kern+offset, thread_id, nb_thread_id, kern_wid,kern_len,
-		     kern_stride_col, kern_stride_row, true, c_contiguous);
+                     kern_stride_col, kern_stride_row, true, c_contiguous);

      __syncthreads();

      for (int row=0; row < kern_len; row++) {//loop over row
-	const float* idx_kern=&d_kern[row*kern_wid];
-	const float* idx_in=&d_img[(row+shared_row)*img_wid+out_col];
-	convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
+        const float* idx_kern=&d_kern[row*kern_wid];
+        const float* idx_in=&d_img[(row+shared_row)*img_wid+out_col];
+        convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
      }
      __syncthreads();//to be sure all thread have finished before we modif the shared memory.
    }
    if (out_row < out_len)
      out[batch_id*out_wid*out_len*nkern+//the good batch
-	  kern_id*out_wid*out_len+//the output image
-	  out_row*out_wid+out_col] = sum;
+          kern_id*out_wid*out_len+//the output image
+          out_row*out_wid+out_col] = sum;
 }

 /**
@@ -729,12 +729,12 @@ conv_rows_stack( float* img, float* kern, float* out,
 template<int KERN_WIDTH, bool c_contiguous, bool preload_full_kern>
 __global__ void
 conv_rows_stack2( float* img, float* kern, float* out,
-		 const int img_len, const int img_wid, const int kern_len, const int kern_wid,
-		 const int nkern, const int nstack,
-		 const int img_stride_col, const int img_stride_row,
-		 const int img_stride_stack, const int img_stride_batch,
-		 const int kern_stride_col, const int kern_stride_row,
-		 const int kern_stride_stack, const int kern_stride_nkern)
+                 const int img_len, const int img_wid, const int kern_len, const int kern_wid,
+                 const int nkern, const int nstack,
+                 const int img_stride_col, const int img_stride_row,
+                 const int img_stride_stack, const int img_stride_batch,
+                 const int kern_stride_col, const int kern_stride_row,
+                 const int kern_stride_stack, const int kern_stride_nkern)
 {
  int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows;
  float  __shared__ *d_img, *d_kern;
@@ -763,54 +763,54 @@ conv_rows_stack2( float* img, float* kern, float* out,
    
      __syncthreads();
      load_to_shared(d_img,img+_idx2,thread_id,nb_thread_id,img_wid,nb_rows-1,
-      		     img_stride_col, img_stride_row, false, c_contiguous);
+                           img_stride_col, img_stride_row, false, c_contiguous);
      if(preload_full_kern)
-	load_to_shared(d_kern, kern+kern_stride_nkern*kern_id+kern_stride_stack*stack,
-		       thread_id, nb_thread_id, kern_wid,kern_len,
-		       kern_stride_col, kern_stride_row, true, c_contiguous);
+        load_to_shared(d_kern, kern+kern_stride_nkern*kern_id+kern_stride_stack*stack,
+                       thread_id, nb_thread_id, kern_wid,kern_len,
+                       kern_stride_col, kern_stride_row, true, c_contiguous);
      __syncthreads();

      for (int row=0; row < kern_len; row++) {//loop over row
-	__syncthreads();
-	if((blockIdx.x*nb_rows+row+nb_rows-1)<img_len){
-	  int _idx1=img_stride_batch*batch_id+img_stride_stack*stack;//selection the good image from the batch and stack
-	  _idx1+=(blockIdx.x*nb_rows)*img_stride_row;//select the good top row for the block of threads
-	  _idx1+=(row+nb_rows-1)*img_stride_row;//the current last row
-	  load_to_shared(d_img+((row+nb_rows-1)%nb_rows)*img_wid,
-			 img+_idx1, thread_id, nb_thread_id, img_wid, 1,
-			 img_stride_col, img_stride_row, false, c_contiguous);//we use d_img as a circular buffer.
-	}
-
-	if(!preload_full_kern){
-	  int _idx3=kern_stride_nkern*kern_id+kern_stride_stack*stack;//selection the good kern from the batch and stack
-	  _idx3+=(kern_len-row-1)*kern_stride_row;//the current last row flipped
-	  load_to_shared(d_kern, kern+_idx3,
-			 thread_id, nb_thread_id, kern_wid,1,
-			 kern_stride_col, kern_stride_row, true, c_contiguous);
-
-	}
-	__syncthreads();
-
-	//if needed as on Fermi as reading out of bound index from shared memory generate an error.
-	//Not needed on generation before as they worked anyway. Removing the if generate the good code
-	//as we store the result of only the good thread.
-	//This was with nvcc 3.0 on an GTX470 card.
-	if(out_row<out_len){
-	  const float* idx_kern;
-	  if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
-	  else idx_kern=d_kern;
-	  const float* idx_in=&d_img[((shared_row+row)%nb_rows)*img_wid+out_col];
-	  float sum_ =0.0f;
-	  convolutionRowNoFlip<KERN_WIDTH>(sum_,idx_in,idx_kern,kern_wid);
-	  sum+=sum_;//We pass by an intermediate variable to have more precission.
-	}
+        __syncthreads();
+        if((blockIdx.x*nb_rows+row+nb_rows-1)<img_len){
+          int _idx1=img_stride_batch*batch_id+img_stride_stack*stack;//selection the good image from the batch and stack
+          _idx1+=(blockIdx.x*nb_rows)*img_stride_row;//select the good top row for the block of threads
+          _idx1+=(row+nb_rows-1)*img_stride_row;//the current last row
+          load_to_shared(d_img+((row+nb_rows-1)%nb_rows)*img_wid,
+                         img+_idx1, thread_id, nb_thread_id, img_wid, 1,
+                         img_stride_col, img_stride_row, false, c_contiguous);//we use d_img as a circular buffer.
+        }
+
+        if(!preload_full_kern){
+          int _idx3=kern_stride_nkern*kern_id+kern_stride_stack*stack;//selection the good kern from the batch and stack
+          _idx3+=(kern_len-row-1)*kern_stride_row;//the current last row flipped
+          load_to_shared(d_kern, kern+_idx3,
+                         thread_id, nb_thread_id, kern_wid,1,
+                         kern_stride_col, kern_stride_row, true, c_contiguous);
+
+        }
+        __syncthreads();
+
+        //if needed as on Fermi as reading out of bound index from shared memory generate an error.
+        //Not needed on generation before as they worked anyway. Removing the if generate the good code
+        //as we store the result of only the good thread.
+        //This was with nvcc 3.0 on an GTX470 card.
+        if(out_row<out_len){
+          const float* idx_kern;
+          if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
+          else idx_kern=d_kern;
+          const float* idx_in=&d_img[((shared_row+row)%nb_rows)*img_wid+out_col];
+          float sum_ =0.0f;
+          convolutionRowNoFlip<KERN_WIDTH>(sum_,idx_in,idx_kern,kern_wid);
+          sum+=sum_;//We pass by an intermediate variable to have more precission.
+        }
      }
    }
    __syncthreads();
    if(out_row<out_len)
      out[batch_id*out_wid*out_len*nkern+//the good batch
-	  kern_id*out_wid*out_len+//the output image
-	  out_row*out_wid+out_col] = sum;
+          kern_id*out_wid*out_len+//the output image
+          out_row*out_wid+out_col] = sum;
 }

 /**
@@ -854,8 +854,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
        int img_rr = iR_logical + kern_len - 1 - rr;
        int reduceIdx = threadIdx.x * blockDim.y + threadIdx.y;
        float sum = 0.0f;
-	if(stack_loop){
-	  for (; ss < stacklen; ss+=blockDim.x){
+        if(stack_loop){
+          for (; ss < stacklen; ss+=blockDim.x){
            float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
            float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
            for (int cc = 0; cc < kern_wid; ++cc)
@@ -864,17 +864,17 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
                kk_0 += kern_str_C;
                ii_0 -= img_str_C;
            }
-	  }
-	}else{
-	  float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
-	  float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
-	  for (int cc = 0; cc < kern_wid; ++cc)
-	  {
-	    sum +=  kk_0[0] * ii_0[0];
-	    kk_0 += kern_str_C;
-	    ii_0 -= img_str_C;
          }
-	}
+        }else{
+          float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
+          float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
+          for (int cc = 0; cc < kern_wid; ++cc)
+          {
+            sum +=  kk_0[0] * ii_0[0];
+            kk_0 += kern_str_C;
+            ii_0 -= img_str_C;
+          }
+        }

        if (blockDim.x * blockDim.y == 1)
        {