out_size<=max_threads_dim0&&//Maximum of X threads by block
std::max(int(img_size_byte+2*kern_wid*sizeof(float)),out_size_byte*2)<shared_avail&&//there is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
!work_complete)
version=7;//conv_patch_stack_reduce, switch to version 8/13 automatically if needed.
}
if(!subsample&&c_contiguous&&
(version==0||version==2||version==-1)&&
out_wid<=max_threads_dim0&&//Maximum of X threads for block.x
nstack==1&&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail&&//there is only 16k of shared memory
!work_complete)//conv_patch
{
intnb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if(version==2&&out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case out_len is not a multiple of nb_split; we want nb_split to be the number of iterations.
boolimg_batch_stack_contiguous=(img_stride_stack==img_stride_row*img_len)&&(img_stride_batch==img_stride_stack*nstack);//don't support stride for nbatch and nstack
//if the lower 2 dims are c_contiguous but flipped, unflipping the
//stride and not flipping the kernel in shared memory
//allows using a version that uses fewer registers (so it is faster)
//the unflipped version of variable have the original value when
//we don't need to unflip it, but have the new value when we unflip it.
if(false&&!subsample&&//disabled as test fail for this kernel
(version==1||version==-1)&&
out_size<=max_threads_dim0&&//Maximum of X threads by block
(nbatch>20||version==1)&&// we only launch nbatch blocks, so make sure there is enough to be worth it, but if we specify the version, this check should not be done to allow testing.
nstack*img_size_byte+nstack*kern_size_byte<shared_avail&&//there is only 16k of shared memory
//template c_contiguous: if true, the img and kern are column and row contiguous, else we use the stride values from the params. The image needs to be c_contiguous in the nbatch and nstack dimensions.
//Needed as not all thread finish at the same time the loop
//And we don't want to overwrite the shared memory.
__syncthreads();
}
out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*blockIdx.y+//the output image
out_row*out_wid+out_col]=sum;
}
/**
* As conv_patch_stack, but used for the full convolution by padding the image in shared memory.
* I keep it separated from conv_patch as we take 19-20 registers which is more than the 10/16 max for each thread and thus this could lower the occupancy.
* Implementation of the valid convolution that keeps the full image and the full kernel in shared memory
* each thread computes only one value of the output if split is true. Otherwise it computes ceil((float)out_len/N) pixels.
* thread block size=out_wid, nb_rows (optimized value is ceil(out_len/N))
* grid block size=batch_id, nkern
* dynamic shared memory: full mem: (img_len+2*kern_len-2)*(img_wid+2*kern_wid-2)+kern_len*kern_wid
* nkern: the number of kernel, used to compute the output image to store the result
* nstack: the size of the stack, used to compute the image to load.
* template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
* template img_contiguous: if true, the img is column and row contiguous
* template preload_full_kern: works only when split is true. We don't load the full kernel at once, but we load ceil_intdiv(kern_len/nb_split) kernel rows at a time
sum+=sum_;//We pass by an intermediate variable to have more precision.
}
}
}
__syncthreads();
if(out_row<out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image
out_row*out_wid+out_col]=sum;
}
/**
* Implementation of 'valid' mode convolution that uses one block per output pixel, and uses a sum-reduce within each block to compute the
* kernel-image inner-product in parallel.
*
* This implementation uses shared memory for the reduce, so it is limited by the product of stacklen x kern_len
*
* template stack_loop: if true, we accept that blockDim.x < nstack and we add a loop for this (uses 3 more registers, so lower occupancy when true, but accepts nstack*kern_len>512)