Commit 6d4633be, authored by Frédéric Bastien

Merge pull request #3737 from lamblin/gpuarray_abstractconv

Gpuarray abstractconv
......@@ -2406,14 +2406,14 @@ if True:
@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs])
def local_abstractconv_cudnn(node):
inp1 = node.inputs[0]
inp2 = node.inputs[1]
if (not isinstance(node.op, (AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs))):
return None
inp1 = node.inputs[0]
inp2 = node.inputs[1]
if (not isinstance(inp1.type, CudaNdarrayType) or
not isinstance(inp2.type, CudaNdarrayType)):
return None
......
......@@ -237,124 +237,3 @@ class TestConv2d(unittest.TestCase):
verify_grad=True, mode=mode, device='gpu',
provide_shape=provide_shape, border_mode=b,
filter_flip=flip)
def test_cormm_conv(self):
    """Exercise forward, grad-weight and grad-input convolutions on the CPU.

    Iterates over the full cartesian product of the test fixtures
    (input/filter shape pairs, subsampling, border modes, filter
    flipping, and whether explicit shapes are provided) and runs the
    three convolution directions with gradient verification enabled.

    NOTE(review): the method skips when cuDNN is unavailable even though
    every run below uses device='cpu' — presumably the skip guard was
    copied from a GPU test; confirm whether it is really needed here.
    """
    if not dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    mode = mode_without_gpu
    for (i, f), s, b, flip, provide_shape in itertools.product(
            zip(self.inputs_shapes, self.filters_shapes),
            self.subsamples,
            self.border_modes,
            self.filter_flip,
            [False, True]):
        # Expected output shape, needed by the two gradient runs below.
        o = self.get_output_shape(i, f, s, b)
        self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
                     verify_grad=True, mode=mode, device='cpu',
                     provide_shape=provide_shape, border_mode=b,
                     filter_flip=flip)
        self.run_gradweight(inputs_shape=i, filters_shape=f,
                            output_shape=o, subsample=s,
                            verify_grad=True, mode=mode, device='cpu',
                            provide_shape=provide_shape, border_mode=b,
                            filter_flip=flip)
        self.run_gradinput(inputs_shape=i, filters_shape=f,
                           output_shape=o, subsample=s,
                           verify_grad=True, mode=mode, device='cpu',
                           provide_shape=provide_shape, border_mode=b,
                           filter_flip=flip)
def test_cpu_conv(self):
    """Check which parameter combinations the plain CPU conv supports.

    For every combination of the test fixtures, decide (via the *_OK
    flags) whether the legacy CPU implementation — with the 'conv_gemm'
    optimization excluded from the mode — is expected to support the
    case.  Supported cases must run; unsupported ones must raise
    NotImplementedError.

    NOTE(review): the skip guard requires cuDNN although all runs use
    device='cpu'; confirm whether it is intentional.
    """
    if not dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    # Exclude the gemm-based optimizer so the legacy CPU conv is tested.
    mode = mode_without_gpu.excluding('conv_gemm')
    for (i, f), s, b, flip, provide_shape in itertools.product(
            zip(self.inputs_shapes, self.filters_shapes),
            self.subsamples,
            self.border_modes,
            self.filter_flip,
            [False, True]):
        o = self.get_output_shape(i, f, s, b)
        # Start by assuming the case is supported, then rule out the
        # combinations the legacy CPU implementation cannot handle.
        fwd_OK = True
        gradweight_OK = True
        gradinput_OK = True
        if not flip:
            # Cross-correlation (no filter flip) is unsupported in all
            # three directions.
            fwd_OK = False
            gradweight_OK = False
            gradinput_OK = False
        if b not in ('valid', 'full'):
            # Only the two classic border modes are implemented.
            fwd_OK = False
            gradweight_OK = False
            gradinput_OK = False
        if (not provide_shape) and (s != (1, 1)) and (b == 'full'):
            # Gradients need explicit shapes when subsampling with
            # 'full' border mode.
            gradweight_OK = False
            gradinput_OK = False
        if ((s[0] not in (1, 2)) or (s[1] not in (1, 2))) and (b == 'full'):
            # 'full' mode gradients only support subsampling of 1 or 2.
            gradweight_OK = False
            gradinput_OK = False
        if fwd_OK:
            self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
                         verify_grad=True, mode=mode, device='cpu',
                         provide_shape=provide_shape, border_mode=b,
                         filter_flip=flip)
        else:
            self.assertRaises(NotImplementedError,
                              self.run_fwd,
                              inputs_shape=i,
                              filters_shape=f,
                              subsample=s,
                              verify_grad=False,
                              mode=mode,
                              device='cpu',
                              provide_shape=provide_shape,
                              border_mode=b,
                              filter_flip=flip)
        if gradweight_OK:
            # NOTE(review): verify_grad=False even on the supported
            # path — possibly to keep runtime down; confirm.
            self.run_gradweight(inputs_shape=i, filters_shape=f,
                                output_shape=o, subsample=s,
                                verify_grad=False, mode=mode, device='cpu',
                                provide_shape=provide_shape, border_mode=b,
                                filter_flip=flip)
        else:
            self.assertRaises(NotImplementedError,
                              self.run_gradweight,
                              inputs_shape=i,
                              filters_shape=f,
                              output_shape=o,
                              subsample=s,
                              verify_grad=False,
                              mode=mode,
                              device='cpu',
                              provide_shape=provide_shape,
                              border_mode=b,
                              filter_flip=flip)
        if gradinput_OK:
            self.run_gradinput(inputs_shape=i, filters_shape=f,
                               output_shape=o, subsample=s,
                               verify_grad=False, mode=mode, device='cpu',
                               provide_shape=provide_shape, border_mode=b,
                               filter_flip=flip)
        else:
            self.assertRaises(NotImplementedError,
                              self.run_gradinput,
                              inputs_shape=i,
                              filters_shape=f,
                              output_shape=o,
                              subsample=s,
                              verify_grad=False,
                              mode=mode,
                              device='cpu',
                              provide_shape=provide_shape,
                              border_mode=b,
                              filter_flip=flip)
// REMEMBER TO INCREASE c_code_cache_version when changing this file
//
//TODO detect SHARED_SIZE dynamically
// Per-block shared-memory budget, in bytes, assumed by the kernel
// selection logic below (16 KiB).
#define SHARED_SIZE (16*1024)
// Border-mode selector passed as 'mode' to PyGpuArray_Conv.
// NOTE(review): semantics inferred from the names ('full' vs 'valid'
// convolution); confirm against the implementation.
enum { ConvMode_FULL, ConvMode_VALID };
// Convolve img by kern into out, subsampling rows/cols as requested.
PyObject * PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern,
                           PyGpuArrayObject * out, const int mode,
                           const size_t subsample_rows,
                           const size_t subsample_cols,
                           const int version, const int verbose);
/*
 * version: -1, autodetect, >=0 a specific version to use.
 * If it can't be executed, we revert to the reference implementation
 */
int
PyGpuArray_conv_valid(const PyGpuArrayObject *img,
const PyGpuArrayObject * kern,
PyGpuArrayObject * out, size_t subsample_rows,
size_t subsample_cols,
int version = -1, int verbose=0,
int max_threads_dim0 = 512)
{
int work_complete = 0;
const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file.
if (PyGpuArray_NDIM(img) != 4)
{
PyErr_SetString(PyExc_ValueError, "required img of 4D");
return -1;
}
if (PyGpuArray_NDIM(kern) != 4)
{
PyErr_SetString(PyExc_ValueError, "required kern of 4D");
return -1;
}
if (PyGpuArray_NDIM(out) != 4)
{
PyErr_SetString(PyExc_ValueError, "required out of 4D");
return -1;
}
if (verbose>1)
{
fprintf(stderr,
"INFO: Running conv_valid version=%d,"
" MACRO kern_width=%d with inputs:\n",
version, THEANO_KERN_WID);
fprintf(stderr,
"INFO: img dim: %llu %llu %llu %llu "
"img stride: %lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(img)[0],
(unsigned long long)PyGpuArray_DIMS(img)[1],
(unsigned long long)PyGpuArray_DIMS(img)[2],
(unsigned long long)PyGpuArray_DIMS(img)[3],
(long long)PyGpuArray_STRIDES(img)[0]/4,
(long long)PyGpuArray_STRIDES(img)[1]/4,
(long long)PyGpuArray_STRIDES(img)[2]/4,
(long long)PyGpuArray_STRIDES(img)[3]/4);
fprintf(stderr,
"INFO: kern dim: %llu %llu %llu %llu "
"kern stride: %lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(kern)[0],
(unsigned long long)PyGpuArray_DIMS(kern)[1],
(unsigned long long)PyGpuArray_DIMS(kern)[2],
(unsigned long long)PyGpuArray_DIMS(kern)[3],
(long long)PyGpuArray_STRIDES(kern)[0]/4,
(long long)PyGpuArray_STRIDES(kern)[1]/4,
(long long)PyGpuArray_STRIDES(kern)[2]/4,
(long long)PyGpuArray_STRIDES(kern)[3]/4);
fprintf(stderr,
"INFO: out dim: %llu %llu %llu %llu "
"out stride: %lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(out)[0],
(unsigned long long)PyGpuArray_DIMS(out)[1],
(unsigned long long)PyGpuArray_DIMS(out)[2],
(unsigned long long)PyGpuArray_DIMS(out)[3],
(long long)PyGpuArray_STRIDES(out)[0]/4,
(long long)PyGpuArray_STRIDES(out)[1]/4,
(long long)PyGpuArray_STRIDES(out)[2]/4,
(long long)PyGpuArray_STRIDES(out)[3]/4);
fprintf(stderr,
"INFO: subsample_rows=%llu, subsample_cols=%llu\n",
(unsigned long long)subsample_rows,
(unsigned long long)subsample_cols);
}
//Check the output size is valid
if (!(PyGpuArray_DIMS(out)[2] == ceil_intdiv(PyGpuArray_DIMS(img)[2]- PyGpuArray_DIMS(kern)[2] + 1, subsample_rows) ||
PyGpuArray_DIMS(out)[3] == ceil_intdiv(PyGpuArray_DIMS(img)[3]- PyGpuArray_DIMS(kern)[3] + 1, subsample_cols) ||
PyGpuArray_DIMS(out)[0] == PyGpuArray_DIMS(img)[0] ||
PyGpuArray_DIMS(out)[1] == PyGpuArray_DIMS(kern)[0] ||
PyGpuArray_DIMS(img)[1] == PyGpuArray_DIMS(kern)[1])) {
PyErr_SetString(PyExc_ValueError, "GpuConv: sizes don't match");
return -1;
}
// we now search through a few implementations until one applies to our arguments.
//TODO: make separate version as if all fill this is slower.
//TODO: Make a switch with power of 2 max size as template
//TODO: make a parameter the number of division
//TODO: Should we make them in separate grid block instead?
const int stack_len = PyGpuArray_DIMS(img)[1];
const int nstack=PyGpuArray_DIMS(kern)[1];
const int nbatch=PyGpuArray_DIMS(img)[0];
const int nkern=PyGpuArray_DIMS(kern)[0];
const int img_wid=PyGpuArray_DIMS(img)[3];
const int img_len=PyGpuArray_DIMS(img)[2];
const int kern_wid=PyGpuArray_DIMS(kern)[3];
const int kern_len=PyGpuArray_DIMS(kern)[2];
const int out_wid=PyGpuArray_DIMS(out)[3];
const int out_len=PyGpuArray_DIMS(out)[2];
const int img_stride_col= PyGpuArray_STRIDES(img)[3]/4;
const int img_stride_row=PyGpuArray_STRIDES(img)[2]/4;
const int img_stride_stack= PyGpuArray_STRIDES(img)[1]/4;
const int img_stride_batch=PyGpuArray_STRIDES(img)[0]/4;
const int kern_stride_col= PyGpuArray_STRIDES(kern)[3]/4;
const int kern_stride_row=PyGpuArray_STRIDES(kern)[2]/4;
const int kern_stride_stack= PyGpuArray_STRIDES(kern)[1]/4;
const int kern_stride_nkern=PyGpuArray_STRIDES(kern)[0]/4;
const int out_stride_col = PyGpuArray_STRIDES(out)[3]/4;
const int out_stride_row = PyGpuArray_STRIDES(out)[2]/4;
const int out_stride_nkern = PyGpuArray_STRIDES(out)[1]/4;
const int out_stride_batch = PyGpuArray_STRIDES(out)[0]/4;
const int img_size=img_len*img_wid;
const int kern_size=kern_len*kern_wid;
const int out_size=out_len*out_wid;
const int img_size_byte = img_size*sizeof(float);
const int kern_size_byte = kern_size*sizeof(float);
const int out_size_byte = out_size*sizeof(float);
if (!((THEANO_KERN_WID == PyGpuArray_DIMS(kern)[3]) || (THEANO_KERN_WID==0))){
PyErr_Format(PyExc_ValueError, "ERROR: This GpuConv code was compiled for"
" %d kernel columns, but the kernel we received had %llu columns!",
THEANO_KERN_WID, (unsigned long long)PyGpuArray_DIMS(kern)[3]);
return -1;
}
bool subsample = subsample_rows!=1 || subsample_cols!=1;
bool img_contiguous = img->ga.flags & GA_C_CONTIGUOUS;
bool kern_contiguous = kern->ga.flags & GA_C_CONTIGUOUS;
bool out_contiguous = out->ga.flags & GA_C_CONTIGUOUS;
bool c_contiguous = img_contiguous && kern_contiguous && out_contiguous;
bool img_contiguous_2d = (img_stride_col == 1) && (img_stride_row==img_wid);
bool kern_contiguous_2d = (kern_stride_col == 1) && (kern_stride_row==kern_wid);
//if the lower 2 dims are c_contiguous but flipped, unflipping the
// stride and not flipping the kernel in shared memroy
//allow to use a version that use less registers(so is faster)
//the unflipped version of variable have the original value when
//we don't need to unflip it, but have the new value when we unflip it.
bool kern_flipped=true;
bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
if(kern_stride_col==-1 && kern_stride_row==-kern_wid){
//the last two dimensions are c_contiguous but flipped!
kern_flipped=false;
kern_contiguous_2d_unflipped = true;
}
//if we remove the restriction
//img_size_byte+kern_size_byte>8*1024, we can enter in condition where
//we will lower the occupency due to shared memory and/or registers.
if ((version == -1) &&
(out_size<64 || img_size_byte+kern_size_byte>8*1024) &&
out_size<=256){
//condition for exec
if(!subsample &&
out_contiguous &&
out_size<=max_threads_dim0 &&//Maximum of X threads by block
std::max(int(img_size_byte+2*kern_wid*sizeof(float)), out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
!work_complete)
version = 7; //conv_patch_stack_reduce, switch to version 8/13 automatically if needed.
}
if (!subsample && c_contiguous &&
(version==0||version==2||version==-1) &&
out_wid<=max_threads_dim0 &&//Maximum of X threads for block.x
nstack == 1 &&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_patch
{
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if(version==2 && out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>max_threads_dim0)
nb_split++;
size_t threads_per_block[3] = {(size_t)out_wid,
ceil_intdiv((size_t)out_len,(size_t)nb_split),
(size_t)1};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
size_t shmem_sz = (img_size + kern_size)*sizeof(float);
GpuKernel *k = NULL;
if(threads_per_block[1]==out_len) k=&conv_patch_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
else k=&conv_patch_3_node_<<<<HASH_PLACEHOLDER>>>>_0;
void *kernel_params[] = {(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose)
fprintf(stderr,
"INFO: used 'conv_patch' version %s nb_split=%d\n",
threads_per_block[1]==out_len ? "no split": "split", nb_split);
work_complete = true;
}
else
{
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_patch' failed (%s),"
" trying next implementation\n",
GpuKernel_error(k, err));
}
}
if (out_contiguous &&
(version==1||version==3||version==11||version==12||version==-1) &&
(version!=1 || out_size<=max_threads_dim0) &&//Maximum of X threads by block.x
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
img_size_byte+kern_wid*sizeof(float)<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_patch_stack
{
//version 1 is without split and preload the full kernel
//version 3 is with split and preload the full kernel
//version 11 is without split and load only 1 kernel row at a time.
//version 12 is with split and load only 1 kernel row at a time.
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if((version==3||version==12) && out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>max_threads_dim0) nb_split++;
size_t threads_per_block[3] = {(size_t)out_wid,
(size_t)ceil_intdiv(out_len,nb_split),
(size_t)1};
bool preload_full_kernel = (img_size_byte + kern_size_byte) <shared_avail;
if(version==11 || version==12) preload_full_kernel=false;
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
size_t shmem_sz = (img_size + (preload_full_kernel?kern_size:kern_wid))*sizeof(float);
GpuKernel *k = NULL;
if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_64_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_65_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_66_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_67_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_68_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_69_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_70_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_71_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_72_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_73_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_74_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_75_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_76_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_77_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_78_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_79_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_80_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_81_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_82_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_83_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_84_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_85_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ k=&conv_patch_stack_86_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ k=&conv_patch_stack_87_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_88_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_89_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_90_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_91_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_92_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_93_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ k=&conv_patch_stack_94_node_<<<<HASH_PLACEHOLDER>>>>_0;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ k=&conv_patch_stack_95_node_<<<<HASH_PLACEHOLDER>>>>_0;}
void *kernel_params[] = {(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&out_len, (void *)&out_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern,
(void *)&subsample_rows, (void *)&subsample_cols};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose)
fprintf(stderr,
"INFO: used 'conv_patch_stack' version with nb_split=%i"
" and preload_full_kernel=%i,"
" subsample_rows=%llu, subsample_cols=%llu\n",
nb_split, preload_full_kernel,
(unsigned long long)subsample_rows,
(unsigned long long)subsample_cols);
work_complete = true;
}
else
{
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_patch_stack' failed (%s),"
" trying next implementation\n",
GpuKernel_error(k, err));
}
}
if (!subsample && out_contiguous &&
(version==4||version==-1) &&
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
nstack == 1 &&// don't implement the stack in the kernel.
kern_len*img_wid*sizeof(float)+kern_size_byte<shared_avail &&//their is only 16k of shared memory
!work_complete) //conv_rows
{
size_t threads_per_block[3] = {(size_t)out_wid, (size_t)1, (size_t)1};
size_t n_blocks[3] = {(size_t)out_len, (size_t)nbatch*nkern, (size_t)1};
size_t shmem_sz = (kern_len*img_wid + kern_size)*sizeof(float);
GpuKernel *k = NULL;
if(!img_contiguous_2d || !kern_contiguous_2d) k=&conv_rows_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
else k=&conv_rows_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
work_complete = true;
if (verbose)
fprintf(stderr, "INFO: used 'conv_rows' version\n");
}
else
{
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_rows' failed (%s),"
" trying next implementation\n",
GpuKernel_error(k, err));
}
}
if (!subsample && out_contiguous &&
(version==5||version==-1) &&
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
img_wid*kern_len*sizeof(float)+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_rows_stack
{
int nb_row=1;
//TODO:if not c_contiguous, lower max_thread as we use 22
//registers by thread and we won't execute 2 block in one MP.
for(int i=2;i<=out_len;i++){
if((i)*out_wid<=max_threads_dim0 && ((kern_len+i)*img_wid + kern_size)*sizeof(float)<shared_avail)
nb_row=i;
}
size_t threads_per_block[3] = {(size_t)out_wid, (size_t)nb_row, (size_t)1};
size_t n_blocks[3] = {(size_t)ceil_intdiv(out_len,nb_row),
(size_t)nbatch*nkern, (size_t)1};
size_t shmem_sz =((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
GpuKernel *k = NULL;
if(!img_contiguous_2d || !kern_contiguous_2d) {
k=&conv_rows_stack_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
} else {
k=&conv_rows_stack_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
}
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
work_complete = true;
if (verbose)
fprintf(stderr, "INFO: used 'conv_rows_stack' version\n");
}
else
{
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_rows_stack' failed (%s),"
" trying next implementation\n",
GpuKernel_error(k, err));
}
}
if (!subsample && out_contiguous &&
(version==9||version==10||version==-1) &&
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
(img_wid+kern_wid)*sizeof(float)<shared_avail && //their is only 16k of shared memory
(version != 9 || (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail) && //version 9 use more memory
!work_complete) //conv_rows_stack2
{
// version 9:we preload the full kernel
// version 10: load only a few row at a time.
int nb_row=1;
int version_back = version;
//TODO:if not c_contiguous, lower max_thread as we use 22 registers by thread and we won't execute 2 block in one MP.
if(version==-1 && (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail)
version = 9;
else if(version==-1)version = 10;
int k_size = kern_size;
if(version==10)
k_size=kern_wid;
for(int i=2;i<=out_len;i++){
if(i*out_wid<=max_threads_dim0 && (i*img_wid + k_size)*sizeof(float)<shared_avail)
nb_row=i;
}
//to test the case when we don't have a thread by output pixel.
if((version_back!=-1)&& nb_row>1) nb_row--;
size_t threads_per_block[3] = {(size_t)out_wid, (size_t)nb_row, (size_t)1};
size_t n_blocks[3] = {(size_t)ceil_intdiv(out_len,nb_row),
(size_t)nbatch*nkern, (size_t)1};
size_t shmem_sz =((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
GpuKernel *k = NULL;
if((!img_contiguous_2d || !kern_contiguous_2d)&&version==9) k=&conv_rows_stack2_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==9) k=&conv_rows_stack2_3_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!img_contiguous_2d || !kern_contiguous_2d) k=&conv_rows_stack2_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
else k=&conv_rows_stack2_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
work_complete = true;
if (verbose)
fprintf(stderr,
"INFO: used 'conv_rows_stack2' version %s with"
" %d row(s).\n",
(version==9?"'load full kernel'":
"'load 1 kern row at a time'"),nb_row);
}
else
{
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_rows_stack2' failed (%s),"
" trying next implementation\n",
GpuKernel_error(k, err));
}
}
//version 8 is the same but we force the split.
// The split is need in case we have too much threads.
// This happen frequently if the kernel length is big.
// Big kernel is frequent in the gradient.
//version 8 need a minimum of kernel length as we force the split.
//version 8 is needed to test more easily this kernel template parameter.
//version 13 load only 1 kernel row at a time.
if (!subsample &&
out_contiguous &&
out_size<=max_threads_dim0 &&//Maximum of X threads by block
(version==7||version==8||version==13||version==-1) &&
(version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split.
//version 13 need a minimal kernel length as big as the split.
(version!=13||kern_len>1) &&
!work_complete) //conv_patch_stack_reduce
{
int nb_split=1;
int full_kern=true;
if(version==8||version==13) nb_split++;//force the split.
if(version==13)full_kern=false;
//check if we can fit the full kernel in the shared memory
if(sizeof(float)*std::max(img_size + kern_size, out_size*2) > shared_avail){
full_kern = false;
}
//thread_z is going to be ceil_intdiv(kern_len, nb_split)
// we need enough splits so that
// a) thread_z fits in the 'z' threadIdx (i.e. is less than 64)
// b) thread_z * out_len * out_wid fits in the thread count
// c) the kernel doesn't need too much shared memory
// constraint (a)
// device 1.3 have a max of 64 thread in z
while(ceil_intdiv(kern_len,nb_split)>64) nb_split++;
// constraint (b)
// (TODO: read the number of threads per block from the device)
while(out_size*ceil_intdiv(kern_len,nb_split)>max_threads_dim0)
nb_split++;
// tentative estimates (prior to contraint c)
size_t thread_z=ceil_intdiv(kern_len,nb_split);
size_t shmem_sz = sizeof(float)*(full_kern
? std::max((size_t)img_size + kern_size, out_size*thread_z)
: std::max((size_t)img_size + thread_z*kern_wid, out_size*thread_z));
// constraint (c)
while ((shmem_sz >= shared_avail) && (nb_split <= kern_len)){
//if we can't fit the kernel in shared memory, we must split it more.
nb_split++;
thread_z=ceil_intdiv(kern_len,nb_split);
shmem_sz = sizeof(float)*(full_kern
? std::max((size_t)img_size + kern_size, out_size*thread_z)
: std::max(img_size + thread_z*kern_wid, out_size*thread_z));
}
if (nb_split <= kern_len)
{
assert(thread_z>0);//should not happen, but in case...
if(!full_kern) assert(thread_z!=kern_len);
size_t threads_per_block[3] = {(size_t)out_wid,
(size_t)out_len,
(size_t)thread_z};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
GpuKernel *k = NULL;
const bool split=thread_z!=kern_len;
const bool ccontig=img_contiguous_2d && kern_contiguous_2d_unflipped;
//printf("kern_flipped=%d, ccontig=%d, split=%d, full_kern=%d\n",kern_flipped,ccontig,split,full_kern);
//We will always be split when we don't load the full kernel
/* if(!kern_flipped && !ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_0_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
/*else*/ if(!kern_flipped && !ccontig && !split && full_kern) k=&conv_patch_stack_reduce_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!kern_flipped && !ccontig && split && !full_kern) k=&conv_patch_stack_reduce_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!kern_flipped && !ccontig && split && full_kern) k=&conv_patch_stack_reduce_3_node_<<<<HASH_PLACEHOLDER>>>>_0;
/*else if(!kern_flipped && ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_4_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
else if(!kern_flipped && ccontig && !split && full_kern) k=&conv_patch_stack_reduce_5_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!kern_flipped && ccontig && split && !full_kern) k=&conv_patch_stack_reduce_6_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!kern_flipped && ccontig && split && full_kern) k=&conv_patch_stack_reduce_7_node_<<<<HASH_PLACEHOLDER>>>>_0;
/*else if(kern_flipped && !ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_8_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
else if(kern_flipped && !ccontig && !split && full_kern) k=&conv_patch_stack_reduce_9_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(kern_flipped && !ccontig && split && !full_kern) k=&conv_patch_stack_reduce_10_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(kern_flipped && !ccontig && split && full_kern) k=&conv_patch_stack_reduce_11_node_<<<<HASH_PLACEHOLDER>>>>_0;
/*else if(kern_flipped && ccontig && !split && !full_kern) k=&conv_patch_stack_reduce_12_node_<<<<HASH_PLACEHOLDER>>>>_0;*/
else if(kern_flipped && ccontig && !split && full_kern) k=&conv_patch_stack_reduce_13_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(kern_flipped && ccontig && split && !full_kern) k=&conv_patch_stack_reduce_14_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(kern_flipped && ccontig && split && full_kern) k=&conv_patch_stack_reduce_15_node_<<<<HASH_PLACEHOLDER>>>>_0;
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col,
(void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose)
fprintf(stderr,
"INFO: used 'conv_patch_stack_reduce' version"
" kern_flipped=%i ccontig=%i nb_split=%d,"
" preload_full_kern=%d\n",
kern_flipped, ccontig, nb_split, full_kern);
work_complete = true;
}
else
{
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_patch_stack_reduce' failed (%s),"
" trying next implementation\n",
GpuKernel_error(k, err));
}
} // else no good nb_splits was found
}
if ((version==6||version==-1) &&
kern_len<=320 &&
!work_complete) //conv_valid_row_reduce
{
size_t outsize = PyGpuArray_SIZE(out);
size_t n_blocks[3] = {std::min(outsize, (size_t)4096),
(size_t)1, (size_t)1};
int block_nstack=nstack;
//Max of 512 threads per blocks.
//On old hardware, we have a max of 356 threads as we have only
//8k registers and the kernel use 23 register
//TODO: check if we have 8k or 16k of register...
while(block_nstack*kern_len>320)block_nstack--;
size_t threads_per_block[3] = {(size_t)block_nstack, (size_t)kern_len, (size_t)1};
size_t n_reduce_buf = block_nstack * kern_len * sizeof(float);
/* initial_reduce_boundary is the greatest power of two less than n_reduce_buf/ sizeof(float)
*
* if n_reduce_buf == sizeof(float), then initial_reduce_boundary == 0.
* */
int initial_reduce_boundary = (1 << (int)(log2((double)(n_reduce_buf/sizeof(float)))));
if (initial_reduce_boundary == (n_reduce_buf / sizeof(float)))
initial_reduce_boundary >>= 1;
if (n_reduce_buf == sizeof(float))
assert (initial_reduce_boundary == 0);
else
{
assert (initial_reduce_boundary * 2 >= n_reduce_buf/sizeof(float));
assert (initial_reduce_boundary < n_reduce_buf/sizeof(float));
}
GpuKernel *k = NULL;
//std::cerr << "initial_reduce_boundary " << initial_reduce_boundary << "\n";
//std::cerr << "kerns " << nstack << " " << kern_len << "\n";
//std::cerr << "n_reduce_buf/sizeof(float) " << n_reduce_buf / sizeof(float) << "\n";
if(block_nstack==nstack)
k=&conv_valid_row_reduce_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
else
k=&conv_valid_row_reduce_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
void *kernel_params[] = {
(void *)&nbatch, (void *)&nkern, (void *)&stack_len,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&out_len, (void *)&out_wid,
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)&img_stride_batch, (void *)&img_stride_stack,
(void *)&img_stride_row, (void *)&img_stride_col,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)&kern_stride_nkern, (void *)&kern_stride_stack,
(void *)&kern_stride_row, (void *)&kern_stride_col,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&out_stride_batch, (void *)&out_stride_nkern,
(void *)&out_stride_row, (void *)&out_stride_col,
(void *)&subsample_rows, (void *)&subsample_cols,
(void *)&initial_reduce_boundary};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, n_reduce_buf, kernel_params);
if (err == GA_NO_ERROR)
{
work_complete = true;
if (verbose)
fprintf(stderr, "INFO: used 'conv_valid_row_reduce' version\n");
}
else
{
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_valid_row_reduce' failed (%s),"
" trying next implementation\n",
GpuKernel_error(k, err));
}
}
if (1 && !work_complete) //conv_reference_valid
{
size_t outsize = PyGpuArray_SIZE(out);
size_t n_blocks[3] = {std::min(outsize, (size_t)4096),
(size_t)1, (size_t)1};
size_t threads_per_block[3] = {std::min(ceil_intdiv(outsize, n_blocks[0]),
(size_t)256),
(size_t)1, (size_t)1};
if (verbose)
fprintf(stderr, "INFO: launching conv_reference_valid\n");
void *kernel_params[] = {
(void *)&nbatch, (void *)&nkern, (void *)&stack_len,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&out_len, (void *)&out_wid,
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)&img_stride_batch, (void *)&img_stride_stack,
(void *)&img_stride_row, (void *)&img_stride_col,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)&kern_stride_nkern, (void *)&kern_stride_stack,
(void *)&kern_stride_row, (void *)&kern_stride_col,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&out_stride_batch, (void *)&out_stride_nkern,
(void *)&out_stride_row, (void *)&out_stride_col,
(void *)&subsample_rows, (void *)&subsample_cols};
int err = GpuKernel_call(&conv_reference_valid_node_<<<<HASH_PLACEHOLDER>>>>_0,
3, threads_per_block, n_blocks, 0, kernel_params);
if (err == GA_NO_ERROR)
{
work_complete = true;
if (verbose)
fprintf(stderr, "INFO: used 'conv_reference_valid' version\n");
}
else
{
if (verbose)
fprintf(stderr, "INFO: 'conv_reference_valid' failed\n");
PyErr_Format(PyExc_RuntimeError,
"ERROR: all implementations failed for"
" PyGpuArray_conv_valid! (%s)",
GpuKernel_error(&conv_reference_valid_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
return -1;
}
}
if (!work_complete)
{
PyErr_Format(PyExc_RuntimeError,
"ERROR: no implementation(s) worked for"
" PyGpuArray_conv_valid!"
" Version asked(%d) (-1 mean use an heuristic)",
version);
return -1;
}
return 0;
}
// "Full"-mode 2d convolution driver.  Tries a sequence of specialized GPU
// kernels, from the most restrictive/fastest candidate down to the generic
// reference kernel, until one launch succeeds (work_complete).
// Returns 0 on success, -1 (with a Python exception set) on failure.
//
//   img:  (nbatch, nstack, img_len, img_wid) float32 input images
//   kern: (nkern, nstack, kern_len, kern_wid) float32 filters
//   out:  (nbatch, nkern, out_len, out_wid) pre-allocated float32 output
//   version: -1 selects an implementation heuristically; >=0 forces one
//            (used by the tests).
//   max_threads_dim0: device limit for blockDim.x.
int
PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
PyGpuArrayObject * out, size_t subsample_rows,
size_t subsample_cols, int version = -1, int verbose=0,
int max_threads_dim0=512)
{
//144 is the biggest static shared size used when compiling this file.
const int shared_avail = SHARED_SIZE - 150;
int work_complete = 0;
if (PyGpuArray_NDIM(img) != 4)
{
PyErr_SetString(PyExc_ValueError, "required img of 4D");
return -1;
}
if (PyGpuArray_NDIM(kern) != 4)
{
PyErr_SetString(PyExc_ValueError, "required kern of 4D");
return -1;
}
if (PyGpuArray_NDIM(out) != 4)
{
PyErr_SetString(PyExc_ValueError, "required out of 4D");
return -1;
}
// check the size of the output matrix
// Full mode: out spatial dims are (img + kern - 1), then subsampled.
assert (PyGpuArray_DIMS(out)[2] == ceil_intdiv(PyGpuArray_DIMS(img)[2] + PyGpuArray_DIMS(kern)[2] - 1, subsample_rows));
assert (PyGpuArray_DIMS(out)[3] == ceil_intdiv(PyGpuArray_DIMS(img)[3] + PyGpuArray_DIMS(kern)[3] - 1, subsample_cols));
assert (PyGpuArray_DIMS(out)[0] == PyGpuArray_DIMS(img)[0]);
assert (PyGpuArray_DIMS(out)[1] == PyGpuArray_DIMS(kern)[0]);
assert (PyGpuArray_DIMS(img)[1] == PyGpuArray_DIMS(kern)[1]);
const int stack_len=PyGpuArray_DIMS(img)[1];
const int nstack=PyGpuArray_DIMS(kern)[1];
const int nbatch=PyGpuArray_DIMS(img)[0];
const int nkern=PyGpuArray_DIMS(kern)[0];
const int img_wid=PyGpuArray_DIMS(img)[3];
const int img_len=PyGpuArray_DIMS(img)[2];
const int kern_wid=PyGpuArray_DIMS(kern)[3];
const int kern_len=PyGpuArray_DIMS(kern)[2];
const int out_wid=PyGpuArray_DIMS(out)[3];
const int out_len=PyGpuArray_DIMS(out)[2];
// Strides below are in bytes; /4 converts them to float32 element counts.
const int img_stride_col= PyGpuArray_STRIDES(img)[3]/4;
const int img_stride_row=PyGpuArray_STRIDES(img)[2]/4;
const int img_stride_stack=PyGpuArray_STRIDES(img)[1]/4;
const int img_stride_batch=PyGpuArray_STRIDES(img)[0]/4;
const int kern_stride_col= PyGpuArray_STRIDES(kern)[3]/4;
const int kern_stride_row=PyGpuArray_STRIDES(kern)[2]/4;
const int kern_stride_stack= PyGpuArray_STRIDES(kern)[1]/4;
const int kern_stride_nkern=PyGpuArray_STRIDES(kern)[0]/4;
const int out_stride_col = PyGpuArray_STRIDES(out)[3]/4;
const int out_stride_row = PyGpuArray_STRIDES(out)[2]/4;
const int out_stride_nkern = PyGpuArray_STRIDES(out)[1]/4;
const int out_stride_batch = PyGpuArray_STRIDES(out)[0]/4;
const int img_size=img_len*img_wid;
const int kern_size=kern_len*kern_wid;
const int out_size=out_len*out_wid;
const int img_size_byte = img_size*sizeof(float);
const int kern_size_byte = kern_size*sizeof(float);
//padded image sizes (image surrounded by a (kern-1)-wide zero border,
//as needed by the "full" convolution kernels)
const int img_wid_padded=img_wid+2*kern_wid-2;
const int img_len_padded=img_len+2*kern_len-2;
const int img_size_padded=img_len_padded * img_wid_padded;
const int img_size_padded_byte = img_size_padded*sizeof(float);
//const int out_size_byte = out_size*sizeof(float); // unused
// The kernels may have been specialized at compile time for a fixed kernel
// width (THEANO_KERN_WID); 0 means the generic build.
if (!((THEANO_KERN_WID == PyGpuArray_DIMS(kern)[3]) ||
(THEANO_KERN_WID == 0))){
PyErr_Format(PyExc_ValueError,
"ERROR: This GpuConv code was compiled for"
" %d kernel columns, but the kernel we received"
" had %llud columns!",
THEANO_KERN_WID, (unsigned long long)PyGpuArray_DIMS(kern)[3]);
return -1;
}
bool subsample = subsample_rows!=1 || subsample_cols!=1;
bool img_contiguous = img->ga.flags & GA_C_CONTIGUOUS;
bool kern_contiguous = kern->ga.flags & GA_C_CONTIGUOUS;
bool out_contiguous = out->ga.flags & GA_C_CONTIGUOUS;
bool c_contiguous = img_contiguous && kern_contiguous && out_contiguous;
bool img_contiguous_2d = (img_stride_col == 1) && (img_stride_row==img_wid);
bool kern_contiguous_2d = (kern_stride_col == 1) && (kern_stride_row==kern_wid);
bool img_batch_stack_contiguous = (img_stride_stack==img_stride_row*img_len) && (img_stride_batch==img_stride_stack*nstack);//don't support stride for nbatch and nstack
//if the lower 2 dims are c_contiguous but flipped, unflipping the
//stride and not flipping the kernel in shared memory
//allows using a version that uses fewer registers (so is faster)
//the unflipped version of a variable has the original value when
//we don't need to unflip it, but has the new value when we unflip it.
bool kern_flipped=true;
bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
if(kern_stride_col==-1 && kern_stride_row==-kern_wid){
//the last two dimensions are c_contiguous but flipped!
kern_flipped=false;
kern_contiguous_2d_unflipped = true;
}
if (verbose>1)
{
printf("INFO: Running conv_full version=%d,"
" MACRO kern_width=%d with inputs:\n", version, THEANO_KERN_WID);
printf("INFO: img dim: %llu %llu %llu %llu "
"img stride: %lld %lld %lld %lld\n",
(unsigned long long)nbatch,
(unsigned long long)stack_len,
(unsigned long long)img_len,
(unsigned long long)img_wid,
(long long)img_stride_batch,
(long long)img_stride_stack,
(long long)img_stride_row,
(long long)img_stride_col);
printf("INFO: kern dim: %llu %llu %llu %llu "
"kern stride: %lld %lld %lld %lld\n",
(unsigned long long)nkern,
(unsigned long long)nstack,
(unsigned long long)kern_len,
(unsigned long long)kern_wid,
(long long)kern_stride_nkern,
(long long)kern_stride_stack,
(long long)kern_stride_row,
(long long)kern_stride_col);
printf("INFO: out dim: %llu %llu %llu %llu "
"out stride: %lld %lld %lld %lld\n",
(unsigned long long)PyGpuArray_DIMS(out)[0],
(unsigned long long)PyGpuArray_DIMS(out)[1],
(unsigned long long)out_len,
(unsigned long long)out_wid,
(long long)out_stride_batch,
(long long)out_stride_nkern,
(long long)out_stride_row,
(long long)out_stride_col);
}
// --- Candidate 1: conv_full_patch_stack_padded (versions 3/4/5) ---
if (!subsample &&
out_contiguous &&
(version==3||version==4||version==5||version==-1) &&
out_wid<=max_threads_dim0 &&//Maximum of X threads by block.x
(kern_len+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_full_patch_stack_padded
{
//version 3 without split
//version 4 with split (more registers)
//version 5 with split (more registers) low mem version(some restriction and still more register)
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if((version==4 || version==5) && out_len>1) nb_split++;//to force the use of split=true when testing.
if(kern_len==1 && version==5){
//version 5 doesn't support kern_len==1 as 1%0 returns -1.
version=-1;
if(verbose)fprintf(stderr, "WARNING:conv full: Asking version 5 with kern_len==1. Combination not supported!\n");
}
if(img_size_padded_byte+kern_size_byte>shared_avail) version=5;
//we pass by ceil_intdiv in case the out_len is not a multiple
//of nb_split, we want nb_split the number of iteration.
//Max of 16k of shared memory
if(version==5)
while ((((kern_len+ceil_intdiv(out_len,nb_split)-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte)>shared_avail) nb_split++;
//327 as we use 25 register
//version 5 will have only 1 block running at a time, so we
//can use 32 registers per threads, but there is some other stuff that
//forces the limit to be lower than 512.
int max_thread = (version!=5?327:450);
while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++;
if(version==-1 && out_size>max_threads_dim0)version=4;
if(version==-1)version=3;
// NOTE(review): the two branches below are dead code -- the line above
// already replaced every remaining version==-1 with 3, so neither
// condition can be true here.  The later "version==3 && nb_split!=1"
// branch compensates by forcing version 4 when a split is needed.
if(version==-1 && nb_split>1) version=4;
else if(version==-1) version=3;
//force version 4 when more than 1 split are needed to always execute.
else if(version==3 && nb_split!=1) version=4;
assert(version!=3 || nb_split==1);
assert(version!=5 || kern_len>1);
assert(version!=-1);
size_t threads_per_block[3] = {(size_t)out_wid,
ceil_intdiv((size_t)out_len,(size_t)nb_split),
(size_t)1};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
size_t shmem_sz=img_size_padded_byte + kern_size_byte;
if(version==5)
shmem_sz=((kern_len+threads_per_block[1]-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte;
GpuKernel *k = NULL;
// NOTE(review): because version is guaranteed to be 3, 4 or 5 here, the
// first three branches always match and the specialized variants below
// (kernels 4/5/6, 8/9/10 and 12/13/14) are unreachable -- confirm whether
// the contiguity/flip tests were meant to come first.
if(version==3) k=&conv_full_patch_stack_padded_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==5) k=&conv_full_patch_stack_padded_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==4) k=&conv_full_patch_stack_padded_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3) k=&conv_full_patch_stack_padded_4_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5) k=&conv_full_patch_stack_padded_5_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4) k=&conv_full_patch_stack_padded_6_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==3 && kern_flipped) k=&conv_full_patch_stack_padded_8_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==5 && kern_flipped)k=&conv_full_patch_stack_padded_9_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(version==4 && kern_flipped)k=&conv_full_patch_stack_padded_10_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3 && kern_flipped) k=&conv_full_patch_stack_padded_12_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5 && kern_flipped) k=&conv_full_patch_stack_padded_13_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4 && kern_flipped) k=&conv_full_patch_stack_padded_14_node_<<<<HASH_PLACEHOLDER>>>>_0;
else assert(false);
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose)
fprintf(stderr,
"INFO: used 'conv_full_patch_stack_padded'"
" nb_split=%d low_mem=%s\n",
nb_split, (version==5?"true":"false"));
work_complete = true;
}
else
{
// Launch failure is not fatal: fall through to the next candidate.
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_full_patch_stack_padded' %s %s"
" failed (%s), trying next implementation\n",
version==3?"no split": "split",
(version==5?"low_mem":"not_low_mem"),
GpuKernel_error(k, err));
}
}
// --- Candidate 2: conv_full_patch (version 0, nstack==1 only) ---
if (!subsample && c_contiguous &&
(version==0||version==-1) &&
out_size<=max_threads_dim0 &&//Maximum of X threads by block
nstack == 1 &&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_full_patch
{
size_t threads_per_block[3] = {(size_t)out_wid, (size_t)out_len, (size_t)1};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
size_t shmem_sz = (img_size + kern_size)*sizeof(float);
//TODO assert c_continious for img, kern and out in the 2 inner dimensions.
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack};
int err = GpuKernel_call(&conv_full_patch_node_<<<<HASH_PLACEHOLDER>>>>_0,
3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose) fprintf(stderr, "INFO: used 'conv_full_patch' version\n");
work_complete = true;
}
else
{
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_full_patch' failed (%s),"
" trying next implementation\n",
GpuKernel_error(&conv_full_patch_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
}
}
// --- Candidate 3: conv_full_load_everything (version 1) ---
// Intentionally disabled with `false &&` -- the kernel fails its tests.
if (false && !subsample && //disabled as test fail for this kernel
(version==1||version==-1) &&
out_size<=max_threads_dim0 &&//Maximum of X threads by block
(nbatch > 20 || version==1) && // we only launch nbatch blocks, so make sure there is enough to be worth it, but if we specify the version, this check should not be done to allow testing.
nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_full_load_everything
{
size_t threads_per_block[3] = {(size_t)out_wid, (size_t)out_len, (size_t)1};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)1, (size_t)1};
size_t shmem_sz = (img_size + kern_size)*nstack*sizeof(float);
//TODO assert c_continious for img, kern and out in the 2 inner dimensions.
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&img_stride_stack, (void *)&img_stride_batch,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(&conv_full_load_everything_node_<<<<HASH_PLACEHOLDER>>>>_0,
3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose) fprintf(stderr, "INFO: used 'conv_full_load_everything' version\n");
work_complete = true;
}
else
{
if (verbose)
fprintf(stderr, "INFO: impl 'conv_full_load_everything'"
" failed (%s), trying next implementation\n",
GpuKernel_error(&conv_full_load_everything_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
}
}
// --- Candidate 4: conv_full_patch_stack (version 2) ---
if (!subsample &&
img_batch_stack_contiguous &&
out_contiguous &&
(version==2||version==-1) &&
out_size<=max_threads_dim0 &&//Maximum of X threads by block
img_size_byte+kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_full_patch_stack
{
size_t threads_per_block[3] = {(size_t)out_wid, (size_t)out_len, (size_t)1};
size_t n_blocks[3] = {(size_t)nbatch, (size_t)nkern, (size_t)1};
size_t shmem_sz = (img_size + kern_size)*sizeof(float);
GpuKernel *k = NULL;
// Four variants specialized on 2d contiguity of img and kern.
if(!img_contiguous_2d && !kern_contiguous_2d) k=&conv_full_patch_stack_0_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(!img_contiguous_2d && kern_contiguous_2d) k=&conv_full_patch_stack_1_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && !kern_contiguous_2d) k=&conv_full_patch_stack_2_node_<<<<HASH_PLACEHOLDER>>>>_0;
else if(img_contiguous_2d && kern_contiguous_2d) k=&conv_full_patch_stack_3_node_<<<<HASH_PLACEHOLDER>>>>_0;
void *kernel_params[] = {
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&nkern, (void *)&nstack,
(void *)&img_stride_col, (void *)&img_stride_row,
(void *)&kern_stride_col, (void *)&kern_stride_row,
(void *)&kern_stride_stack, (void *)&kern_stride_nkern};
int err = GpuKernel_call(k, 3, threads_per_block, n_blocks, shmem_sz, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose)
fprintf(stderr, "INFO: used 'conv_full_patch_stack' version\n");
work_complete = true;
}
else
{
if (verbose)
fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n",
GpuKernel_error(k, err));
}
}
// --- Last resort: generic reference kernel, always applicable ---
if (1 && !work_complete) //conv_reference_full
{
if(verbose>1) fprintf(stderr, "INFO: will start conv_reference_full\n");
size_t outsize = PyGpuArray_SIZE(out);
size_t n_blocks[3] = {std::min(outsize, (size_t)4096),
(size_t)1, (size_t)1};
size_t threads_per_block[3] = {std::min(ceil_intdiv(outsize, n_blocks[0]),
(size_t)256),
(size_t)1, (size_t)1};
void *kernel_params[] = {
(void *)&nbatch, (void *)&nkern, (void *)&stack_len,
(void *)&img_len, (void *)&img_wid,
(void *)&kern_len, (void *)&kern_wid,
(void *)&out_len, (void *)&out_wid,
(void *)img->ga.data, (void *)&img->ga.offset,
(void *)&img_stride_batch, (void *)&img_stride_stack,
(void *)&img_stride_row, (void *)&img_stride_col,
(void *)kern->ga.data, (void *)&kern->ga.offset,
(void *)&kern_stride_nkern, (void *)&kern_stride_stack,
(void *)&kern_stride_row, (void *)&kern_stride_col,
(void *)out->ga.data, (void *)&out->ga.offset,
(void *)&out_stride_batch, (void *)&out_stride_nkern,
(void *)&out_stride_row, (void *)&out_stride_col,
(void *)&subsample_rows, (void *)&subsample_cols};
int err = GpuKernel_call(&conv_reference_full_node_<<<<HASH_PLACEHOLDER>>>>_0,
3, threads_per_block, n_blocks, 0, kernel_params);
if (err == GA_NO_ERROR)
{
if (verbose)
fprintf(stderr, "INFO: used 'conv_reference_full' version"
" ishp(%d, %d) kshp(%d, %d) oshp(%d, %d) nbatch=%d"
" nkern=%d nstack=%d subsample=%d\n",
img_len,img_wid, kern_len, kern_wid,
out_len, out_wid, nbatch, nkern, nstack, subsample);
work_complete = true;
}
else
{
// The reference kernel was the last option: give up with an error.
// NOTE(review): the message still says "CudaNdarray_conv_full" from the
// pre-gpuarray backend; runtime strings are left untouched here.
if (verbose)
fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s),"
" trying next implementation\n",
GpuKernel_error(&conv_reference_full_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
PyErr_Format(PyExc_RuntimeError,
"ERROR: all implementations failed for"
" CudaNdarray_conv_full! (%s)",
GpuKernel_error(&conv_reference_full_node_<<<<HASH_PLACEHOLDER>>>>_0, err));
return -1;
}
}
return 0;
}
// Top-level entry for the GPU convolution.
// Dispatches to PyGpuArray_conv_valid or PyGpuArray_conv_full depending on
// `mode`, allocating (or re-using) the output array.
// Returns a new reference to the output PyGpuArrayObject, or NULL with a
// Python exception set on failure.
PyObject *
PyGpuArray_Conv(PyGpuArrayObject *img, PyGpuArrayObject * kern,
PyGpuArrayObject * out, const int mode,
const size_t subsample_rows, const size_t subsample_cols,
const int version, const int verbose,
const int max_threads_dim0 = 512
)
{
// Re-use the out object if possible. If the out object is not used, then its refcount is not modified.
// If the out object is re-used then it is returned, and its refcount is incremented by 1.
//
if (PyGpuArray_NDIM(img) != 4)
{
PyErr_SetString(PyExc_ValueError, "PyGpuArray 4-D tensor required");
return NULL;
}
if (PyGpuArray_NDIM(kern) != 4)
{
PyErr_SetString(PyExc_ValueError, "PyGpuArray 4-D tensor required");
return NULL;
}
// Expected output shape: (batch, nkern, rows, cols), where the spatial
// dims depend on the convolution mode and are then subsampled.
size_t out_dim[4];
out_dim[0] = PyGpuArray_DIMS(img)[0];
out_dim[1] = PyGpuArray_DIMS(kern)[0];
size_t logical_rows, logical_cols;
if (mode == ConvMode_VALID)
{
logical_rows = PyGpuArray_DIMS(img)[2] - PyGpuArray_DIMS(kern)[2] + 1;
logical_cols = PyGpuArray_DIMS(img)[3] - PyGpuArray_DIMS(kern)[3] + 1;
}
else
{
logical_rows = PyGpuArray_DIMS(img)[2] + PyGpuArray_DIMS(kern)[2] - 1;
logical_cols = PyGpuArray_DIMS(img)[3] + PyGpuArray_DIMS(kern)[3] - 1;
}
out_dim[2] = ceil_intdiv(logical_rows, subsample_rows);
out_dim[3] = ceil_intdiv(logical_cols, subsample_cols);
PyGpuArrayObject * rval = NULL;
// Re-use `out` only when it is C-contiguous and has exactly the expected
// shape; otherwise allocate a fresh zeroed array.
if ( out
&& PyGpuArray_NDIM(out)==4
&& out->ga.flags & GA_C_CONTIGUOUS
&& PyGpuArray_DIMS(out)[0]==out_dim[0]
&& PyGpuArray_DIMS(out)[1]==out_dim[1]
&& PyGpuArray_DIMS(out)[2]==out_dim[2]
&& PyGpuArray_DIMS(out)[3]==out_dim[3])
{
rval = out;
Py_INCREF(rval);
if (verbose)
fprintf(stderr,
"INFO: Conv is reusing the 'out' argument"
" structure.\n");
}
else
{
if (out && verbose)
fprintf(stderr,
"INFO: Conv is ignoring 'out' argument with wrong"
" structure.\n");
else if(verbose)
fprintf(stderr,
"INFO: Conv don't have an 'out' argument"
" structure.\n");
rval = pygpu_zeros(4, out_dim,
img->ga.typecode, GA_C_ORDER,
img->context, Py_None);
//rval might be null (allocation failure); handled below.
}
// The conv_* helpers return non-zero on failure with an exception set.
if ((rval==NULL)
|| ((mode==ConvMode_VALID) && PyGpuArray_conv_valid(img, kern, rval,
subsample_rows,
subsample_cols,
version, verbose,
max_threads_dim0))
|| ((mode==ConvMode_FULL) && PyGpuArray_conv_full(img, kern, rval,
subsample_rows,
subsample_cols,
version, verbose,
max_threads_dim0))
)
{
// if rval is something we just allocated,
// and there was a problem, then we have to free it.
// (if rval is the re-used `out`, this just undoes the INCREF above)
Py_XDECREF(rval);
return NULL;
}
return (PyObject*)rval;
}
import copy
import os
from theano import gof
try:
from pygpu import gpuarray
except ImportError:
pass
from .type import GpuArrayType
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
infer_context_name)
from theano.gof import utils
class GpuConv(GpuKernelBase, gof.Op):
"""
Implement the batched and stacked 2d convolution on the gpu.
Parameters
----------
version
Each version of c_code implements many kernels for the convolution.
By default we try to guess the best one. You can force one version with
this parameter. This parameter is used by the tests.
direction_hint
'forward', 'bprop weights' or 'bprop inputs'. Serves as a hint for graph
optimizers replacing GpuConv by other implementations. If the GpuConv is
inserted automatically, we take its value from ConvOp.
verbose
For value of 1,2 and 3. Print more information during the execution of
the convolution. Mostly used for optimization or debugging.
kshp
The size of the kernel. If provided, can generate faster code. If the
GpuConv op is automatically inserted, we take its value automatically
from the Conv op.
imshp
The size of the image. Not used for code generation but allows to select
an experimental new version in another repo.
max_threads_dim0
The maximum number of threads for the block size dimensions 0
(blockDim.x) used by the GPU function.
nkern
The number of kernels. Not used for this op, but can be used by graph
optimizers to select a more optimal convolution implementation. If the
GpuConv op is inserted automatically, we take its value from the Conv
op.
bsize
The batch size. Not used for this op, but can be used by graph
optimizers to select a more optimal convolution implementation. If the
GpuConv op is inserted automatically, we take its value from the Conv
op.
fft_opt
Deactivate fft_opt optimization at the op level when set to False. Note
that by default fft optimization aren't enabled.
See :ref:`convolution documentation <libdoc_tensor_nnet_conv>` to enable
them.
"""
__props__ = ('border_mode', 'subsample', 'logical_img_hw',
'logical_kern_hw', 'logical_kern_align_top', 'version',
'verbose', 'kshp', 'imshp', 'max_threads_dim0')
@staticmethod
def logical_output_shape_2d(imshp, kshp, mode):
if mode == 'valid':
return imshp[0] - kshp[0] + 1, imshp[1] - kshp[1] + 1
if mode == 'full':
return imshp[0] + kshp[0] - 1, imshp[1] + kshp[1] - 1
raise ValueError(mode)
def __init__(self, border_mode, subsample=(1, 1),
logical_img_hw=None, logical_kern_hw=None,
logical_kern_align_top=True,
version=-1, direction_hint=None,
verbose=0, kshp=None, imshp=None,
max_threads_dim0=None,
nkern=None, bsize=None, fft_opt=True):
self.border_mode = border_mode
self.subsample = subsample
if logical_img_hw is not None:
h, w = logical_img_hw
# TODO: reconsider this... since shapes are not given in
# constructor, maybe a multiplier + offset is a more
# appropriate way of passing this logical grid
logical_img_hw = tuple(logical_img_hw)
self.logical_img_hw = logical_img_hw
if logical_kern_hw is not None:
h, w = logical_kern_hw
# TODO: reconsider this... since shapes are not given in
# constructor, maybe a multiplier + offset is a more
# appropriate way of passing this logical grid
logical_kern_hw = tuple(logical_kern_hw)
self.logical_kern_hw = logical_kern_hw
self.logical_kern_align_top = logical_kern_align_top
self.version = version
self.direction_hint = direction_hint
self.verbose = verbose
self.kshp = kshp
self.imshp = imshp
self.max_threads_dim0 = max_threads_dim0
self.nkern = nkern
self.bsize = bsize
self.fft_opt = fft_opt
def __setstate__(self, d):
self.__dict__.update(d)
if not hasattr(self, "imshp"):
self.imshp = None
if not hasattr(self, "max_threads_dim0"):
self.max_threads_dim0 = None
if not hasattr(self, "direction_hint"):
self.direction_hint = None
if not hasattr(self, "nkern"):
self.nkern = None
if not hasattr(self, "bsize"):
self.bsize = None
if not hasattr(self, "fft_opt"):
self.fft_opt = True
def make_node(self, img, kern):
    """Build the Apply node for the convolution.

    Both inputs must be float32 4D tensors; they are moved to the same
    GPU context and the output is a 4D GPU tensor whose batch/kernel
    broadcast pattern follows the inputs.
    """
    if img.dtype != "float32" or kern.dtype != "float32":
        raise NotImplementedError("GpuConv currently only work"
                                  " with float32 dtype")
    if img.type.ndim != 4:
        raise TypeError('img must be 4D tensor')
    if kern.type.ndim != 4:
        raise TypeError('kern must be 4D tensor')
    ctx_name = infer_context_name(img, kern)
    img = as_gpuarray_variable(img, ctx_name)
    kern = as_gpuarray_variable(kern, ctx_name)
    # Spatial dims are never broadcastable; batch/nkern follow the inputs.
    out_bcast = [img.type.broadcastable[0],
                 kern.type.broadcastable[0]] + [False, False]
    out_var = GpuArrayType(img.dtype, out_bcast, context_name=ctx_name)()
    return gof.Apply(self, [img, kern], [out_var])
def get_params(self, node):
    """Return the GPU context of the node's first input (used as op params)."""
    img_var = node.inputs[0]
    return img_var.type.context
def flops(self, inputs, outputs):
    """Estimate the flop count of this convolution.

    Useful with the hack in profilemode to print the MFlops.
    `inputs`/`outputs` are shape tuples: (images, kerns) and (out,).
    """
    images, kerns = inputs
    out, = outputs
    assert images[1] == kerns[1]
    if self.border_mode == "valid":
        # 2 flops (mul + add) per kernel element, per output pixel,
        # per (batch, nkern, stack) combination.
        per_pixel = 2 * kerns[2] * kerns[3]
        per_image = per_pixel * out[2] * out[3]
        return per_image * images[1] * kerns[0] * images[0]
    # Full mode: every image pixel meets every kernel element.
    return (2 * images[0] * kerns[0] * images[1] *
            kerns[2] * kerns[3] * images[2] * images[3])
def make_thunk(self, node, storage_map, compute_map, no_recycling):
    """Fill in the device thread limit lazily, then build the thunk.

    c_code() refuses to run while max_threads_dim0 is unset, so it is
    resolved here from the input's GPU context before compilation.
    """
    node_ = copy.copy(node)
    assert node.op is node_.op
    op = node_.op
    if op.max_threads_dim0 is None:
        # NOTE(review): copy.copy(node) shares the op (see the assert), so
        # this mutates the original op in place -- presumably intentional.
        op.max_threads_dim0 = node_.inputs[0].type.context.maxlsize
    return super(GpuConv, op).make_thunk(node_, storage_map,
                                         compute_map, no_recycling)
def c_compile_args(self):
    """Compiler flag fixing the kernel width at compile time (0 = generic)."""
    kern_wid = self.kshp[1] if self.kshp is not None else 0
    return ['-DTHEANO_KERN_WID=' + str(kern_wid)]
def c_headers(self):
    """C headers required by the generated code."""
    headers = ['<stdio.h>', '<numpy_compat.h>', '<gpuarray/types.h>']
    return headers
def c_code_cache_version(self):
    """Cache key for the generated C code.

    Bump this tuple whenever any of the support_code files change.
    """
    return (0, 23)
def c_code(self, node, nodename, inp, out_, sub):
    """Emit the C code that calls PyGpuArray_Conv for this apply node.

    NOTE: sub.update(locals()) below means the local variable names
    (border_mode, version, verbose, dx, dy, img, kern, out,
    max_threads_dim0) ARE the %-template keys -- do not rename them.
    """
    if node.inputs[0].type.context.kind != "cuda":
        raise NotImplementedError("GpuConv only works for cuda devices")
    img, kern = inp
    out, = out_
    # Subsampling factors along rows (dx) and columns (dy).
    dx = self.subsample[0]
    dy = self.subsample[1]
    border_mode = self.border_mode
    version = self.version
    verbose = self.verbose
    # Copy before mutating: `sub` is owned by the caller.
    sub = sub.copy()
    max_threads_dim0 = self.max_threads_dim0
    if max_threads_dim0 is None:
        raise NotImplementedError("GpuConv.c_code should not be called "
                                  "directly. It should be called by "
                                  "make_thunk() that add some information "
                                  "related to the selected GPU.")
    sub.update(locals())
    return """
//Mandatory args
const char *mode_str = "%(border_mode)s";
//Optional args
int version = %(version)s;
int verbose = %(verbose)s;
size_t dx = %(dx)s;
size_t dy = %(dy)s;
int mode;
if (strcmp(mode_str, "full") == 0)
{
mode = ConvMode_FULL;
}
else if (strcmp(mode_str, "valid") == 0)
{
mode = ConvMode_VALID;
}
else
{
PyErr_SetString(PyExc_ValueError,
"mode must be one of 'full' or 'valid'");
return 0;
}
// TODO, make out be decref before we alloc out2!
PyGpuArrayObject * out2 = (PyGpuArrayObject *)PyGpuArray_Conv(
%(img)s, %(kern)s,
%(out)s, mode,
dx, dy,
version, verbose,
%(max_threads_dim0)s);
Py_XDECREF(%(out)s);
%(out)s = out2;
if (%(out)s==NULL){
%(fail)s
}
""" % sub
def c_support_code_apply(self, node, name):
    """Build the per-apply C support code for the conv kernels.

    Concatenates the CUDA kernel sources, compiles them once with pygpu to
    obtain the kernel binary, and returns C declarations embedding both the
    escaped source (`conv_code`) and the binary blob (`conv_bcode`).
    """
    nb = 0
    if self.kshp is not None:
        nb = self.kshp[1]
    kernels = self.gpu_kernels(node, name)
    k = kernels[0]
    code = """
#define THEANO_KERN_WID %(nb)d
""" % locals()
    # Read the CUDA sources next to this module.  Use context managers so
    # the handles are closed deterministically (the previous
    # open(...).read() comprehension leaked them).
    src_dir = os.path.split(__file__)[0]
    parts = []
    for fname in ["conv_kernel.cu", "conv_full_kernel.cu"]:
        with open(os.path.join(src_dir, fname)) as f:
            parts.append(f.read())
    code += "\n".join(parts)
    # Compile once to extract the binary for this device/kernel signature.
    gk = gpuarray.GpuKernel(code, k.name, k.params, **k.flags)
    bin = gk._binary
    bcode = ','.join(hex(ord(c)) for c in bin)
    # Escape the source so it can be embedded as a C string literal.
    code = code.replace('\\', '\\\\')
    code = code.replace('"', '\\"')
    code = code.replace('\n', '\\n')
    mod = """
static const char conv_bcode[] = {%(bcode)s};
static const char *conv_code = "%(code)s";
""" % locals()
    return mod
def c_support_code_struct(self, node, name):
    """Append the conv driver source (conv.cu) to the base struct code."""
    base = GpuKernelBase.c_support_code_struct(self, node, name)
    conv_path = os.path.join(os.path.split(__file__)[0], "conv.cu")
    with open(conv_path) as f:
        return base + f.read()
@utils.memoize
def gpu_kernels(self, node, name):
    """Collect every Kernel used by the conv implementations.

    Kernel flags are derived from the input/output dtypes; the order of
    the builders below matters, as c_support_code_apply compiles the
    first kernel of the list.
    """
    dtypes = ([i.dtype for i in node.inputs] +
              [o.dtype for o in node.outputs])
    flags = Kernel.get_flags(*dtypes)
    builders = (self.conv_patch_kernels,
                self.conv_patch_stack_kernels,
                self.conv_patch_stack_reduce_kernels,
                self.conv_rows_kernels,
                self.conv_rows_stack_kernels,
                self.conv_rows_stack2_kernels,
                self.conv_valid_row_reduce_kernels,
                self.conv_reference_valid_kernels,
                self.conv_reference_full_kernels,
                self.conv_full_patch_kernels,
                self.conv_full_patch_stack_kernels,
                self.conv_full_patch_stack_padded_kernels,
                self.conv_full_load_everything_kernels)
    kernels = []
    for build in builders:
        kernels.extend(build(name, flags))
    return kernels
def conv_patch_kernels(self, name, flags):
    """Kernel descriptors for the conv_patch_{2,3} variants."""
    arg_types = [
        gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc'
    ]
    descriptors = []
    for variant in [2, 3]:
        kern_name = "conv_patch_%d" % variant
        descriptors.append(
            Kernel(None, arg_types, kern_name, flags,
                   'conv_code', 'conv_bcode', kern_name + "_" + name))
    return descriptors
def conv_patch_stack_kernels(self, name, flags):
    """Kernel descriptors for the conv_patch_stack_{64..95} variants."""
    arg_types = [
        gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc'
    ]
    descriptors = []
    for variant in range(64, 96):
        kern_name = "conv_patch_stack_%d" % variant
        descriptors.append(
            Kernel(None, arg_types, kern_name, flags,
                   'conv_code', 'conv_bcode', kern_name + "_" + name))
    return descriptors
def conv_patch_stack_reduce_kernels(self, name, flags):
    """Kernel descriptors for the conv_patch_stack_reduce_* variants."""
    arg_types = [
        gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc'
    ]
    descriptors = []
    for variant in [1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15]:
        kern_name = "conv_patch_stack_reduce_%d" % variant
        descriptors.append(
            Kernel(None, arg_types, kern_name, flags,
                   'conv_code', 'conv_bcode', kern_name + "_" + name))
    return descriptors
def conv_rows_kernels(self, name, flags):
    """Kernel descriptors for the conv_rows_{0,1} variants."""
    arg_types = [
        gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc'
    ]
    descriptors = []
    for variant in [0, 1]:
        kern_name = "conv_rows_%d" % variant
        descriptors.append(
            Kernel(None, arg_types, kern_name, flags,
                   'conv_code', 'conv_bcode', kern_name + "_" + name))
    return descriptors
def conv_rows_stack_kernels(self, name, flags):
    """Kernel descriptors for the conv_rows_stack_{0,1} variants."""
    arg_types = [
        gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc'
    ]
    descriptors = []
    for variant in [0, 1]:
        kern_name = "conv_rows_stack_%d" % variant
        descriptors.append(
            Kernel(None, arg_types, kern_name, flags,
                   'conv_code', 'conv_bcode', kern_name + "_" + name))
    return descriptors
def conv_rows_stack2_kernels(self, name, flags):
    """Kernel descriptors for the conv_rows_stack2_{0..3} variants."""
    arg_types = [
        gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc'
    ]
    descriptors = []
    for variant in [0, 1, 2, 3]:
        kern_name = "conv_rows_stack2_%d" % variant
        descriptors.append(
            Kernel(None, arg_types, kern_name, flags,
                   'conv_code', 'conv_bcode', kern_name + "_" + name))
    return descriptors
def conv_valid_row_reduce_kernels(self, name, flags):
    """Kernel descriptors for the conv_valid_row_reduce_{0,1} variants."""
    arg_types = [
        'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc', 'intc', 'intc',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc',
        'intc', 'intc', 'intc'
    ]
    descriptors = []
    for variant in [0, 1]:
        kern_name = "conv_valid_row_reduce_%d" % variant
        descriptors.append(
            Kernel(None, arg_types, kern_name, flags,
                   'conv_code', 'conv_bcode', kern_name + "_" + name))
    return descriptors
def conv_reference_valid_kernels(self, name, flags):
    """Kernel descriptor for the reference (naive) valid convolution."""
    arg_types = [
        'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc', 'intc', 'intc',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc',
        'intc', 'intc'
    ]
    kern_name = "conv_reference_valid"
    return [Kernel(None, arg_types, kern_name, flags,
                   'conv_code', 'conv_bcode', kern_name + "_" + name)]
def conv_reference_full_kernels(self, name, flags):
    """Kernel descriptor for the reference (naive) full convolution."""
    arg_types = [
        'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc', 'intc', 'intc',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc',
        'intc', 'intc'
    ]
    kern_name = "conv_reference_full"
    return [Kernel(None, arg_types, kern_name, flags,
                   'conv_code', 'conv_bcode', kern_name + "_" + name)]
def conv_full_patch_kernels(self, name, flags):
    """Kernel descriptor for the conv_full_patch kernel."""
    arg_types = [
        gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc'
    ]
    kern_name = "conv_full_patch"
    return [Kernel(None, arg_types, kern_name, flags,
                   'conv_code', 'conv_bcode', kern_name + "_" + name)]
def conv_full_patch_stack_kernels(self, name, flags):
    """Kernel descriptors for the conv_full_patch_stack_{0..3} variants."""
    arg_types = [
        gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc'
    ]
    descriptors = []
    for variant in [0, 1, 2, 3]:
        kern_name = "conv_full_patch_stack_%d" % variant
        descriptors.append(
            Kernel(None, arg_types, kern_name, flags,
                   'conv_code', 'conv_bcode', kern_name + "_" + name))
    return descriptors
def conv_full_patch_stack_padded_kernels(self, name, flags):
    """Kernel descriptors for the conv_full_patch_stack_padded_* variants."""
    arg_types = [
        gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc', 'intc', 'intc',
        'intc', 'intc', 'intc', 'intc'
    ]
    descriptors = []
    for variant in [0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14]:
        kern_name = "conv_full_patch_stack_padded_%d" % variant
        descriptors.append(
            Kernel(None, arg_types, kern_name, flags,
                   'conv_code', 'conv_bcode', kern_name + "_" + name))
    return descriptors
def conv_full_load_everything_kernels(self, name, flags):
    """Kernel descriptor for the conv_full_load_everything kernel."""
    arg_types = [
        gpuarray.GpuArray, 'uintp', gpuarray.GpuArray, 'uintp',
        gpuarray.GpuArray, 'uintp',
        'intc', 'intc', 'intc', 'intc', 'intc', 'intc',
        'intc', 'intc', 'intc', 'intc',
        'intc', 'intc', 'intc', 'intc'
    ]
    kern_name = "conv_full_load_everything"
    return [Kernel(None, arg_types, kern_name, flags,
                   'conv_code', 'conv_bcode', kern_name + "_" + name)]
// Dynamically-sized shared-memory buffer; its byte size is supplied at
// kernel launch time and is carved up by each kernel below.
extern __shared__ float s_data[];
//we store the full image and the full kernel in the shared memory
//each thread computes only one value of the output per loop iteration
//thread block size=out_wid, out_len/nb_split
//grid block size=batch_id
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
extern "C" __global__ void
conv_full_patch_split(const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid, int nb_split)
{
int __shared__ out_len, out_wid, nb_thread_id;
// Apply the byte offsets that locate the arrays inside their buffers.
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
// "full" convolution output size: input size + kernel size - 1.
out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
int batch_id = blockIdx.x;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
int out_col = tx;//output col
int out_row = ty;//output row
const int thread_id = out_row*out_wid + out_col;
// Partition the dynamic shared memory: image first, kernel after it.
float * d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID];
float * d_kern=&s_data[img_len * img_wid];//size of [KERNEL_LEN * KERNEL_WID];
img+=img_len*img_wid*batch_id;//the good batch
load_to_shared(d_img, img, thread_id, nb_thread_id, img_len*img_wid);
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_len*kern_wid);
__syncthreads();
// NOTE(review): this loop variable shadows the `out_row` declared above.
// The stride out_len/nb_split is integer division; presumably blockDim.y
// matches it so every row is covered — confirm against the launch code.
for(int out_row=ty;out_row<out_len;out_row+=out_len/nb_split){
float sum = 0.0f;
int img_row = out_row;
for (int row=0; row < kern_len; row++) {//loop over row
// Walk the image rows in reverse relative to the kernel rows: this
// implements the flip of a true (full) convolution.
int inverse_row = (img_row-row);
if(inverse_row<0 ||inverse_row>=(img_len))continue;//row outside the image
const float* idx_in=&d_img[inverse_row*img_wid];
const float* idx_kern=&d_kern[row*kern_wid];
int img_col = out_col;
int col=0,last=0;
for (col=0,last=img_col; col < kern_wid; col++,last--) {//loop over col
if(last<0 ||last>=(img_wid))continue;//col outside the image
sum+=idx_in[last]*idx_kern[col];
}
}
out[batch_id*out_len*out_wid+//the output image
out_row*out_wid+out_col] = sum;
}
}
//we store the full image and the full kernel in the shared memory
//each thread computes only one value of the output
//thread block size=out_wid, out_len
//grid block size=batch_id, nkern
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
extern "C" __global__ void
conv_full_patch( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack)
{
int __shared__ out_len, out_wid, nb_thread_id;
// Apply the byte offsets that locate the arrays inside their buffers.
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
int batch_id = blockIdx.x;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
int out_col = tx;//output col
int out_row = ty;//output row
const int thread_id = out_row*out_wid + out_col;
float * d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID];
float * d_kern=&s_data[img_len * img_wid];//size of [KERNEL_LEN * KERNEL_WID];
// NOTE(review): only one stack slice is loaded and convolved here;
// presumably the host launches this kernel only when nstack==1 — confirm.
kern+=kern_len*kern_wid*nstack*blockIdx.y;//the good nkern
img+=img_len*img_wid*batch_id;//the good batch
load_to_shared(d_img, img, thread_id, nb_thread_id, img_len*img_wid);
// Last arg true: the kernel is stored flipped into shared memory, so the
// inner loop below is a plain correlation over the flipped copy.
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_len*kern_wid, true);
__syncthreads();
float sum = 0.0f;
for (int row=0; row < kern_len; row++) {//loop over row
if(row+out_row-kern_len+1<0 || row+out_row-kern_len+1>=img_len)continue;
const float* idx_in=&d_img[(row+out_row-kern_len+1)*img_wid+out_col-kern_wid+1];
const float* idx_kern=&d_kern[row*kern_wid];
int col=0;
int max_col=kern_wid;
int img_col=out_col-kern_wid+1;
// Clip the column range so only columns inside the image are read.
max_col=min(max_col,img_wid-img_col);
if(img_col<0){col=-img_col;img_col+=col;}
for (; col < max_col; col++, img_col++) {//loop over col
sum+=idx_in[col]*idx_kern[col];
}
}
out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*blockIdx.y+//the output image
out_row*out_wid+out_col] = sum;
}
//we store the full image and the full kernel in the shared memory
//each thread computes only one value of the output
//thread block size=out_wid, out_len
//grid block size=batch_id, nkern
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid
//template c_contiguous: if true, the img and kern are column and row contiguous, else we use the stride values from the params. The image needs to be c_contiguous in the nbatch and nstack dimensions.
template<bool img_c_contiguous_2d, bool kern_c_contiguous_2d>
__device__ inline void
conv_full_patch_stack( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack,
int img_stride_col, int img_stride_row,
int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_nkern)
{
int __shared__ out_len, out_wid, nb_thread_id;
// Apply the byte offsets that locate the arrays inside their buffers.
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.y*blockDim.x;//blockDim.z*
// NOTE(review): these pointers live in shared memory and every thread
// writes the same values below — benign, but unusual; kept as-is.
const float __shared__ *kern_, *img_;
const int batch_id = blockIdx.x;
const int nkern_id = blockIdx.y;
const int out_col = threadIdx.x;
const int out_row = threadIdx.y;
const int thread_id = threadIdx.y*blockDim.x+ threadIdx.x;
float* d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID];
float* d_kern=&s_data[img_len * img_wid];//size of [KERNEL_LEN * KERNEL_WID];
kern_=kern+kern_stride_nkern*nkern_id;//the good nkern
img_=img+img_len*img_stride_row*(nstack*batch_id);//the good batch
float sum = 0.0f;
// Accumulate over the stack: load one image slice and one kernel slice
// at a time into shared memory, then correlate.
for (int stack = 0;stack<nstack;stack++){
load_to_shared(d_img, img_+stack*img_len*img_stride_row, thread_id,nb_thread_id,img_wid,img_len,img_stride_col, img_stride_row,false,img_c_contiguous_2d);
// flipped=true: kernel stored reversed so the loop below is a correlation.
load_to_shared(d_kern, kern_+stack*kern_stride_stack, thread_id,nb_thread_id,kern_wid,kern_len,kern_stride_col,kern_stride_row,true,kern_c_contiguous_2d);
__syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row
if(row+out_row-kern_len+1<0 || row+out_row-kern_len+1>=img_len)continue;
const float* idx_in=&d_img[(row+out_row-kern_len+1)*img_wid+out_col-kern_wid+1];
const float* idx_kern=&d_kern[row*kern_wid];
int col=0;
int max_col=kern_wid;
int img_col=out_col-kern_wid+1;
// Clip the column range so only columns inside the image are read.
max_col=min(max_col,img_wid-img_col);
if(img_col<0){col=-img_col;img_col+=col;}
for (; col < max_col; col++, img_col++) {//loop over col
sum+=idx_in[col]*idx_kern[col];
}
}
//Needed as not all threads finish the loop at the same time
//And we don't want to overwrite the shared memory.
__syncthreads();
}
out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*blockIdx.y+//the output image
out_row*out_wid+out_col] = sum;
}
extern "C" {
// Instantiate conv_full_patch_stack for the four template combinations.
// The numeric suffix encodes the flags as a 2-bit value:
// bit 1 = img_c_contiguous_2d, bit 0 = kern_c_contiguous_2d.
#define __INSTANTIATE_CONV_FULL_PATCH_STACK(suffix, ...) \
__global__ void \
conv_full_patch_stack_##suffix( \
const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
int img_len, int img_wid, \
int kern_len, int kern_wid, int nkern, int nstack, \
int img_stride_col, int img_stride_row, \
int kern_stride_col, int kern_stride_row, \
int kern_stride_stack, int kern_stride_nkern) \
{ \
conv_full_patch_stack<__VA_ARGS__>( \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img_stride_col, img_stride_row, kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \
}
__INSTANTIATE_CONV_FULL_PATCH_STACK(0, false, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK(1, false, true)
__INSTANTIATE_CONV_FULL_PATCH_STACK(2, true, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK(3, true, true)
#undef __INSTANTIATE_CONV_FULL_PATCH_STACK
}
/**
 * As conv_patch_stack, but used for the full convolution by padding the image in shared memory.
 * I keep it separated from conv_patch as we take 19-20 registers which is more than the 10/16 max for each thread and thus this could lower the occupancy.
 * Implementation of the full convolution that keeps the full image and the full kernel in shared memory
 * each thread computes only one value of the output if split is true. Otherwise it computes ceil((float)out_len/N) pixels.
 * thread block size=out_wid, nb_rows (optimized value is ceil(out_len/N))
 * grid block size=batch_id, nkern
 * dynamic shared memory: full mem: (img_len+2*kern_len-2)*(img_wid+2*kern_wid-2)+kern_len*kern_wid
 * dynamic shared memory: low mem:((kern_len+nb_row-1)+2*kern_len-2)*(img_wid+2*kern_wid-2)+kern_len*kern_wid
 *
 * nkern: the number of kernels, used to compute the output image to store the result
 * nstack: the size of the stack, used to compute the image to load.
 * template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
 * template c_contiguous: if true, the image and kernel are c_contiguous (uses fewer registers)
 * template split: if true, each thread computes more than 1 output pixel.
 * template low_mem: if true, as split but uses less dynamic shared memory and more registers.
 * if you set split and low_mem to true, we will use the low_mem version!
 */
template<bool flipped_kern, bool c_contiguous, bool split, bool low_mem >
__device__ inline void
conv_full_patch_stack_padded( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
const int img_len, const int img_wid,
const int kern_len, const int kern_wid,
const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row,
const int img_stride_stack, const int img_stride_batch,
int kern_stride_col, int kern_stride_row,
const int kern_stride_stack, const int kern_stride_nkern)
{
int __shared__ out_len, out_wid, nb_thread_id;
// Apply the byte offsets that locate the arrays inside their buffers.
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
// Fast path: strides of exactly (-1, -kern_wid) mean the last two
// dimensions are c-contiguous but stored flipped; re-point at the end
// and use plain contiguous strides instead.
if(kern_stride_col==-1 && kern_stride_row==-kern_wid){
//the last two dimensions are c_contiguous but flipped!
kern = &(kern[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
kern_stride_col=1;
kern_stride_row=kern_wid;
}
out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
__shared__ int batch_id, kern_id, img_wid_valid, nb_rows;
batch_id = blockIdx.x;
kern_id = blockIdx.y;
nb_rows = blockDim.y;
// Thread index
const int tx = threadIdx.x;
const int ty = threadIdx.y;
int out_col = tx;//output col
const int thread_id = ty*blockDim.x + tx;
// Shared memory layout: kernel first, then the zero-padded image.
float * d_kern=&s_data[0];//size of [KERNEL_LEN * KERNEL_WID];
float * d_img=&s_data[kern_len*kern_wid];//size of [see fct doc];
kern+=kern_stride_nkern*kern_id;//the good nkern
img+=img_stride_batch*batch_id;//the good batch
// Width of one padded image row in shared memory.
img_wid_valid=img_wid+2*kern_wid-2;
if(!split && !low_mem){
// --- Variant 1: full image in shared memory, one pixel per thread. ---
fill(d_img,img_wid_valid*(img_len+2*kern_len-2), 0, thread_id, nb_thread_id);
const int out_row = ty;//output row
float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++,kern+=kern_stride_stack,
img+=img_stride_stack){
__syncthreads();
load_padded_col_to_shared(d_img+img_wid_valid*(kern_len-1),img,
thread_id,nb_thread_id,img_wid,img_len,
img_stride_col, img_stride_row, kern_wid-1,
c_contiguous);
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
}
}
out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image
out_row*out_wid+out_col] = sum;
}else if(split && !low_mem){
// --- Variant 2: full image in shared memory, several rows per thread. ---
fill(d_img,img_wid_valid*(img_len+2*kern_len-2), 0, thread_id, nb_thread_id);
//out_len_max must be higher than out_len as we need all threads when we load the image, as nb_rows is not always a multiple of out_len.
__shared__ int out_len_max;
//TODO pass a parameter nb_split
out_len_max = (out_len/blockDim.y+(out_len%blockDim.y==0?0:1))*blockDim.y;
for(int out_row = ty;out_row<out_len_max;out_row+=nb_rows){
float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++){
__syncthreads();
//TODO: load only the part of the image needed or put the partial result in shared memory
load_padded_col_to_shared(d_img+img_wid_valid*(kern_len-1),
img+img_stride_stack*stack,
thread_id,nb_thread_id,img_wid,img_len,
img_stride_col, img_stride_row, kern_wid-1,
c_contiguous);
load_to_shared(d_kern, kern+kern_stride_stack*stack,
thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
//The if is needed as on Fermi, reading an out of bound index from shared memory generates an error.
//Not needed on generations before, as they worked anyway. Removing the if generates the good code
//as we store the result of only the good thread.
//This was with nvcc 3.0 on a GTX470 card.
if(out_row<out_len)
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
}
if(out_row<out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*kern_id+//the output image
out_row*out_wid+out_col] = sum;
}
}
}else{//low_mem version
// --- Variant 3: only a band of rows kept in shared memory. ---
//don't need to fill the last rows padding as this is done later.
fill(d_img,img_wid_valid*((kern_len+nb_rows-1)+2*kern_len-2), 0, thread_id, nb_thread_id);
//out_len_max must be higher than out_len as we need all threads when we load the image, as nb_rows is not always a multiple of out_len.
__shared__ int out_len_max;
//TODO pass a parameter nb_split
if(thread_id==0)
out_len_max = (out_len/nb_rows+(out_len%nb_rows==0?0:1))*nb_rows;
__syncthreads();
for(int out_row = ty, out_row_iter=0;out_row<out_len_max;
out_row+=nb_rows, out_row_iter++){
float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++){
__syncthreads();
const int len_to_load=min(kern_len+nb_rows,img_len-out_row_iter*nb_rows);//nb rows to load, min(nb_rows for this iter, nb rows left in the image)
const int empty_row = max(kern_len-1-out_row_iter*nb_rows,0);//number of empty rows at the start
//we need to reload some rows as when we change out_row we lose the last load due to the stack.
const int previous_row = min(out_row_iter*nb_rows,kern_len-1);//number of rows from the last out_row iteration to reload
load_padded_col_to_shared(d_img+(kern_len-1-previous_row)*img_wid_valid,
img+img_stride_stack*stack//the good stack image
+(out_row_iter*nb_rows-previous_row)*img_stride_row,//the good split top row.
thread_id,nb_thread_id,img_wid,
len_to_load+previous_row,
img_stride_col, img_stride_row, kern_wid-1,
c_contiguous);
//TODO: fill the last rows padding only when needed.
//We always fill the last rows padding even when not needed.
int row_to_fill = 2*kern_len-2+nb_rows- empty_row - previous_row - len_to_load;
row_to_fill = min(row_to_fill,kern_len-1);
fill(d_img+(kern_len-1+len_to_load)*img_wid_valid,
img_wid_valid*row_to_fill, 0, thread_id, nb_thread_id);
load_to_shared(d_kern, kern+kern_stride_stack*stack,
thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row-out_row_iter*nb_rows)*img_wid_valid+out_col];
convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
}
}
if(out_row<out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*kern_id+//the output image
out_row*out_wid+out_col] = sum;
}
}
}
extern "C" {
// Instantiate conv_full_patch_stack_padded for the supported template
// combinations.  The numeric suffix encodes the flags as a bit mask:
// 8*flipped_kern + 4*c_contiguous + 2*split + 1*low_mem.
// Suffixes 3, 7, 11 and 15 (split && low_mem) are not generated because
// the template's low_mem path subsumes split.
#define __INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(suffix, ...) \
__global__ void \
conv_full_patch_stack_padded_##suffix( \
const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
const int img_len, const int img_wid, \
const int kern_len, const int kern_wid, \
const int nkern, const int nstack, \
const int img_stride_col, const int img_stride_row, \
const int img_stride_stack, const int img_stride_batch, \
const int kern_stride_col, const int kern_stride_row, \
const int kern_stride_stack, const int kern_stride_nkern) \
{ \
conv_full_patch_stack_padded<__VA_ARGS__>( \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \
}
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(0, false, false, false, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(1, false, false, false, true)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(2, false, false, true, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(4, false, true, false, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(5, false, true, false, true)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(6, false, true, true, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(8, true, false, false, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(9, true, false, false, true)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(10, true, false, true, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(12, true, true, false, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(13, true, true, false, true)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(14, true, true, true, false)
#undef __INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED
}
// Compile-time unrolled strided dot product: sums i terms x[k*sx]*y[k*sy].
// Binary splitting keeps the template recursion depth at O(log i).
template <int i> __device__ float everything_dot(const float * x, const int sx, const float * y, const int sy)
{
// BUG FIX: the second half must advance x by its own stride sx, not by the
// y stride sy (the old code only produced the right answer when sx == sy;
// at the call site sx=img_len*img_wid and sy=kern_len*kern_wid differ).
return everything_dot<i/2>(x, sx, y, sy) + everything_dot<(i+1)/2>(x+sx*(i/2), sx, y+sy*(i/2), sy) ;
//return x[0] * y[0] + everything_dot<i-1>(x+sx, sx, y+sy, sy);
}
// Base cases of the template recursion.
template <> __device__ float everything_dot<0>(const float * x, const int sx, const float * y, const int sy)
{
return 0;
}
template <> __device__ float everything_dot<1>(const float * x, const int sx, const float * y, const int sy)
{
return x[0] * y[0];
}
// Full convolution where one thread block keeps the *entire* image (all
// stack slices) in shared memory and loads one kernel at a time.
// grid = (batch,), thread block covers the output plane.
// dynamic shared memory: nstack*img_len*img_wid + nstack*kern_len*kern_wid
extern "C" __global__ void
conv_full_load_everything( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack,
int img_stride_col, int img_stride_row,
int img_stride_stack, int img_stride_batch,
int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_nkern)
{
int __shared__ out_len, out_wid, nb_thread_id;
// Apply the byte offsets that locate the arrays inside their buffers.
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len + kern_len - 1;
out_wid = img_wid + kern_wid - 1;
nb_thread_id = blockDim.y*blockDim.x;
int batch_id = blockIdx.x;
const int out_col = threadIdx.x;//output col
const int out_row = threadIdx.y;//output row
const int thread_id = out_row*out_wid + out_col;
float * d_img=&s_data[0]; //size [nstack * IMAGE_LEN * IMAGE_WID];
float * d_kern=&s_data[nstack * img_len * img_wid];//size [nstack * KERNEL_LEN * KERNEL_WID];
img += blockIdx.x * img_stride_batch;//the good batch
// load the image to shared memory (once; it is reused for every kernel)
for (int i = thread_id; i < nstack * img_len * img_wid; i += nb_thread_id)
{
int stack = i / (img_wid*img_len);
int row = (i % (img_wid*img_len)) / img_wid;
int col = (i % (img_wid*img_len)) % img_wid;
d_img[i] = img[stack*img_stride_stack +row*img_stride_row +col*img_stride_col];
}
for (int kern_idx = 0; kern_idx < nkern; ++kern_idx, kern += kern_stride_nkern)
{
// load the kernel into shared memory and flip it
for (int i = thread_id; i < nstack * kern_len * kern_wid; i += nb_thread_id)
{
int stack = i / (kern_wid*kern_len);
int row = (i % (kern_wid*kern_len)) / kern_wid;
int col = (i % (kern_wid*kern_len)) % kern_wid;
d_kern[stack*kern_len*kern_wid + (kern_len-1-row)*kern_wid + (kern_wid-1-col)]
= kern[stack*kern_stride_stack +row*kern_stride_row +col*kern_stride_col];
}
__syncthreads();
float sum = 0.0f;
for (int row=0; row < kern_len; ++row)
{
int irow = out_row - kern_len+1+row;
// BUG FIX: the old test was `irow > img_len`, which let irow == img_len
// through and read past the image region of shared memory.
if (irow < 0 || irow >= img_len) continue;
for (int col = 0; col < kern_wid; ++col)
{
int icol = out_col - kern_wid+1+col;
// BUG FIX: same off-by-one for the column bound (was `icol > img_wid`).
if (icol < 0 || icol >= img_wid) continue;
if (THEANO_KERN_WID > 0)
{
// NOTE(review): everything_dot sums THEANO_KERN_WID terms with the
// per-stack strides, so this path presumably relies on the define
// matching nstack — confirm against the host code that sets it.
sum += everything_dot<THEANO_KERN_WID>(d_img + irow*img_wid + icol, img_len*img_wid,
d_kern + row*kern_wid+col, kern_len*kern_wid);
}
else
{
for (int stack = 0; stack < nstack; ++stack)
{
sum += d_img[stack*img_len*img_wid + irow*img_wid + icol] * d_kern[stack*kern_len*kern_wid+row*kern_wid+col];
}
}
}
}
out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*kern_idx+//the output image
out_row*out_wid+out_col] = sum;
__syncthreads(); //don't start loading another kernel until we're done here
}
}
/*
Local Variables:
mode:c++
c-basic-offset:4
c-file-style:"stroustrup"
indent-tabs-mode:nil
fill-column:79
End:
*/
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
// REMEMBER TO INCREASE c_code_cache_version when changing this file
//
//implement the valid convolution only
/*
for (int iter_m=0; iter_m < Os[0]; iter_m++) {
// Reposition index into input image based on requested output size
int pos_m = iter_m*%(self_dx)s;//The position of the patch in the image
int new_m = (pos_m+dim_ker[0]-1);
for (int iter_n=0; iter_n < Os[1]; iter_n++) { // loop over columns
int pos_n=iter_n*%(self_dy)s;
%(type)s sum=0;
// Sum over kernel, if index into image is out of bounds
// fill with the value
for (int j=0; j < dim_ker[0]; j++) {
int inverse_row = (new_m-j);
const %(type)s* idx_in=&in[inverse_row*dim_im[1]]; //JB: should be dim_im[1] right? (was dim_im[0])
const %(type)s* idx_kern=&hvals[j*dim_ker[1]];
int new_n = (pos_n+dim_ker[1]-1);
for (int k=0,last=new_n; k < dim_ker[1]; k++,last--) {
sum+=idx_kern[k]*idx_in[last];
}
}//for j
out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum;
}//for n
}//for m
*/
// Include guard for the conv_kernel.cu helper definitions below.
#ifndef CONV_KERNEL_CU
#define CONV_KERNEL_CU
/*
#define CHECK_BANK_CONFLICTS 0
#if CHECK_BANK_CONFLICTS
#define AS(i, j) cutilBankChecker(((float*)&As[0][0]), (BLOCK_SIZE * i + j))
#define BS(i, j) cutilBankChecker(((float*)&Bs[0][0]), (BLOCK_SIZE * i + j))
#else
#define AS(i, j) As[i][j]
#define BS(i, j) Bs[i][j]
#endif
*/
#define MIN(a, b) ((a) < (b) ? (a) : (b) )
#define MAX(a, b) ((a) < (b) ? (b) : (a) )
//Must be the same size as a ptr. We can't use unsigned long as on Windows 64
//bit, it is 32 bit.
// Mask that rounds a pointer down to a 256-byte boundary, used by
// load_to_shared to make global-memory reads coalesced.
const size_t COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers
// Cooperatively copy N contiguous floats from global memory `src` into
// shared memory `dst`, split round-robin over `nb_thread` threads.  When
// `flipped` is true the data is written in reverse order
// (dst[N-1-i] = src[i]), which pre-flips a kernel for full convolution.
__device__ void load_to_shared(float * dst, const float * src, const int thread_id, int nb_thread, const int N, const bool flipped=false){
// Small blocks: simple strided copy.  The 64-thread threshold is a
// heuristic; presumably below it the coalescing setup isn't worth it.
if (nb_thread < 64)
{
if(flipped)
//TODO very slow on device before 1.3.
// make access to kern sequential and access to d_kern flipped.
for(int i=thread_id;i<N;i+=nb_thread)
dst[i]=src[N - 1 - i];
//dst[N-1-i]=src[i];
else
{
for(int i = thread_id; i < N; i += nb_thread)
{
dst[i] = src[i];
}
}
}
else
{
nb_thread = nb_thread & 0xFFFFFFE0; //make nb_thread a multiple of 32
// Start the copy from the 256-byte boundary at or before src so every
// warp issues aligned, coalesced reads; threads whose pointer falls
// before src simply skip the write on the first iteration.
// Global memory:
// <-------------------------------------->
// A A A A A // points of 256-byte alignment
// dddddddddddddddddddddd // layout of src in global memory
if (thread_id < nb_thread)
{
const float * my_src_ptr = (const float *)(
((size_t)src) & COALESCED_ALIGN);
my_src_ptr += thread_id;
while (my_src_ptr < src + N)
{
if (my_src_ptr >= src)
{
int i = my_src_ptr - src;
if (flipped)
{
dst[N - 1 - i] = *my_src_ptr;
}
else
{
dst[i] = *my_src_ptr;
}
}
my_src_ptr += nb_thread;
}
}
}
}
/*
 * We load from global memory to shared memory, honoring arbitrary row and
 * column strides.  The outer if is optimized away at compilation, so the
 * c_contiguous case falls through to the fast contiguous loader above.
 */
__device__ void load_to_shared(float * dst, const float * src, const int thread_id,
int nb_thread, const int nb_col, const int nb_row,
const int stride_col, const int stride_row,
const bool flipped=false, const bool c_contiguous=true){
if (c_contiguous)
{
load_to_shared(dst, src, thread_id, nb_thread, nb_col*nb_row, flipped);
}
else
{
if (flipped)
{
int LAST = nb_row * nb_col - 1;
for(int i=thread_id;i<nb_row*nb_col;i+=nb_thread)
{
// XXX
// THIS IS SLOW - use whatever blocks are in the
// threads to avoid division and modulo
dst[LAST - i] \
= src[(i/nb_col)*stride_row+(i%nb_col)*stride_col];
}
}
else
{
for(int i=thread_id;i<nb_row*nb_col;i+=nb_thread)
{
// XXX
// THIS IS SLOW - use whatever blocks are in the
// threads to avoid division and modulo
dst[i]=src[i/nb_col*stride_row+i%nb_col*stride_col];
}
}
}
}
// Cooperatively set N consecutive floats starting at dst to `value`,
// with the work split round-robin across the nb_thread threads.
__device__ void fill(float * dst, int N, float value, int thread_id, int nb_thread){
    int idx = thread_id;
    while (idx < N) {
        dst[idx] = value;
        idx += nb_thread;
    }
}
/*
 * We load from global memory to shared memory. The outer if is optimized away at compilation.
 * We put the image at the center of a wider row (wid_pad zeros on each
 * side). Useful to pad an image with 0.
 */
__device__ void load_padded_col_to_shared(float * dst, const float * src,
const int thread_id, const int nb_thread,
const int nb_col, const int nb_row,
const int stride_col, const int stride_row,
const int wid_pad, const bool c_contiguous=true){
if(c_contiguous){//flipped==false
for(int i=thread_id;i<nb_col*nb_row;i+=nb_thread){
int col=i%nb_col;
int row=i/nb_col;
// destination row is nb_col+2*wid_pad wide; shift by wid_pad.
dst[row*(nb_col+2*wid_pad)+col+wid_pad]=src[i];
}
}else{
for(int i=thread_id;i<nb_row*nb_col;i+=nb_thread){
int col=i%nb_col;
int row=i/nb_col;
dst[row*(nb_col+2*wid_pad)+col+wid_pad]=src[row*stride_row+col*stride_col];
}
}
}
// Compile-time unrolled dot product of i contiguous floats; binary
// splitting keeps the template recursion depth logarithmic in i.
template<int i> __device__ float convolutionRowNoFlip(const float *data,
const float *kern){
return convolutionRowNoFlip<i/2>(data, kern)+ convolutionRowNoFlip<(i+1)/2>(data+i/2, kern+i/2) ;
//return data[i-1] * kern[i-1] + convolutionRowNoFlip<i - 1>(data,kern);
}
// Base cases of the template recursion.
template<> __device__ float convolutionRowNoFlip<1>(const float *data,
const float *kern){
return data[0]*kern[0];
}
template<> __device__ float convolutionRowNoFlip<0>(const float *data,
const float *kern){
return 0;
}
// Accumulate into `sum` the dot product of kern_wid data/kern pairs.
// When the compile-time THEANO_KERN_WID define is positive it selects the
// fully unrolled template version; otherwise a pragma-unrolled runtime
// loop over kern_wid is used.
__device__ void convolutionRowNoFlip(float& sum,
const float *data,
const float *kern, const int kern_wid){
if(THEANO_KERN_WID>0)
sum+=convolutionRowNoFlip<THEANO_KERN_WID>(data,kern);
else
#pragma unroll 8
for (int col=0; col < kern_wid; col++) {//loop over col
sum+=data[col]*kern[col];
}
}
/*
 * Write `value` into `dst`, either adding it to the current contents
 * (accumulate==true) or overwriting them (accumulate==false).
 * The template parameter lets the compiler remove the branch.
 */
template<bool accumulate>
__device__ void store_or_accumulate(float& dst, const float value){
    if (!accumulate) {
        dst = value;
    } else {
        dst += value;
    }
}
/**
 * Implementation of the valid convolution that keeps the full image and the full kernel in shared memory.
 * Doesn't implement the stack.
 * Each thread computes only one value for the output if split is false.
 * thread block size=out_wid, out_len (or less than out_len if split is true)
 * grid block size=batch_id, nkern
 * dynamic shared memory: img_len*img_wid+kern_len*kern_wid
 *
 * nkern: the number of kernels, used to compute the output image in which to store the result
 * nstack: the size of the stack, used to compute the image to load.
 * template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
 * template split: if true, each thread computes more than 1 output pixel.
 *                 When true, allows for an output image bigger than 512 pixels.
 *                 Uses more registers.
 */
template<bool flipped_kern, bool split>
__device__ inline void
conv_patch( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack)
{
// NOTE(review): every thread writes the same value to these shared
// scalars; a benign race the original code relies on.
int __shared__ out_len, out_wid, nb_thread_id;
// The offsets are in bytes, hence the char* arithmetic.
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
// "valid" convolution output size.
out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[];
__shared__ int batch_id, kern_id;
batch_id = blockIdx.x;
kern_id = blockIdx.y;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
int out_col = tx;//output col
const int thread_id = ty*blockDim.x + tx;
// Shared memory layout: the full image first, then the kernel.
float * d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID];
float * d_kern=&s_data[img_len * img_wid];//size of [KERNEL_LEN * KERNEL_WID];
// Select this block's kernel and image (first channel of the stack).
kern+=kern_len*kern_wid*nstack*kern_id;
img+=img_len*img_wid*(nstack*batch_id);
load_to_shared(d_img, img, thread_id,nb_thread_id,img_len*img_wid);
// When flipped_kern is true the kernel is flipped while loading.
load_to_shared(d_kern, kern, thread_id,nb_thread_id,kern_len*kern_wid,flipped_kern);
__syncthreads();
if(!split){
// One output pixel per thread.
int out_row = ty;//output row
float sum = 0.0f;
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
out[batch_id*out_wid*out_len*nkern+//the good batch
blockIdx.y*out_wid*out_len+//the output image (blockIdx.y == kern_id)
out_row*out_wid+out_col] = sum;
}else{
// Each thread walks over several output rows, blockDim.y apart.
for(int out_row=ty;out_row<out_len;out_row+=blockDim.y){
float sum = 0.0f;
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image
out_row*out_wid+out_col] = sum;
}
}
}
extern "C" {
// Generates an extern "C" __global__ entry point named conv_patch_<suffix>
// that forwards to the conv_patch device function instantiated with the
// given template arguments (flipped_kern, split).
#define __INSTANTIATE_CONV_PATCH(suffix, ...) \
__global__ void \
conv_patch_##suffix(const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
int img_len, int img_wid, int kern_len, int kern_wid, \
int nkern, int nstack) \
{ \
conv_patch<__VA_ARGS__>(img, img_offset, kern, kern_offset, \
out, out_offset, img_len, img_wid, kern_len, \
kern_wid, nkern, nstack); \
}
// suffix 2: flipped kernel, one output pixel per thread (split=false)
__INSTANTIATE_CONV_PATCH(2, true, false)
// suffix 3: flipped kernel, each thread computes several rows (split=true)
__INSTANTIATE_CONV_PATCH(3, true, true)
#undef __INSTANTIATE_CONV_PATCH
}
/**
 * As conv_patch, but implements the stack inside the kernel.
 * It is kept separate from conv_patch because it uses more registers, which could lower the occupancy.
 * Implementation of the valid convolution that keeps the full image and the full kernel in shared memory.
 * Each thread computes only one value for the output if split==false, else it computes more than 1 value.
 * thread block size=out_wid, out_len/X (X is any number; the optimized value is ceil(out_len/N))
 * grid block size=batch_id, nkern
 * dynamic shared memory: img_len*img_wid+(preload_full_kern?KERNEL_LEN:1)*kern_wid
 *
 * nkern: the number of kernels, used to compute the output image in which to store the result
 * nstack: the size of the stack, used to compute the image to load.
 * dx: patch stride rows (1 for normal convolution)
 * dy: patch stride cols (1 for normal convolution)
 * template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
 * template accumulate: if true, we add to the result, else we overwrite the result
 * template img_c_contiguous_2d: if true, the image is column- and row-contiguous
 * template kern_c_contiguous_2d: if true, the kernel is column- and row-contiguous
 * template split: if true, each thread generates more than 1 output pixel, but uses more registers.
 * template preload_full_kern: if true, we load the full kernel in shared memory, else we load 1 row at a time.
 * template subsample: if false, removes some computation that is only needed when dx or dy != 1.
 */
template<bool flipped_kern, bool accumulate, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample>
__device__ inline void
conv_patch_stack( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid,
int out_len, int out_wid,
int nkern, int nstack, int img_stride_col,int img_stride_row,
int img_stride_stack, int img_stride_batch,
int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_nkern, int dx, int dy)
{
// NOTE(review): written by every thread with the same value (benign race).
int __shared__ nb_thread_id;
// The offsets are in bytes, hence the char* arithmetic.
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[];
int batch_id = blockIdx.x;
int kern_id = blockIdx.y;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
int out_col = tx;//output col
int out_row = ty;//output row
const int thread_id = out_row*out_wid + out_col;
// Shared memory layout: full image, then either the full kernel or one row.
float * d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID];
float * d_kern=&s_data[img_len * img_wid];//size of [(preload_full_kern?KERNEL_LEN:1) * KERNEL_WID];
if(!split){
// One output pixel per thread; loop over the stack, staging each
// channel of the image (and kernel) into shared memory in turn.
kern+=kern_stride_nkern*kern_id;//the good nkern
img+=img_stride_batch*batch_id;//the good batch
float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++,kern+=kern_stride_stack,
img+=img_stride_stack){
load_to_shared(d_img,img,thread_id,nb_thread_id,img_wid,img_len,
img_stride_col, img_stride_row, false, img_c_contiguous_2d);
if(preload_full_kern)
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
__syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row
if(!preload_full_kern){
// Load only this kernel row; flipped kernels are read back-to-front.
__syncthreads();
int idx2;
if(flipped_kern) idx2=(kern_len-row-1)*kern_stride_row;
else idx2=(row)*kern_stride_row;
load_to_shared(d_kern, kern+idx2, thread_id, nb_thread_id, kern_wid,1,
kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
__syncthreads();
}
const float* idx_kern;
if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
else idx_kern=d_kern;
const float* idx_in;
if(subsample)
idx_in=&d_img[(row+out_row*dx)*img_wid+out_col*dy];
else
idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
}
store_or_accumulate<accumulate>(
out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*kern_id+//the output image
out_row*out_wid+out_col],sum);
}else{
// Each thread computes several output rows, blockDim.y apart.
const float __shared__ *kern_, *img_;
int __shared__ out_len_max;
kern_=kern+kern_stride_nkern*kern_id;//the good nkern
img_=img+img_stride_batch*batch_id;//the good batch
//out_len_max is out_len rounded up to a multiple of blockDim.y: all threads
//must keep iterating (to help load the image) since blockDim.y does not
//always divide out_len.
out_len_max = (out_len/blockDim.y+(out_len%blockDim.y==0?0:1))*blockDim.y;
//TODO: invert the out_row and stack loops to avoid reloading the data as frequently!
//TODO: does this happen elsewhere?
for(;out_row<out_len_max;out_row+=blockDim.y){
float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++){
//TODO: load only the part of the image needed or put the partial result in shared memory
int idx1=img_stride_stack*stack;
load_to_shared(d_img,img_+idx1,thread_id,nb_thread_id,img_wid,img_len,
img_stride_col, img_stride_row, false, img_c_contiguous_2d);
if(preload_full_kern){
int idx2=kern_stride_stack*stack;
load_to_shared(d_kern, kern_+idx2, thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
}
__syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row
if(!preload_full_kern){
// Load only this kernel row; flipped kernels are read back-to-front.
__syncthreads();
int idx2=kern_stride_stack*stack;
if(flipped_kern)
idx2+=(kern_len-row-1)*kern_stride_row;
else
idx2+=(row)*kern_stride_row;
load_to_shared(d_kern, kern_+idx2, thread_id, nb_thread_id, kern_wid,1,
kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
__syncthreads();
}
const float* idx_kern;
if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
else idx_kern=d_kern;
const float* idx_in;
if(subsample)
idx_in=&d_img[(row+out_row*dx)*img_wid+out_col*dy];
else
idx_in=&d_img[(row+out_row)*img_wid+out_col];
//The 'if' is needed on Fermi, as reading an out-of-bound index from shared memory generates an error.
//Not needed on earlier generations, which worked anyway. Removing the 'if' generates correct code
//since we store the result of only the valid threads.
//This was observed with nvcc 3.0 on a GTX470 card.
if(out_row<out_len)
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
}
if(out_row<out_len)
store_or_accumulate<accumulate>(
out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*kern_id+//the output image
out_row*out_wid+out_col],sum);
}
}
}
extern "C" {
// Generates an extern "C" __global__ wrapper conv_patch_stack_<suffix> that
// forwards to conv_patch_stack instantiated with the given template
// arguments, in order: (flipped_kern, accumulate, img_c_contiguous_2d,
// kern_c_contiguous_2d, split, preload_full_kern, subsample).
#define __INSTANTIATE_CONV_PATCH_STACK(suffix, ...) \
__global__ void \
conv_patch_stack_##suffix(const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
int img_len, int img_wid, int kern_len, int kern_wid, \
int out_len, int out_wid, int nkern, int nstack, \
int img_stride_col, int img_stride_row, \
int img_stride_stack, int img_stride_batch, \
int kern_stride_col, int kern_stride_row, \
int kern_stride_stack, int kern_stride_nkern, \
int dx, int dy) \
{ \
conv_patch_stack<__VA_ARGS__>( \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, out_len, \
out_wid, nkern, nstack, img_stride_col, img_stride_row, \
img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern, dx, dy); \
}
// All variants below use flipped_kern=true, accumulate=false. The suffix
// appears to be 64 plus a bit-encoding of the remaining five flags
// (img_c_contiguous_2d=16, kern_c_contiguous_2d=8, split=4,
// preload_full_kern=2, subsample=1) -- presumably matched by the host-side
// kernel-name selection; verify against the caller before changing.
__INSTANTIATE_CONV_PATCH_STACK(64, true, false, false, false, false, false, false)
__INSTANTIATE_CONV_PATCH_STACK(65, true, false, false, false, false, false, true)
__INSTANTIATE_CONV_PATCH_STACK(66, true, false, false, false, false, true, false)
__INSTANTIATE_CONV_PATCH_STACK(67, true, false, false, false, false, true, true)
__INSTANTIATE_CONV_PATCH_STACK(68, true, false, false, false, true, false, false)
__INSTANTIATE_CONV_PATCH_STACK(69, true, false, false, false, true, false, true)
__INSTANTIATE_CONV_PATCH_STACK(70, true, false, false, false, true, true, false)
__INSTANTIATE_CONV_PATCH_STACK(71, true, false, false, false, true, true, true)
__INSTANTIATE_CONV_PATCH_STACK(72, true, false, false, true, false, false, false)
__INSTANTIATE_CONV_PATCH_STACK(73, true, false, false, true, false, false, true)
__INSTANTIATE_CONV_PATCH_STACK(74, true, false, false, true, false, true, false)
__INSTANTIATE_CONV_PATCH_STACK(75, true, false, false, true, false, true, true)
__INSTANTIATE_CONV_PATCH_STACK(76, true, false, false, true, true, false, false)
__INSTANTIATE_CONV_PATCH_STACK(77, true, false, false, true, true, false, true)
__INSTANTIATE_CONV_PATCH_STACK(78, true, false, false, true, true, true, false)
__INSTANTIATE_CONV_PATCH_STACK(79, true, false, false, true, true, true, true)
__INSTANTIATE_CONV_PATCH_STACK(80, true, false, true, false, false, false, false)
__INSTANTIATE_CONV_PATCH_STACK(81, true, false, true, false, false, false, true)
__INSTANTIATE_CONV_PATCH_STACK(82, true, false, true, false, false, true, false)
__INSTANTIATE_CONV_PATCH_STACK(83, true, false, true, false, false, true, true)
__INSTANTIATE_CONV_PATCH_STACK(84, true, false, true, false, true, false, false)
__INSTANTIATE_CONV_PATCH_STACK(85, true, false, true, false, true, false, true)
__INSTANTIATE_CONV_PATCH_STACK(86, true, false, true, false, true, true, false)
__INSTANTIATE_CONV_PATCH_STACK(87, true, false, true, false, true, true, true)
__INSTANTIATE_CONV_PATCH_STACK(88, true, false, true, true, false, false, false)
__INSTANTIATE_CONV_PATCH_STACK(89, true, false, true, true, false, false, true)
__INSTANTIATE_CONV_PATCH_STACK(90, true, false, true, true, false, true, false)
__INSTANTIATE_CONV_PATCH_STACK(91, true, false, true, true, false, true, true)
__INSTANTIATE_CONV_PATCH_STACK(92, true, false, true, true, true, false, false)
__INSTANTIATE_CONV_PATCH_STACK(93, true, false, true, true, true, false, true)
__INSTANTIATE_CONV_PATCH_STACK(94, true, false, true, true, true, true, false)
__INSTANTIATE_CONV_PATCH_STACK(95, true, false, true, true, true, true, true)
#undef __INSTANTIATE_CONV_PATCH_STACK
}
/**
 * As conv_patch_stack, but with kern_len threads per output pixel.
 * It is kept separate because it uses more registers.
 * Implementation of the valid convolution that keeps the full image and the full kernel in shared memory.
 * thread block size=out_wid, out_len, ceil_intdiv(kern_len/nb_split)
 * grid block size=batch_id, nkern
 * dynamic shared memory: img_len*img_wid+kern_wid*(preload_full_kern?kern_len:thread_z)+out_size*thread_z
 *
 * nkern: the number of kernels, used to compute the output image in which to store the result
 * nstack: the size of the stack, used to compute the image to load.
 * template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
 * template img_contiguous: if true, the image is column- and row-contiguous
 * template preload_full_kern: only meaningful when split is true. When false we don't load the full
 *          kernel at once, but load ceil_intdiv(kern_len/nb_split) kernel rows at a time.
 */
template<bool flipped_kern, bool c_contiguous, bool split, bool preload_full_kern>
__device__ inline void
conv_patch_stack_reduce( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack, int img_stride_col,int img_stride_row,
int img_stride_stack, int img_stride_batch,
int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_nkern)
{
//int __shared__ out_len, out_wid, nb_thread_id;
//out_len = img_len - kern_len + 1;
//out_wid = img_wid - kern_wid + 1;
// The launch configuration encodes the output size directly.
const int out_wid = blockDim.x;
const int out_len = blockDim.y;
const int nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
// The offsets are in bytes, hence the char* arithmetic.
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
if(kern_stride_col==-1 && kern_stride_row==-kern_wid){
//the last two dimensions are c_contiguous but flipped!
//Rebase the pointer to the element with the smallest address and
//switch to positive, contiguous strides.
kern = &(kern[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
kern_stride_col=1;
kern_stride_row=kern_wid;
}
extern __shared__ float s_data[];
int batch_id = blockIdx.x;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
int tz = threadIdx.z;
int out_col = tx;//output col
int out_row = ty;//output row
const int thread_id = tz*blockDim.y*blockDim.x+ty*blockDim.x+tx;
//d_img size [IMAGE_LEN * IMAGE_WID];
float * d_img=&s_data[0];
//d_kern size[(preload_full_kern?KERNEL_LEN:blockDim.z) * KERNEL_WID]
float * d_kern=&s_data[img_len * img_wid];
//d_reduce size [n_threads]
//N.B. this overlaps with d_img and d_kern! Safe only because the stack
//loop ends with a __syncthreads() before d_reduce is written.
float * d_reduce=&s_data[0];
float sum = 0.0f;
kern+=kern_stride_nkern*blockIdx.y;//the good nkern
img+=img_stride_batch*batch_id;//the good batch
for (int stack = 0;stack<nstack;stack++,kern+=kern_stride_stack,
img+=img_stride_stack){
__syncthreads();
load_to_shared(d_img, img, thread_id, nb_thread_id, img_wid, img_len,
img_stride_col, img_stride_row, false, c_contiguous);
if(split && ! preload_full_kern){
// Process the kernel blockDim.z rows at a time, each z-slice of
// threads handling one of the loaded rows.
for(int first_row=0;first_row<kern_len;first_row+=blockDim.z){
//N.B. - Jan 30, 2011 with CUDA 3.2 I found that without the explicit cast to
// (int)blockDim.z, idx3 would sometimes be negative. I'm rusty on my signed vs. unsigned
// details, but that seemed really weird. tricky bug to find too.
int idx3 = flipped_kern
? max((kern_len - (int)blockDim.z - first_row),0)
: first_row;
int len3 = min(blockDim.z, kern_len - first_row);
__syncthreads();
load_to_shared(d_kern, kern+idx3*kern_stride_row, thread_id, nb_thread_id, kern_wid, len3,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
const float* idx_kern=&d_kern[tz*kern_wid];
const float* idx_in=&d_img[(first_row+tz+out_row)*img_wid+out_col];
float sum2 = 0;
if(tz<len3)
convolutionRowNoFlip(sum2,idx_in,idx_kern,kern_wid);
sum+=sum2;
}
}else if(split){
// Full kernel preloaded; each z-slice handles rows tz, tz+blockDim.z, ...
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
for(int row=tz;row<kern_len;row+=blockDim.z){
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
}else{
// No split: blockDim.z == kern_len, one kernel row per z-slice.
int row = tz;//The row of the kernel.
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
}
//reduce no sync because previous loop ends with sync
d_reduce[thread_id]=sum;
__syncthreads();
// The first z-slice of threads reduces the partial sums of the others.
if(thread_id<out_len*out_wid){ // blockDim.x==out_wid, blockDim.y==out_len
//sum=0;
for(int i=1;i<blockDim.z;i++){
sum+=d_reduce[thread_id+i*out_wid*out_len];
}
out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*blockIdx.y+//the output image
out_row*out_wid+out_col] = sum;
}
}
extern "C" {
// Generates an extern "C" __global__ wrapper conv_patch_stack_reduce_<suffix>
// forwarding to conv_patch_stack_reduce instantiated with the given template
// arguments, in order: (flipped_kern, c_contiguous, split, preload_full_kern).
#define __INSTANTIATE_CONV_PATCH_STACK_REDUCE(suffix, ...) \
__global__ void \
conv_patch_stack_reduce_##suffix( \
const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
int img_len, int img_wid, int kern_len, int kern_wid, \
int nkern, int nstack, int img_stride_col, int img_stride_row, \
int img_stride_stack, int img_stride_batch, \
int kern_stride_col, int kern_stride_row, \
int kern_stride_stack, int kern_stride_nkern) \
{ \
conv_patch_stack_reduce<__VA_ARGS__>( \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \
}
// The suffix appears to bit-encode the four flags (flipped_kern=8,
// c_contiguous=4, split=2, preload_full_kern=1); variants with
// split=false and preload_full_kern=false are deliberately disabled.
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE#(0, false, false, false, false)*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(1, false, false, false, true)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(2, false, false, true, false)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(3, false, false, true, true)
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE#(4, false, true, false, false)*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(5, false, true, false, true)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(6, false, true, true, false)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(7, false, true, true, true)
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE(8, true, false, false, false)*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(9, true, false, false, true)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(10, true, false, true, false)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(11, true, false, true, true)
/*__INSTANTIATE_CONV_PATCH_STACK_REDUCE(12, true, true, false, false)*/
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(13, true, true, false, true)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(14, true, true, true, false)
__INSTANTIATE_CONV_PATCH_STACK_REDUCE(15, true, true, true, true)
#undef __INSTANTIATE_CONV_PATCH_STACK_REDUCE
}
/**
 * WORKS FOR IMAGES THAT DON'T FIT IN SHARED MEMORY.
 * We store kern_len rows of the image and the full kernel in shared memory.
 * Each thread computes only one value for the output.
 * Doesn't implement the stack and nkern in the kernel.
 * thread block size=out_wid
 * grid block size=out_len,batch_id
 * dynamic shared memory: kern_len*img_wid+kern_len*kern_wid
 * Diff with conv_patch: doesn't store the full image in shared memory,
 * i.e. works for bigger images than conv_patch<split=true,...>.
 */
/*
 * Valid convolution of one output row per thread block: only the kern_len
 * image rows this output row needs, plus the whole (flipped) kernel, are
 * staged in shared memory.  blockIdx.y packs both batch and kernel index;
 * blockIdx.x is the output row; threadIdx.x is the output column.
 * nstack is accepted for signature compatibility but not used here.
 * Fix vs. original: removed a stray empty statement (double semicolon)
 * after `blockIdx.x`.
 */
template<bool c_contiguous>
__device__ inline void
conv_rows( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack,
int img_stride_col, int img_stride_row,
int img_stride_stack, int img_stride_batch,
int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_nkern)
{
// NOTE(review): shared scalars written by every thread with the same
// value -- a benign race kept from the original code.
int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id;
float __shared__ *d_img, *d_kern;
// The offsets are in bytes, hence the char* arithmetic.
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
// "valid" convolution output size.
out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
// blockIdx.y packs batch index and kernel index.
batch_id= blockIdx.y/nkern;
kern_id = blockIdx.y%nkern;
extern __shared__ float s_data[];
const int out_col = threadIdx.x;//output col
const int out_row = blockIdx.x;//output row
const int thread_id = threadIdx.x;
d_img=&s_data[0];//size of [KERN_LEN * IMAGE_WID];
d_kern=&s_data[kern_len * img_wid];//size of [KERNEL_LEN * KERNEL_WID];
img+=img_stride_batch*batch_id;//select the good image from the batch
img+=out_row*img_stride_row;//select the good top row.
kern+=kern_stride_nkern*kern_id;//the good nkern
// Stage the needed image rows, and the kernel (flipped: this is a true
// convolution), into shared memory.
load_to_shared(d_img,img,thread_id,nb_thread_id,img_wid,kern_len,
img_stride_col, img_stride_row, false, c_contiguous);
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, true, c_contiguous);
__syncthreads();
float sum = 0.0f;
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row)*img_wid+out_col];
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image
out_row*out_wid+out_col] = sum;
}
extern "C" {
// Generates an extern "C" __global__ wrapper conv_rows_<suffix> forwarding
// to conv_rows instantiated with the given template argument (c_contiguous).
#define __INSTANTIATE_CONV_ROWS(suffix, ...) \
__global__ void \
conv_rows_##suffix(const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
int img_len, int img_wid, int kern_len, int kern_wid, \
int nkern, int nstack, \
int img_stride_col, int img_stride_row, \
int img_stride_stack, int img_stride_batch, \
int kern_stride_col, int kern_stride_row, \
int kern_stride_stack, int kern_stride_nkern) \
{ \
conv_rows<__VA_ARGS__>( \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, \
nkern, nstack, img_stride_col, img_stride_row, \
img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \
}
// suffix 0: strided input; suffix 1: c-contiguous input.
__INSTANTIATE_CONV_ROWS(0, false)
__INSTANTIATE_CONV_ROWS(1, true)
#undef __INSTANTIATE_CONV_ROWS
}
/**
 * WORKS FOR IMAGES THAT DON'T FIT IN SHARED MEMORY.
 * As conv_rows, but implements the stack. Kept separate as this uses more registers.
 * We store kern_len rows of the image and the full kernel in shared memory.
 * Each thread computes only one value for the output.
 * thread block size=out_wid, block_len
 * grid block size=intceil(out_len/block_len),nb_batch*nb_kern
 * dynamic shared memory: (kern_len+block_len-1)*img_wid+kern_len*kern_wid
 * Diff with conv_patch: doesn't store the full image in shared memory,
 * i.e. works for bigger images than conv_patch<split=true,...>.
 */
template<bool c_contiguous>
__device__ inline void
conv_rows_stack( const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
const int img_len, const int img_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row,
const int img_stride_stack, const int img_stride_batch,
const int kern_stride_col, const int kern_stride_row,
const int kern_stride_stack, const int kern_stride_nkern)
{
// NOTE(review): shared scalars written by every thread with the same
// value -- a benign race kept from the original code.
int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows;
float __shared__ *d_img, *d_kern;
// The offsets are in bytes, hence the char* arithmetic.
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
batch_id= blockIdx.y/nkern;
kern_id = blockIdx.y%nkern;
nb_rows = blockDim.y;
// Clamp the image rows loaded by the last chunk to the image height.
// MIN: presumably a min() helper macro defined earlier in this file --
// not visible in this chunk.
int rows_to_read = MIN(
kern_len + nb_rows - 1,
img_len - blockIdx.x * nb_rows);
/**
 * Every thread ultimately computes one value in the output, at coordinates
 * out[ batch_id, kern_id, out_row, out_col]
 *
 * The batch_id and kern_id are packed into blockIdx.y. out_row and out_col
 * are the threadIdx.x and threadIdx.y.
 *
 * Every thread block deals only with one image, and one filter kernel.
 */
extern __shared__ float s_data[];
const int out_col = threadIdx.x;//output col
const int out_row = blockIdx.x*blockDim.y+threadIdx.y;//output row
const int shared_row = threadIdx.y;
const int thread_id = threadIdx.y*blockDim.x+threadIdx.x;
/*
 * The kernel works by looping over channels (aka colours, aka the stack).
 * On each iteration, a thread block loads one channel of all the image rows that
 * it needs to use, and one channel slice of one kernel.
 */
d_img=&s_data[0];//size of [(KERN_LEN+block_len-1) * IMAGE_WID];
d_kern=&s_data[(kern_len+nb_rows-1) * img_wid];//size of [KERNEL_LEN * KERNEL_WID];
float sum = 0.0f;
for (int stack = 0; stack < nstack; stack++){
int offset =
img_stride_batch * batch_id
+ img_stride_stack * stack
//blockIdx.x is which chunk of nb_rows this thread block deals with
+ img_stride_row * (blockIdx.x * nb_rows);
load_to_shared(
d_img, // dst
img+offset, // src
thread_id, // linear position in block
nb_thread_id, // number of threads
img_wid, // cols in image to read
rows_to_read, // number of rows to read
img_stride_col, // img[i, j, k, l] to img[i, j, k, l + 1]
img_stride_row, // img[i, j, k, l] to img[i, j, k + 1, l]
false, // flip while reading
c_contiguous);
offset = kern_stride_nkern * kern_id + kern_stride_stack * stack;
// The kernel is loaded flipped (true convolution).
load_to_shared(d_kern, kern+offset, thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, true, c_contiguous);
__syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+shared_row)*img_wid+out_col];
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
__syncthreads();//to be sure all threads have finished before we modify the shared memory.
}
// Threads past the bottom of the output (last chunk) write nothing.
if (out_row < out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image
out_row*out_wid+out_col] = sum;
}
extern "C" {
// Generates an extern "C" __global__ wrapper conv_rows_stack_<suffix>
// forwarding to conv_rows_stack instantiated with the given template
// argument (c_contiguous).
#define __INSTANTIATE_CONV_ROWS_STACK(suffix, ...) \
__global__ void \
conv_rows_stack_##suffix( \
const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
const int img_len, const int img_wid, \
const int kern_len, const int kern_wid, \
const int nkern, const int nstack, \
const int img_stride_col, const int img_stride_row, \
const int img_stride_stack, const int img_stride_batch, \
const int kern_stride_col, const int kern_stride_row, \
const int kern_stride_stack, const int kern_stride_nkern) \
{ \
conv_rows_stack<__VA_ARGS__>( \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, \
nkern, nstack, img_stride_col, img_stride_row, \
img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \
}
// suffix 0: strided input; suffix 1: c-contiguous input.
__INSTANTIATE_CONV_ROWS_STACK(0, false)
__INSTANTIATE_CONV_ROWS_STACK(1, true)
#undef __INSTANTIATE_CONV_ROWS_STACK
}
/**
 * WORKS FOR IMAGES THAT DON'T FIT IN SHARED MEMORY.
 * As conv_rows_stack, but loads only block_len rows of the image at a time, and 1 or all kernel rows.
 * We store block_len rows of the image (at a time) and one or all kernel rows in shared memory.
 * Each thread computes only one value for the output.
 * thread block size=out_wid, block_len
 * grid block size=intceil(out_len/block_len),nb_batch*nb_kern
 * dynamic shared memory: block_len * img_wid+(preload_full_kern?kern_len:1)*kern_wid
 * Diff with conv_patch: doesn't store the full image and kernel in shared memory,
 * i.e. works for bigger images than conv_patch<split=true,...>.
 */
template<bool c_contiguous, bool preload_full_kern>
__device__ inline void
conv_rows_stack2(const float* img, const size_t img_offset,
const float* kern, const size_t kern_offset,
float* out, const size_t out_offset,
const int img_len, const int img_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row,
const int img_stride_stack, const int img_stride_batch,
const int kern_stride_col, const int kern_stride_row,
const int kern_stride_stack, const int kern_stride_nkern)
{
// NOTE(review): shared scalars written by every thread with the same
// value -- a benign race kept from the original code.
int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows;
float __shared__ *d_img, *d_kern;
// The offsets are in bytes, hence the char* arithmetic.
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
batch_id= blockIdx.y/nkern;
kern_id = blockIdx.y%nkern;
nb_rows = blockDim.y;
extern __shared__ float s_data[];
const int out_col = threadIdx.x;//output col
const int out_row = blockIdx.x*blockDim.y+threadIdx.y;//output row
const int shared_row = threadIdx.y;
const int thread_id = threadIdx.y*blockDim.x+threadIdx.x;
d_img=&s_data[0];//size of [nb_rows * IMAGE_WID];
d_kern=&s_data[nb_rows*img_wid];//size of [(preload_full_kern?KERNEL_LEN:1) * KERNEL_WID];
float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++){
int _idx2=img_stride_batch*batch_id+img_stride_stack*stack;//select the good image from the batch and stack
_idx2+=(blockIdx.x*nb_rows)*img_stride_row;//select the good top row for the block of threads
__syncthreads();
// Prime the circular buffer with the first nb_rows-1 image rows; each
// iteration of the row loop below streams in the next one.
load_to_shared(d_img,img+_idx2,thread_id,nb_thread_id,img_wid,nb_rows-1,
img_stride_col, img_stride_row, false, c_contiguous);
if(preload_full_kern)
load_to_shared(d_kern, kern+kern_stride_nkern*kern_id+kern_stride_stack*stack,
thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, true, c_contiguous);
__syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row
__syncthreads();
if((blockIdx.x*nb_rows+row+nb_rows-1)<img_len){
int _idx1=img_stride_batch*batch_id+img_stride_stack*stack;//select the good image from the batch and stack
_idx1+=(blockIdx.x*nb_rows)*img_stride_row;//select the good top row for the block of threads
_idx1+=(row+nb_rows-1)*img_stride_row;//the current last row
load_to_shared(d_img+((row+nb_rows-1)%nb_rows)*img_wid,
img+_idx1, thread_id, nb_thread_id, img_wid, 1,
img_stride_col, img_stride_row, false, c_contiguous);//we use d_img as a circular buffer.
}
if(!preload_full_kern){
int _idx3=kern_stride_nkern*kern_id+kern_stride_stack*stack;//select the good kern from the batch and stack
_idx3+=(kern_len-row-1)*kern_stride_row;//the current last row flipped
load_to_shared(d_kern, kern+_idx3,
thread_id, nb_thread_id, kern_wid,1,
kern_stride_col, kern_stride_row, true, c_contiguous);
}
__syncthreads();
//The 'if' is needed on Fermi, as reading an out-of-bound index from shared memory generates an error.
//Not needed on earlier generations, which worked anyway. Removing the 'if' generates correct code
//since we store the result of only the valid threads.
//This was observed with nvcc 3.0 on a GTX470 card.
if(out_row<out_len){
const float* idx_kern;
if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
else idx_kern=d_kern;
const float* idx_in=&d_img[((shared_row+row)%nb_rows)*img_wid+out_col];
float sum_ =0.0f;
convolutionRowNoFlip(sum_,idx_in,idx_kern,kern_wid);
sum+=sum_;//We pass through an intermediate variable to have more precision.
}
}
}
__syncthreads();
if(out_row<out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image
out_row*out_wid+out_col] = sum;
}
// C-linkage __global__ entry points wrapping the conv_rows_stack2 device
// template, so the host side can look each specialization up by name.
// One wrapper is stamped out per combination of the two boolean template
// flags; the numeric suffix encodes the flags as bits.
// NOTE(review): the template declaration is above this excerpt; judging by
// the device code the flags are presumably <preload_full_kern, c_contiguous>
// (in some order) -- confirm against the template definition.
extern "C" {
#define __INSTANTIATE_CONV_ROWS_STACK2(suffix, ...) \
__global__ void \
conv_rows_stack2_##suffix( \
const float *img, const size_t img_offset, \
const float *kern, const size_t kern_offset, \
float *out, const size_t out_offset, \
const int img_len, const int img_wid, \
const int kern_len, const int kern_wid, \
const int nkern, const int nstack, \
const int img_stride_col, const int img_stride_row, \
const int img_stride_stack, const int img_stride_batch, \
const int kern_stride_col, const int kern_stride_row, \
const int kern_stride_stack, const int kern_stride_nkern) \
{ \
conv_rows_stack2<__VA_ARGS__>( \
img, img_offset, kern, kern_offset, out, out_offset, \
img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \
}
// suffix = 2*flag1 + flag0
__INSTANTIATE_CONV_ROWS_STACK2(0, false, false)
__INSTANTIATE_CONV_ROWS_STACK2(1, false, true)
__INSTANTIATE_CONV_ROWS_STACK2(2, true, false)
__INSTANTIATE_CONV_ROWS_STACK2(3, true, true)
#undef __INSTANTIATE_CONV_ROWS_STACK2
}
/**
* Implementation of 'valid' mode convolution that uses one block per output pixel, and uses a sum-reduce within each block to compute the
* kernel-image inner-product in parallel.
*
* This implementation uses shared memory for the reduce, so it is limited by the product of stacklen x kern_len
*
 * template stack_loop: if true, we accept that blockDim.x < nstack and add a loop over the stack (uses 3 more registers, so lower occupancy when true, but supports nstack*kern_len > 512)
* TODO: explain parameters, preconditions
*/
// One output pixel per CUDA block: threadIdx.x indexes the input stack (ss)
// and threadIdx.y the kernel row (rr).  Each thread accumulates a partial
// inner product over the kernel columns, and the partials are combined with
// a shared-memory tree reduction (reducebuf must hold blockDim.x*blockDim.y
// floats, provided via the dynamic shared-memory launch argument).
template<bool stack_loop>
__device__ inline void
conv_valid_row_reduce(int nB, int nK, int stacklen,
int img_len, int img_wid,
int kern_len, int kern_wid,
int out_len, int out_wid, //physical
const float *img, const size_t img_offset, int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, const size_t kern_offset, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, const size_t out_offset, int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
int subsample_rows, int subsample_cols,
const int initial_reduce_boundary)
{
const int outsize = nB * nK * out_len * out_wid;
// The *_offset arguments are byte offsets; rebase the raw pointers once.
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
extern __shared__ float reducebuf[];
// Blocks stride over the flattened (col, row, kernel, batch) output space.
for (int i = blockIdx.x; i < /*physical*/outsize; i += gridDim.x)
{
//figure out what output element we're in charge of computing
int ii = i;
int iB = ii % nB; // output batch index
ii = ii / nB;
int iK = ii % nK; // output kernel index
ii = ii / nK;
int iR_physical = ii % out_len; //output kernel row
int iC_physical = ii / out_len; // output kernel column
// 'logical' coordinates undo the output subsampling.
int iR_logical = iR_physical * subsample_rows;
int iC_logical = iC_physical * subsample_cols;
int ss = threadIdx.x;
int rr = threadIdx.y;
// Kernel rows are applied flipped (true convolution, not correlation).
int img_rr = iR_logical + kern_len - 1 - rr;
int reduceIdx = threadIdx.x * blockDim.y + threadIdx.y;
float sum = 0.0f;
if(stack_loop){
// blockDim.x may be smaller than stacklen: loop over the stack.
for (; ss < stacklen; ss+=blockDim.x){
const float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
// Image columns are walked backwards: columns are flipped too.
const float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
for (int cc = 0; cc < kern_wid; ++cc)
{
sum += kk_0[0] * ii_0[0];
kk_0 += kern_str_C;
ii_0 -= img_str_C;
}
}
}else{
// blockDim.x covers the whole stack: each thread handles one ss.
const float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
const float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
for (int cc = 0; cc < kern_wid; ++cc)
{
sum += kk_0[0] * ii_0[0];
kk_0 += kern_str_C;
ii_0 -= img_str_C;
}
}
if (blockDim.x * blockDim.y == 1)
{
// Single-thread block: no reduction needed.
out[iB * out_str_B + iK * out_str_K + iR_physical * out_str_R + iC_physical * out_str_C] = sum;
}
else
{
reducebuf[reduceIdx] = sum;
__syncthreads();
// The first step may be ragged when the block size is not a power of
// two; initial_reduce_boundary is supplied by the caller.
int reduce_boundary = initial_reduce_boundary;
// add in the terms above the reduce boundary
if (reduceIdx + reduce_boundary < (blockDim.x * blockDim.y))
reducebuf[reduceIdx] += reducebuf[reduce_boundary +reduceIdx];
reduce_boundary >>= 1;
// there are an equal number of terms above and below the reduce_boundary
while (reduce_boundary)
{
__syncthreads();
if (reduceIdx < reduce_boundary)
{
reducebuf[reduceIdx] += reducebuf[reduce_boundary + reduceIdx];
}
reduce_boundary >>= 1;
}
if (reduceIdx == 0)
{
out[iB * out_str_B + iK * out_str_K + iR_physical * out_str_R + iC_physical * out_str_C] = reducebuf[0];
}
}
}
}
// C-linkage __global__ wrappers for conv_valid_row_reduce, one per value of
// the <stack_loop> template flag (suffix 0 -> false, 1 -> true), so the host
// can pick the right specialization by kernel name.
extern "C" {
#define __INSTANTIATE_CONV_VALID_ROW_REDUCE(suffix, ...) \
__global__ void \
conv_valid_row_reduce_##suffix( \
int nB, int nK, int stacklen, int img_len, int img_wid, \
int kern_len, int kern_wid, int out_len, int out_wid, \
const float *img, const size_t img_offset, \
int img_str_B, int img_str_S, int img_str_R, int img_str_C, \
const float *kern, const size_t kern_offset, \
int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C, \
float *out, const size_t out_offset, \
int out_str_B, int out_str_K, int out_str_R, int out_str_C, \
int subsample_rows, int subsample_cols, \
const int initial_reduce_boundary) \
{ \
conv_valid_row_reduce<__VA_ARGS__>( \
nB, nK, stacklen, img_len, img_wid, \
kern_len, kern_wid, out_len, out_wid, \
img, img_offset, img_str_B, img_str_S, img_str_R, img_str_C, \
kern, kern_offset, kern_str_K, kern_str_S, kern_str_R, kern_str_C, \
out, out_offset, out_str_B, out_str_K, out_str_R, out_str_C, \
subsample_rows, subsample_cols, initial_reduce_boundary); \
}
__INSTANTIATE_CONV_VALID_ROW_REDUCE(0, false)
__INSTANTIATE_CONV_VALID_ROW_REDUCE(1, true)
#undef __INSTANTIATE_CONV_VALID_ROW_REDUCE
}
/**
* Reference implementation of 'valid' mode convolution (with stack)
*
* This implementation works for any size of image and kernel. It does not use shared memory.
*
* TODO: explain parameters, preconditions
*/
// Reference (slow but general) 'valid' convolution: one thread per output
// element, grid-striding over the flattened output; no shared-memory tiling.
extern "C" __global__ void
conv_reference_valid(int nB, int nK, int stacklen,
int img_len, int img_wid,
int kern_len, int kern_wid,
int out_len, int out_wid, //physical
const float *img, const size_t img_offset,
int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, const size_t kern_offset,
int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, const size_t out_offset,
int out_str_B, int out_str_K, int out_str_R, int out_str_C ,
int subsample_rows, int subsample_cols)
{
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
// NOTE(review): these __shared__ scalars are written (with identical
// values) by every thread and read without synchronization; each thread
// writes before it reads, so this looks benign -- confirm it is deliberate.
__shared__ int numThreads, outsize;
// The *_offset arguments are byte offsets; rebase the raw pointers once.
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
numThreads = blockDim.x * gridDim.x;
outsize = nB * nK * out_len * out_wid;
for (int i = idx; i < outsize; i += numThreads) //physical
{
//figure out what output element we're in charge of computing
int ii = i;
int iB = ii % nB; // output batch index
ii = ii / nB;
int iK = ii % nK; // output kernel index
ii = ii / nK;
int iR_physical = ii % out_len; //output kernel row
int iC_physical = ii / out_len; // output kernel column
// 'logical' coordinates undo the output subsampling.
int iR_logical = iR_physical * subsample_rows;
int iC_logical = iC_physical * subsample_cols;
float sum = 0.0f;
for (int ss = 0; ss < stacklen; ++ss)
{
for (int rr = 0; rr < kern_len; ++rr)
{
// Image coordinates are flipped relative to the kernel:
// true convolution, not correlation.
int img_rr = iR_logical + kern_len - 1 - rr;
for (int cc = 0; cc < kern_wid; ++cc)
{
int img_cc = iC_logical + kern_wid-1-cc;
float k_0 = kern[iK*kern_str_K + ss*kern_str_S + rr*kern_str_R + cc*kern_str_C];
float i_0 = img[iB*img_str_B + ss*img_str_S + img_rr*img_str_R + img_cc*img_str_C];
sum += k_0 * i_0;
}
}
}
//coords[i*5+0] = iB;
//coords[i*5+1] = iK;
//coords[i*5+2] = iR;
//coords[i*5+3] = iC;
//coords[i*5+4] = iB * out_str_B + iK * out_str_K + iR * out_str_R + iC * out_str_C;
out[iB * out_str_B + iK * out_str_K + iR_physical * out_str_R + iC_physical * out_str_C] = sum;
}
}
/**
* Reference implementation of 'full' mode convolution (with stack)
*
* This implementation works for any size of image and kernel. It does not use shared memory.
*
* TODO: explain parameters, preconditions
*/
// Reference (slow but general) 'full' convolution: like conv_reference_valid
// but image coordinates outside the image are treated as implicit zeros
// (skipped), which implements the zero-padded 'full' mode.
extern "C" __global__ void
conv_reference_full(int nB, int nK, int stacklen,
int img_len, int img_wid,
int kern_len, int kern_wid,
int out_len, int out_wid, //physical dimensions
const float *img, const size_t img_offset,
int img_str_B, int img_str_S, int img_str_R, int img_str_C,
const float *kern, const size_t kern_offset,
int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C,
float *out, const size_t out_offset,
int out_str_B, int out_str_K, int out_str_R, int out_str_C,
int subsample_rows, int subsample_cols)
{
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
// NOTE(review): written by every thread with identical values and read
// without a sync; each thread writes before it reads -- confirm deliberate.
__shared__ int numThreads, physical_outsize;
// The *_offset arguments are byte offsets; rebase the raw pointers once.
kern = (const float *)(((const char *)kern)+kern_offset);
img = (const float *)(((const char *)img)+img_offset);
out = (float *)(((char *)out)+out_offset);
numThreads = blockDim.x * gridDim.x;
physical_outsize = nB * nK * out_len * out_wid;
for (int i = idx; i < physical_outsize; i += numThreads)
{
//figure out what output element we're in charge of computing
int ii = i;
int iB = ii % nB; // output batch index
ii = ii / nB;
int iK = ii % nK; // output kernel index
ii = ii / nK;
int iR_physical = ii % out_len; //output kernel row
int iC_physical = ii / out_len; // output kernel column
// 'logical' coordinates undo the output subsampling.
int iR_logical = iR_physical * subsample_rows;
int iC_logical = iC_physical * subsample_cols;
float sum = 0.0f;
for (int ss = 0; ss < stacklen; ++ss)
{
for (int rr = 0; rr < kern_len; ++rr)
{
int img_rr = iR_logical - rr;
// Out-of-range rows/cols contribute zero (implicit padding).
if ((img_rr >= 0) && (img_rr < img_len))
{
for (int cc = 0; cc < kern_wid; ++cc)
{
int img_cc = iC_logical - cc;
if ((img_cc >= 0) && (img_cc < img_wid))
{
float k_0 = kern[iK*kern_str_K + ss*kern_str_S + rr*kern_str_R + cc*kern_str_C];
float i_0 = img[iB*img_str_B + ss*img_str_S + img_rr*img_str_R + img_cc*img_str_C];
sum += k_0 * i_0;
}
}
}
}
}
out[iB * out_str_B + iK * out_str_K + iR_physical * out_str_R + iC_physical * out_str_C] = sum;
}
}
#endif // #ifndef CONV_KERNEL_CU
/*
Local Variables:
mode:c++
c-basic-offset:4
c-file-style:"stroustrup"
indent-tabs-mode:nil
fill-column:79
End:
*/
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
......@@ -5,6 +5,7 @@ import warnings
import theano
from theano import Op, Apply, tensor, config, Variable
from theano.scalar import as_scalar, constant, Log
from theano.tensor import as_tensor_variable
from theano.gradient import DisconnectedType, grad_not_implemented
from theano.gof import Optimizer, local_optimizer, COp
from theano.gof.cmodule import GCC_compiler
......@@ -12,17 +13,19 @@ from theano.gof.type import CDataType, Generic
from theano.compile import optdb
from theano.compile.ops import shape_i
from theano.tensor.nnet import SoftmaxGrad
from theano.tensor.nnet.abstract_conv import get_conv_output_shape
from theano.tensor.signal.downsample import (
DownsampleFactorMax, MaxPoolGrad, AveragePoolGrad)
from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs,
get_conv_output_shape)
from theano.tensor.signal.downsample import (DownsampleFactorMax,
MaxPoolGrad, AveragePoolGrad)
from . import pygpu
from .type import get_context, gpu_context_type, list_contexts
from .type import get_context, gpu_context_type, list_contexts, GpuArrayType
from .basic_ops import (as_gpuarray_variable, infer_context_name,
gpu_contiguous, HostFromGpu,
GpuAllocEmpty, empty_like)
from .elemwise import GpuElemwise
from .conv import GpuConv
# These don't exist in gpuarray
# GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
......@@ -819,6 +822,30 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
return GpuDnnConv(algo=algo)(img, kerns, out, desc)
def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
                   subsample=(1, 1), conv_mode='conv'):
    """Build the graph computing the gradient of a cuDNN convolution
    with respect to its filters.

    :param img: convolution input image.
    :param topgrad: gradient flowing back from the convolution output.
    :param kerns_shp: shape of the filters whose gradient is computed.
    """
    ctx_name = infer_context_name(img, topgrad)
    kerns_shp = as_tensor_variable(kerns_shp)
    # cuDNN requires contiguous inputs.
    img = gpu_contiguous(img)
    topgrad = gpu_contiguous(topgrad)
    desc = GpuDnnConvDesc(border_mode=border_mode,
                          subsample=subsample,
                          conv_mode=conv_mode)(kerns_shp)
    # Pre-allocate the output buffer with the filter shape.
    out = GpuAllocEmpty(img.dtype, ctx_name)(*kerns_shp)
    return GpuDnnConvGradW()(img, topgrad, out, desc)
def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
                  subsample=(1, 1), conv_mode='conv'):
    """Build the graph computing the gradient of a cuDNN convolution
    with respect to its input image.

    :param kerns: convolution filters.
    :param topgrad: gradient flowing back from the convolution output.
    :param img_shp: shape of the image whose gradient is computed.
    """
    ctx_name = infer_context_name(kerns, topgrad)
    img_shp = as_tensor_variable(img_shp)
    # cuDNN requires contiguous inputs.
    kerns = gpu_contiguous(kerns)
    topgrad = gpu_contiguous(topgrad)
    # The convolution descriptor is built from the (symbolic) filter shape.
    desc = GpuDnnConvDesc(border_mode=border_mode,
                          subsample=subsample,
                          conv_mode=conv_mode)(kerns.shape)
    # Pre-allocate the output buffer with the image shape.
    out = GpuAllocEmpty(kerns.dtype, ctx_name)(*img_shp)
    return GpuDnnConvGradI()(kerns, topgrad, out, desc)
class GpuDnnPoolDesc(Op):
"""
This Op builds a pooling descriptor for use in the other
......@@ -1188,57 +1215,53 @@ class GpuDnnSoftmaxGrad(GpuDnnSoftmaxBase):
return Apply(self, [dy, sm], [sm.type()])
# @register_opt('cudnn') # this optimizer is registered in opt.py instead.
@local_optimizer([GpuConv])
def local_conv_dnn(node):
    """Replace a GpuConv node by the equivalent cuDNN convolution,
    when cuDNN is available and the border mode is supported."""
    if not isinstance(node.op, GpuConv):
        return
    if not dnn_available(node.outputs[0].type.context_name):
        return
    if node.op.border_mode not in ['full', 'valid']:
        return
    img, kern = node.inputs
    replacement = dnn_conv(img, kern,
                           border_mode=node.op.border_mode,
                           subsample=node.op.subsample,
                           direction_hint=node.op.direction_hint)
    return [replacement]
# This optimizer is registered in opt.py as part of the meta-optimizer.
# It tries exactly the opposite code path of what local_conv_dnn() uses,
# because for some input/kernel shape configurations, this is faster.
@local_optimizer([GpuConv])
def local_conv_dnn_alternative(node):
    """Like local_conv_dnn, but steers cuDNN toward the opposite
    algorithm (forward vs. backward pass) for the same convolution."""
    if not isinstance(node.op, GpuConv):
        return
    if not dnn_available(node.outputs[0].type.context_name):
        return
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    if border_mode not in ['full', 'valid'] or subsample != (1, 1):
        return
    img, kern = node.inputs
    direction_hint = node.op.direction_hint
    if border_mode == 'full':
        # for a full convolution, try using the forward pass instead
        # of the backward pass wrt. inputs
        direction_hint = 'forward!'
    else:  # border_mode == 'valid' (guaranteed by the guard above)
        # for a valid convolution, try using the backward pass wrt.
        # weights instead of the forward pass and vice versa
        if direction_hint == 'bprop weights':
            direction_hint = 'forward'
        else:
            direction_hint = 'bprop weights'
    replacement = dnn_conv(img, kern,
                           border_mode=border_mode,
                           subsample=subsample,
                           direction_hint=direction_hint)
    return [replacement]
@local_optimizer([AbstractConv2d, AbstractConv2d_gradWeights,
                  AbstractConv2d_gradInputs])
def local_abstractconv_cudnn(node):
    """Map the abstract conv2d ops onto their cuDNN implementations.

    Fires only when both tensor inputs already live on the GPU and cuDNN
    is available in that context.  filter_flip=True maps to cuDNN's
    'conv' mode, False to 'cross' (correlation).
    """
    if not isinstance(node.op, (AbstractConv2d,
                                AbstractConv2d_gradWeights,
                                AbstractConv2d_gradInputs)):
        return None
    inp1 = node.inputs[0]
    inp2 = node.inputs[1]
    if (not isinstance(inp1.type, GpuArrayType) or
            not isinstance(inp2.type, GpuArrayType)):
        return None
    if not dnn_available(inp1.type.context_name):
        return None

    if node.op.filter_flip:
        conv_mode = 'conv'
    else:
        conv_mode = 'cross'
    if isinstance(node.op, AbstractConv2d):
        rval = dnn_conv(inp1, inp2,
                        border_mode=node.op.border_mode,
                        subsample=node.op.subsample,
                        direction_hint='forward!',
                        conv_mode=conv_mode)
    if isinstance(node.op, AbstractConv2d_gradWeights):
        # filter shape: (nb output channels, nb input channels, h, w)
        shape = (inp2.shape[1], inp1.shape[1],
                 node.inputs[2][0], node.inputs[2][1])
        rval = dnn_gradweight(inp1, inp2, shape,
                              border_mode=node.op.border_mode,
                              subsample=node.op.subsample,
                              conv_mode=conv_mode)
    if isinstance(node.op, AbstractConv2d_gradInputs):
        # image shape: (batch, nb input channels, h, w)
        shape = (inp2.shape[0], inp1.shape[1],
                 node.inputs[2][0], node.inputs[2][1])
        rval = dnn_gradinput(inp1, inp2, shape,
                             border_mode=node.op.border_mode,
                             subsample=node.op.subsample,
                             conv_mode=conv_mode)
    return [rval]

# BUG FIX: in the corrupted dump the local_conv_dnn registration was wedged
# inside the function body above (making the rest of the function dead code).
# Both registrations belong at module level, after the optimizers they name.
conv_groupopt.register('local_conv_dnn', local_conv_dnn, 20,
                       'conv_dnn', 'fast_compile', 'fast_run', 'cudnn')
conv_groupopt.register('local_abstractconv_cudnn',
                       local_abstractconv_cudnn, 20,
                       'fast_compile', 'fast_run',
                       'gpuarray', 'conv_dnn', 'cudnn')
@inplace_allocempty(GpuDnnConv, 2)
......
......@@ -14,7 +14,13 @@ from theano.gof.optdb import LocalGroupDB
from theano.scalar.basic import Scalar, Pow, Cast
from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor import as_tensor_variable
from theano.tensor.nnet.conv import ConvOp
from theano.tensor.nnet.abstract_conv import (BaseAbstractConv2d,
AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs)
from theano.tests.breakpoint import PdbBreakpoint
from .type import (GpuArrayType, GpuArrayConstant, get_context,
......@@ -27,7 +33,6 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
GpuEye, gpu_join, GpuJoin)
from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer,
gpugemm_no_inplace)
from .conv import GpuConv
from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmaxWithBias, GpuSoftmax)
......@@ -786,77 +791,49 @@ def local_assert(node, context_name):
@register_opt('fast_compile')
@op_lifter([ConvOp])
def local_gpu_conv(node, context_name):
# Lift a CPU ConvOp to a GpuConv in the given GPU context, bailing out
# when the logical kernel shape differs from the physical one.
def GpuConvOp_from_ConvOp(op):
logical_img_hw = None
if op.kshp_logical is not None and op.kshp_logical != op.kshp:
return None
# NOTE(review): logical_img_hw is still None when GpuConv is built here
# and is only reassigned below (affecting just make_graph's closure);
# this ordering looks like diff reordering -- confirm against upstream.
ret = GpuConv(border_mode=op.out_mode,
subsample=(op.dx, op.dy),
logical_img_hw=logical_img_hw,
logical_kern_hw=op.kshp_logical,
logical_kern_align_top=op.kshp_logical_top_aligned,
kshp=op.kshp,
version=op.version,
direction_hint=op.direction_hint,
verbose=op.verbose,
imshp=op.imshp,
nkern=op.nkern,
bsize=op.bsize,
fft_opt=op.fft_opt)
if op.imshp_logical is not None:
logical_img_hw = op.imshp_logical[1:3]
if logical_img_hw != op.imshp[1:3]:
# The logical image is larger than the physical one: upsample the
# physical image by zero-filling before convolving.
rstride = int(numpy.ceil(op.imshp_logical[1] /
float(op.imshp[1])))
cstride = int(numpy.ceil(op.imshp_logical[2] /
float(op.imshp[2])))
def make_graph(img, kern):
buf = tensor.alloc(numpy.asarray(0, dtype=img.dtype),
img.shape[0], *op.imshp_logical)
img = tensor.set_subtensor(buf[:, :, ::rstride, ::cstride],
img)
img = GpuFromHost(context_name)(img)
return ret(img, kern)
return make_graph
return ret
def values_eq_approx(a, b):
"""
Comparison helper so DebugMode does not raise spurious errors from
rounding.  The convolution reduces over the two last dimensions, so
the absolute error grows when the number of reduced elements is
significant; loosen atol accordingly.
"""
assert a.ndim == 4
atol = None
if a.shape[-1] * a.shape[-2] > 100:
# For float32 the default atol is 1e-5
atol = 3e-5
return GpuArrayType.values_eq_approx(a, b, atol=atol)
img, kern = node.inputs
gpu_conv = GpuConvOp_from_ConvOp(node.op)
if gpu_conv is None:
return
out = gpu_conv(GpuFromHost(context_name)(img),
GpuFromHost(context_name)(kern))
assert isinstance(out.type, GpuArrayType)
# Make sure to keep the broadcastable pattern of the original
# convolution even if we might gain or lose some due to different
# information at the node level.
out = tensor.patternbroadcast(out, node.outputs[0].broadcastable)
out.values_eq_approx = values_eq_approx
return [out]
# Register this here so that it goes after 'local_gpu_conv'
def local_error_convop(node, context_name):
    """Abort graph optimization with an explanatory error when a legacy
    ConvOp reaches the gpuarray backend, which does not support it."""
    # Use an explicit raise rather than `assert False`: assertions are
    # stripped when Python runs with -O, which would silently skip this
    # error.  AssertionError is kept so existing callers catch the same
    # exception type.
    raise AssertionError("""
ConvOp does not work with the gpuarray backend.
Use the new convolution interface to have GPU convolution working:
theano.tensor.nnet.conv2d()
""")
# This deals with any abstract convs that have a transfer somewhere
@register_opt('fast_compile')
@op_lifter([AbstractConv2d,
            AbstractConv2d_gradWeights,
            AbstractConv2d_gradInputs])
def local_lift_abstractconv2d(node, context_name):
    """Move the two tensor inputs of an abstract conv to the GPU context
    and rebuild the op on the transferred inputs."""
    new_inputs = list(node.inputs)
    new_inputs[0] = as_gpuarray_variable(new_inputs[0],
                                         context_name=context_name)
    new_inputs[1] = as_gpuarray_variable(new_inputs[1],
                                         context_name=context_name)
    return [node.op(*new_inputs)]
# This will deal with ops that don't have an explicit transfer but
# have one of their inputs on the GPU already and the other not on the
# GPU (to avoid endlessly replacing things).
@register_opt('fast_compile')
@local_optimizer([AbstractConv2d,
                  AbstractConv2d_gradWeights,
                  AbstractConv2d_gradInputs])
def local_gpu_abstractconv2d(node):
    """Transfer both tensor inputs of an abstract conv to the GPU when
    exactly one of them is already there."""
    if not isinstance(node.op, BaseAbstractConv2d):
        return
    on_gpu = [isinstance(inp.type, GpuArrayType)
              for inp in node.inputs[:2]]
    # BUG FIX: the original condition was `(A or B) and not (A or B)`,
    # which is always False, so this optimizer could never fire.  Per the
    # comment above, the intent is "at least one input on the GPU, but
    # not both", i.e. `(A or B) and not (A and B)`.
    if any(on_gpu) and not all(on_gpu):
        inps = list(node.inputs)
        ctx_name = infer_context_name(inps[0], inps[1])
        inps[0] = as_gpuarray_variable(inps[0], context_name=ctx_name)
        inps[1] = as_gpuarray_variable(inps[1], context_name=ctx_name)
        return as_tensor_variable(node.op(*inps))
# Register this here so that it goes after the abstract lifting
register_opt()(conv_groupopt)
......
import unittest
import numpy
import itertools
from nose.plugins.skip import SkipTest
import theano
from theano.tests import unittest_tools as utt
import theano.tensor.nnet.abstract_conv as conv
from theano.compile import shared as cpu_shared
from ..type import gpuarray_shared_constructor as gpu_shared
from ..dnn import dnn_available, dnn_conv, dnn_gradweight, dnn_gradinput
from .config import mode_with_gpu, mode_without_gpu, test_ctx_name
class TestConv2d(unittest.TestCase):
# Shared fixtures for the convolution tests.  inputs_shapes and
# filters_shapes are paired index-by-index (the test methods zip them);
# shapes are (batch, channels, rows, cols) / (nfilters, channels, rows,
# cols).  border_modes mixes symbolic modes with explicit padding pairs.
def setUp(self):
super(TestConv2d, self).setUp()
self.inputs_shapes = [(8, 1, 12, 12), (8, 1, 18, 18), (2, 1, 4, 4),
(6, 1, 10, 11), (2, 1, 6, 5), (1, 5, 9, 9)]
self.filters_shapes = [(5, 1, 2, 2), (4, 1, 3, 3), (2, 1, 3, 3),
(1, 1, 2, 5), (4, 1, 2, 2), (4, 5, 2, 2)]
self.subsamples = [(1, 1), (2, 2), (2, 4)]
self.border_modes = ["valid", "full", (0, 0), (1, 1), (5, 5), (5, 2)]
self.filter_flip = [True, False]
def get_output_shape(self, inputs_shape, filters_shape, subsample,
                     border_mode):
    """Return the expected conv2d output shape.

    "valid"/"full" are first translated into explicit (pad_r, pad_c)
    pairs; each spatial dimension then follows
    (in + 2*pad - kern) // step + 1, with None propagated for unknown
    dimensions.
    """
    if border_mode == "valid":
        border_mode = (0, 0)
    if border_mode == "full":
        border_mode = (filters_shape[2] - 1, filters_shape[3] - 1)
    batch_size = inputs_shape[0]
    num_filters = filters_shape[0]
    spatial = tuple(
        None if in_dim is None or k_dim is None
        else (in_dim + 2 * pad - k_dim) // step + 1
        for in_dim, k_dim, step, pad in zip(inputs_shape[2:],
                                            filters_shape[2:],
                                            subsample, border_mode))
    return (batch_size, num_filters) + spatial
def run_fwd(self, inputs_shape, filters_shape, ref=dnn_conv,
            subsample=(1, 1), verify_grad=True, mode=mode_without_gpu,
            border_mode='valid', filter_flip=True, device='cpu',
            provide_shape=False):
    """Run a forward convolution and compare it against `ref`.

    Builds the same convolution twice -- once via `ref` (dnn_conv by
    default) and once via the abstract conv2d interface -- compiles both
    with `mode`, and checks the outputs agree.  Optionally verifies the
    gradient of the abstract op.
    """
    inputs_val = numpy.random.random(inputs_shape).astype('float32')
    filters_val = numpy.random.random(filters_shape).astype('float32')
    # Place the data on the requested device via shared variables.
    if device == 'gpu':
        inputs = gpu_shared(inputs_val)
        filters = gpu_shared(filters_val)
    else:
        inputs = theano.tensor.as_tensor_variable(cpu_shared(inputs_val))
        filters = theano.tensor.as_tensor_variable(cpu_shared(filters_val))
    # Optionally give the op static shape information.
    if provide_shape:
        imshp = inputs_shape
        kshp = filters_shape
    else:
        imshp = None
        kshp = None
    # filter_flip=True corresponds to cuDNN's 'conv' mode, False to 'cross'.
    if filter_flip:
        conv_mode = 'conv'
    else:
        conv_mode = 'cross'
    c_ref = ref(inputs, filters,
                border_mode=border_mode,
                subsample=subsample,
                conv_mode=conv_mode)
    c = conv.conv2d(inputs, filters,
                    border_mode=border_mode,
                    subsample=subsample,
                    filter_flip=filter_flip,
                    input_shape=imshp,
                    filter_shape=kshp)
    f_ref = theano.function([], c_ref, mode=mode)
    f = theano.function([], c, mode)
    res_ref = numpy.array(f_ref())
    res = numpy.array(f())
    utt.assert_allclose(res_ref, res)
    if verify_grad:
        # BUG FIX: the gradient used to be verified with a hard-coded
        # border_mode="valid" and the default (flipped) filters, no matter
        # which configuration was under test.  Verify the configuration
        # actually being tested instead.
        utt.verify_grad(conv.AbstractConv2d(border_mode=border_mode,
                                            imshp=imshp, kshp=kshp,
                                            subsample=subsample,
                                            filter_flip=filter_flip),
                        [inputs_val, filters_val],
                        mode=mode)
def run_gradweight(self, inputs_shape, filters_shape, output_shape,
ref=dnn_gradweight, subsample=(1, 1), filter_flip=True,
verify_grad=True, mode=mode_without_gpu, border_mode='valid',
device='cpu', provide_shape=False):
# Build the gradient-wrt-weights graph twice -- once via the abstract
# AbstractConv2d_gradWeights op and once via `ref` (dnn_gradweight by
# default) -- compile both with `mode`, and check the results agree.
inputs_val = numpy.random.random(inputs_shape).astype('float32')
output_val = numpy.random.random(output_shape).astype('float32')
# Place the data on the requested device via shared variables.
if device == 'gpu':
inputs = gpu_shared(inputs_val)
output = gpu_shared(output_val)
else:
inputs = theano.tensor.as_tensor_variable(cpu_shared(inputs_val))
output = theano.tensor.as_tensor_variable(cpu_shared(output_val))
# Optionally give the op static shape information.
if provide_shape:
imshp = inputs_shape
kshp = filters_shape
else:
imshp = None
kshp = None
# filter_flip=True corresponds to cuDNN's 'conv' mode, False to 'cross'.
if filter_flip:
conv_mode = 'conv'
else:
conv_mode = 'cross'
c = conv.AbstractConv2d_gradWeights(border_mode=border_mode,
filter_flip=filter_flip,
subsample=subsample,
imshp=imshp, kshp=kshp)
# The op takes (inputs, topgrad, filter spatial shape).
c = c(inputs, output, filters_shape[-2:])
c_ref = ref(inputs, output,
filters_shape,
border_mode=border_mode,
subsample=subsample,
conv_mode=conv_mode)
f = theano.function([], c, mode)
f_ref = theano.function([], c_ref, mode)
res_ref = numpy.array(f_ref())
res = numpy.array(f())
utt.assert_allclose(res_ref, res)
# NOTE(review): this closure drops filter_flip and the static shapes,
# so verify_grad only checks the default (flipped, shapeless)
# configuration -- confirm this is intended.
def abstract_conv2d_gradweight(inputs_val, output_val):
conv_op = conv.AbstractConv2d_gradWeights(border_mode=border_mode, subsample=subsample)
return conv_op(inputs_val, output_val, filters_shape[-2:])
if verify_grad:
utt.verify_grad(abstract_conv2d_gradweight, [inputs_val, output_val],
mode=mode, eps=1)
def run_gradinput(self, inputs_shape, filters_shape, output_shape, ref=dnn_gradinput,
subsample=(1, 1), filter_flip=True, verify_grad=True, mode=mode_without_gpu,
border_mode='valid', device='cpu', provide_shape=False):
# Build the gradient-wrt-inputs graph twice -- once via the abstract
# AbstractConv2d_gradInputs op and once via `ref` (dnn_gradinput by
# default) -- compile both with `mode`, and check the results agree.
output_val = numpy.random.random(output_shape).astype('float32')
filters_val = numpy.random.random(filters_shape).astype('float32')
# Place the data on the requested device via shared variables.
if device == 'gpu':
output = gpu_shared(output_val)
filters = gpu_shared(filters_val)
else:
output = theano.tensor.as_tensor_variable(cpu_shared(output_val))
filters = theano.tensor.as_tensor_variable(cpu_shared(filters_val))
# Optionally give the op static shape information.
if provide_shape:
imshp = inputs_shape
kshp = filters_shape
else:
imshp = None
kshp = None
# filter_flip=True corresponds to cuDNN's 'conv' mode, False to 'cross'.
if filter_flip:
conv_mode = 'conv'
else:
conv_mode = 'cross'
c = conv.AbstractConv2d_gradInputs(border_mode=border_mode,
subsample=subsample,
filter_flip=filter_flip,
imshp=imshp, kshp=kshp)
# The op takes (filters, topgrad, image spatial shape).
c = c(filters, output, inputs_shape[-2:])
c_ref = ref(filters, output, inputs_shape,
border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode)
f = theano.function([], c, mode)
f_ref = theano.function([], c_ref, mode)
res_ref = numpy.array(f_ref())
res = numpy.array(f())
utt.assert_allclose(res_ref, res)
# NOTE(review): this closure drops filter_flip and the static shapes,
# so verify_grad only checks the default (flipped, shapeless)
# configuration -- confirm this is intended.
def abstract_conv2d_gradinputs(filters_val, output_val):
conv_op = conv.AbstractConv2d_gradInputs(border_mode=border_mode, subsample=subsample)
return conv_op(filters_val, output_val, inputs_shape[-2:])
if verify_grad:
utt.verify_grad(abstract_conv2d_gradinputs, [filters_val, output_val],
mode=mode, eps=1)
def test_dnn_conv(self):
    """Exercise fwd/gradweight/gradinput on the GPU via cuDNN for every
    shape/subsample/border/flip combination."""
    if not dnn_available(test_ctx_name):
        raise SkipTest(dnn_available.msg)
    mode = mode_with_gpu
    # provide_shape is not used by the cuDNN implementation
    provide_shape = False
    combos = itertools.product(
        zip(self.inputs_shapes, self.filters_shapes),
        self.subsamples,
        self.border_modes,
        self.filter_flip)
    for (in_shp, flt_shp), sub, border, flip in combos:
        out_shp = self.get_output_shape(in_shp, flt_shp, sub, border)
        common = dict(verify_grad=True, mode=mode, device='gpu',
                      provide_shape=provide_shape, border_mode=border,
                      filter_flip=flip)
        self.run_fwd(inputs_shape=in_shp, filters_shape=flt_shp,
                     subsample=sub, **common)
        self.run_gradweight(inputs_shape=in_shp, filters_shape=flt_shp,
                            output_shape=out_shp, subsample=sub, **common)
        self.run_gradinput(inputs_shape=in_shp, filters_shape=flt_shp,
                           output_shape=out_shp, subsample=sub, **common)
def test_cormm_conv(self):
    """Exercise fwd/gradweight/gradinput on the CPU (corrMM path),
    compared against the cuDNN reference -- hence the cuDNN skip guard."""
    if not dnn_available(test_ctx_name):
        raise SkipTest(dnn_available.msg)
    mode = mode_without_gpu
    combos = itertools.product(
        zip(self.inputs_shapes, self.filters_shapes),
        self.subsamples,
        self.border_modes,
        self.filter_flip,
        [False, True])
    for (in_shp, flt_shp), sub, border, flip, provide_shape in combos:
        out_shp = self.get_output_shape(in_shp, flt_shp, sub, border)
        common = dict(verify_grad=True, mode=mode, device='cpu',
                      provide_shape=provide_shape, border_mode=border,
                      filter_flip=flip)
        self.run_fwd(inputs_shape=in_shp, filters_shape=flt_shp,
                     subsample=sub, **common)
        self.run_gradweight(inputs_shape=in_shp, filters_shape=flt_shp,
                            output_shape=out_shp, subsample=sub, **common)
        self.run_gradinput(inputs_shape=in_shp, filters_shape=flt_shp,
                           output_shape=out_shp, subsample=sub, **common)
def test_cpu_conv(self):
    """Exercise the plain CPU convolution path (conv_gemm excluded),
    checking that unsupported configurations raise NotImplementedError."""
    if not dnn_available(test_ctx_name):
        raise SkipTest(dnn_available.msg)
    mode = mode_without_gpu.excluding('conv_gemm')
    combos = itertools.product(
        zip(self.inputs_shapes, self.filters_shapes),
        self.subsamples,
        self.border_modes,
        self.filter_flip,
        [False, True])
    for (in_shp, flt_shp), sub, border, flip, provide_shape in combos:
        out_shp = self.get_output_shape(in_shp, flt_shp, sub, border)
        # The legacy CPU path only handles flipped filters and the
        # symbolic 'valid'/'full' modes.
        fwd_ok = flip and border in ('valid', 'full')
        # 'full' gradients additionally need shapes when subsampling,
        # and only support subsample factors 1 and 2.
        grad_ok = fwd_ok
        if border == 'full':
            if (not provide_shape) and sub != (1, 1):
                grad_ok = False
            if sub[0] not in (1, 2) or sub[1] not in (1, 2):
                grad_ok = False
        kwargs = dict(mode=mode, device='cpu',
                      provide_shape=provide_shape, border_mode=border,
                      filter_flip=flip)
        if fwd_ok:
            self.run_fwd(inputs_shape=in_shp, filters_shape=flt_shp,
                         subsample=sub, verify_grad=True, **kwargs)
        else:
            self.assertRaises(NotImplementedError, self.run_fwd,
                              inputs_shape=in_shp, filters_shape=flt_shp,
                              subsample=sub, verify_grad=False, **kwargs)
        if grad_ok:
            self.run_gradweight(inputs_shape=in_shp, filters_shape=flt_shp,
                                output_shape=out_shp, subsample=sub,
                                verify_grad=False, **kwargs)
        else:
            self.assertRaises(NotImplementedError, self.run_gradweight,
                              inputs_shape=in_shp, filters_shape=flt_shp,
                              output_shape=out_shp, subsample=sub,
                              verify_grad=False, **kwargs)
        if grad_ok:
            self.run_gradinput(inputs_shape=in_shp, filters_shape=flt_shp,
                               output_shape=out_shp, subsample=sub,
                               verify_grad=False, **kwargs)
        else:
            self.assertRaises(NotImplementedError, self.run_gradinput,
                              inputs_shape=in_shp, filters_shape=flt_shp,
                              output_shape=out_shp, subsample=sub,
                              verify_grad=False, **kwargs)
"""
Tests for GPU convolution
"""
from __future__ import print_function
import sys
import time
import unittest
import numpy
from six.moves import xrange
import theano
from theano import tensor
from theano.tests.unittest_tools import seed_rng
# We let that import do the init of the back-end if needed.
from .config import mode_with_gpu, test_ctx_name
from ..type import GpuArrayType, get_context
from ..conv import GpuConv
from theano.sandbox.gpuarray import dnn
import pygpu
imported_scipy_convolve2d = False
try:
from scipy.signal import convolve2d
imported_scipy_convolve2d = True
except ImportError:
pass
gftensor4 = GpuArrayType('float32', [False] * 4, context_name=test_ctx_name)
def py_conv_valid_numpy(img, kern):
    """Reference 'valid'-mode 2D convolution in pure numpy.

    :param img: 4d array (batch, channels, rows, cols).
    :param kern: 4d array (nkern, channels, kern_rows, kern_cols); must
        have the same number of channels as ``img``.
    :return: float32 array of shape
        (batch, nkern, rows - kern_rows + 1, cols - kern_cols + 1).
    """
    assert img.shape[1] == kern.shape[1]
    outshp = (img.shape[0], kern.shape[0],
              img.shape[2] - kern.shape[2] + 1,
              img.shape[3] - kern.shape[3] + 1)
    out = numpy.zeros(outshp, dtype='float32')
    # Builtin range behaves like six.moves.xrange for iteration, and
    # removes this helper's dependency on the six package.
    for b in range(out.shape[0]):
        for k in range(out.shape[1]):
            for rr in range(out.shape[2]):
                for cc in range(out.shape[3]):
                    # rr, cc is the upper-left corner of img patches
                    imgpatch = img[b, :, rr:rr + kern.shape[2],
                                   cc:cc + kern.shape[3]]
                    # Flip the patch spatially (not the kernel) so this
                    # is a true convolution, not a correlation.
                    innerprod = (imgpatch[:, ::-1, ::-1] *
                                 kern[k, :, :, :]).sum()
                    out[b, k, rr, cc] = innerprod
    return out
def py_conv_full_numpy(img, kern):
    """Reference 'full'-mode convolution: zero-pad the image by
    (kernel size - 1) on every side, then run the 'valid' version."""
    pad_r = kern.shape[2] - 1
    pad_c = kern.shape[3] - 1
    padded_shape = (img.shape[0], img.shape[1],
                    img.shape[2] + 2 * pad_r,
                    img.shape[3] + 2 * pad_c)
    padded_img = numpy.zeros(padded_shape, dtype=img.dtype)
    padded_img[:, :, pad_r:pad_r + img.shape[2],
               pad_c:pad_c + img.shape[3]] = img
    return py_conv_valid_numpy(padded_img, kern)
def py_conv(img, kern, mode, subsample):
    """
    Dispatch to a scipy or numpy reference implementation, depending on
    whether scipy could be imported.  The scipy version is faster.
    """
    if imported_scipy_convolve2d:
        return py_conv_scipy(img, kern, mode, subsample)
    if mode == 'valid':
        full_out = py_conv_valid_numpy(img, kern)
    elif mode == 'full':
        full_out = py_conv_full_numpy(img, kern)
    else:
        raise Exception("Can't execute this kernel.")
    # Subsampling is applied on the dense output.
    return full_out[:, :, ::subsample[0], ::subsample[1]]
def py_conv_scipy(img, kern, mode, subsample):
    """Reference convolution built on ``scipy.signal.convolve2d``.

    :param img: 4d array (batch, channels, rows, cols).
    :param kern: 4d array (nkern, channels, kern_rows, kern_cols); must
        have the same number of channels as ``img``.
    :param mode: 'valid' or 'full', forwarded to ``convolve2d``.
    :param subsample: (row_stride, col_stride) applied to the output.
    :return: float32 array (batch, nkern, out_rows, out_cols).
    """
    assert img.shape[1] == kern.shape[1]
    if mode == 'valid':
        outshp = (img.shape[0], kern.shape[0],
                  img.shape[2] - kern.shape[2] + 1,
                  img.shape[3] - kern.shape[3] + 1)
    else:
        outshp = (img.shape[0], kern.shape[0],
                  img.shape[2] + kern.shape[2] - 1,
                  img.shape[3] + kern.shape[3] - 1)
    out = numpy.zeros(outshp, dtype='float32')
    # Builtin range replaces six.moves.xrange: equivalent for iteration,
    # and drops this helper's dependency on the six package.
    for b in range(out.shape[0]):
        for k in range(out.shape[1]):
            # Accumulate the per-channel 2D convolutions.
            for s in range(img.shape[1]):
                out[b, k, :, :] += convolve2d(img[b, s, :, :],
                                              kern[k, s, :, :],
                                              mode)
    return out[:, :, ::subsample[0], ::subsample[1]]
def _params_allgood_header():
    """Print the column header for verbose ``_params_allgood`` runs."""
    header = "ishape kshape #Mflops CPU Mflops GPU Mflops Speedup"
    print(header)
def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
                    kern_stride=(1, 1), version=-1, verbose=0, random=True,
                    print_=None, id=None, rtol=1e-5, atol=1e-8,
                    nb_iter=0, ones=False, compile_kshp=None):
    """Run one GpuConv case and compare it against the CPU reference.

    Builds image/kernel data according to ``ones``/``random``, computes
    the reference with ``py_conv`` on the CPU and ``GpuConv`` on the GPU,
    and returns True when the two outputs are allclose (``rtol``), False
    otherwise.  With ``verbose > 0`` it also prints rough Mflops timings.
    """
    #
    # This function is the core of several of the big unit-test drivers,
    # but it can also be used very directly on its own to test a specific
    # kind of convolution.
    #
    # See `test_example` (above) for an example of how to use this directly.
    #
    # :param kshape: (4d)The shape of the kernel at run time.
    # :param compile_kshp: (2d) hardcode the shape of the kernel in
    #                      the generated code This is supposed to be
    #                      faster, but we need to check That we raise
    #                      an error if the input have the wrong shape.
    #
    if ones:
        assert not random
        npy_img = theano._asarray(numpy.ones(ishape), dtype='float32')
        npy_kern = -theano._asarray(numpy.ones(kshape), dtype='float32')
    elif random:
        npy_img = theano._asarray(numpy.random.rand(*ishape) + 1,
                                  dtype='float32')
        npy_kern = theano._asarray(numpy.random.rand(*kshape) - 2,
                                   dtype='float32')
    else:
        # Deterministic data: positive ramp for the image, negative
        # ramp for the kernel.
        npy_img = theano._asarray(numpy.arange(
            numpy.prod(ishape)).reshape(ishape), dtype='float32') + 1
        npy_kern = -(theano._asarray(numpy.arange(
            numpy.prod(kshape)).reshape(kshape), dtype='float32') + 1)
    img = pygpu.array(npy_img, context=get_context(test_ctx_name))
    kern = pygpu.array(npy_kern, context=get_context(test_ctx_name))
    # we take the stride after the transfer as we make c_contiguous
    # data on the GPU.
    if img_stride != (1, 1):
        img = img[:, :, ::img_stride[0], ::img_stride[1]]
        npy_img = npy_img[:, :, ::img_stride[0], ::img_stride[1]]
    if kern_stride != (1, 1):
        kern = kern[:, :, ::kern_stride[0], ::kern_stride[1]]
        npy_kern = npy_kern[:, :, ::kern_stride[0], ::kern_stride[1]]
    # t2 stays None when the GPU run raised before completing; the timing
    # report below is skipped in that case.
    t2 = None
    rval = True
    try:
        t0 = time.time()
        cpuval = py_conv(npy_img, npy_kern, mode, subsample)
        t1 = time.time()
        i = gftensor4()
        k = gftensor4()
        op = GpuConv(border_mode=mode,
                     subsample=subsample,
                     version=version,
                     verbose=verbose,
                     kshp=compile_kshp)(i, k)
        f = theano.function([i, k], op, mode=mode_with_gpu)
        gpuval = f(img, kern)
        t2 = time.time()
        # NOTE(review): `i` is reused here as a loop index, shadowing the
        # theano input variable created above; harmless but confusing.
        # Repeated runs must be bitwise identical (determinism check).
        for i in range(nb_iter):
            gpuval2 = f(img, kern)
            assert numpy.allclose(numpy.asarray(gpuval),
                                  numpy.asarray(gpuval2))
            assert (numpy.asarray(gpuval) == numpy.asarray(gpuval2)).all()
        gpuval = numpy.asarray(gpuval)
        if gpuval.shape != cpuval.shape:
            print("ERROR: shape mismatch", end=' ', file=sys.stdout)
            print(gpuval.shape, cpuval.shape, file=sys.stdout)
            rval = False
        if rval:
            rval = numpy.allclose(cpuval, gpuval, rtol=rtol)
            assert numpy.all(numpy.isfinite(gpuval))
    except NotImplementedError as e:
        # NOTE(review): the message says 'Failed allclose' but this branch
        # catches NotImplementedError raised while building or running the
        # op, not a comparison failure — consider rewording.
        print('_params_allgood Failed allclose', e, file=sys.stdout)
        rval = False
    if (t2 is not None):
        # Rough flop estimate, only used for the verbose speed report.
        if mode == 'valid':
            approx_fp = cpuval.size * ishape[1] * kshape[2] * kshape[3] * 2
        else:
            approx_fp = (ishape[0] * kshape[0] * kshape[1] * kshape[2] *
                         kshape[3] * ishape[2] * ishape[3] * 2)
        approx_fp /= 1e6
        cpu_mflops = approx_fp / (t1 - t0)
        gpu_mflops = approx_fp / (t2 - t1)
        if verbose > 0:
            print('%15s' % str(ishape), '%15s' % str(kshape), end=' ',
                  file=sys.stdout)
            print('%12.5f %7.2f %7.2f %7.1f' %
                  (approx_fp, cpu_mflops, gpu_mflops, (t1 - t0) / (t2 - t1)),
                  file=sys.stdout)
        if not rval:
            # Summarize how far apart the CPU and GPU results are.
            print('test_' + mode + ' id=' + str(id) +
                  ' FAILED for ishape, kshape, mode, subsample,' +
                  ' img_stride, kern_stride, version', ishape,
                  kshape, mode, subsample, img_stride, kern_stride,
                  version, file=sys.stdout)
            diff = cpuval - gpuval
            diffabs = numpy.absolute(diff)
            pr_diff = diffabs / numpy.absolute(cpuval)
            nb_close = (diffabs <= (atol + rtol * numpy.absolute(gpuval))).sum()
            print("max absolute diff:", (diffabs.max(), "avg abs diff:",
                                         numpy.average(diffabs)))
            print("median abs diff:", (numpy.median(diffabs), "nb close:",
                                       nb_close, "/", diff.size))
            print("max relatif diff:", (pr_diff.max(), "avg rel diff:",
                                        numpy.average(pr_diff)))
        if not rval and print_ is not False:
            # Dump the raw data; for big batches only the first sample.
            if npy_img.shape[0] > 5:
                print("img", npy_img[0])
                print("kern", npy_kern[0])
                print("gpu", gpuval[0][0])
                print("cpu", cpuval[0][0])
                print("diff", diff[0][0])
            else:
                print("img", npy_img)
                print("kern", npy_kern)
                print("gpu", gpuval)
                print("cpu", cpuval)
                print("diff", diff)
    return rval
def exec_conv(version, shapes, verbose, random, mode,
              print_=None, rtol=1e-5, ones=False):
    """
    Run ``_params_allgood`` for every (kernel version, shape) pair,
    count the failures, and assert that none occurred.
    """
    if verbose > 0:
        _params_allgood_header()
    nb_failed = 0
    nb_tests = 0
    failed_version = set()
    failed_id = []
    # -1 stays in `version` in case we forget to add versions to a test.
    for ver in version:
        for id, shape_desc in enumerate(shapes):
            ishape, kshape, subshape, istride, kstride = shape_desc
            ret = False
            try:
                ret = _params_allgood(
                    ishape,
                    kshape,
                    mode,
                    subsample=subshape,
                    img_stride=istride,
                    kern_stride=kstride,
                    version=ver,
                    verbose=verbose,
                    random=random,
                    id=id,
                    print_=print_,
                    rtol=rtol,
                    ones=ones)
            except Exception as e:
                # Keep going: report the failing case and continue, the
                # failure is accounted for via `ret` staying False.
                print(ver, id, (ishape, kshape, subshape, istride, kstride))
                print(e)
            if not ret:
                failed_version.add(ver)
                failed_id.append(id)
                nb_failed += 1
            nb_tests += 1
    if nb_failed > 0:
        print("nb_failed", nb_failed, "on", nb_tests, end=' ')
        print("failed_version", failed_version, "failed_id", failed_id)
        assert nb_failed == 0, nb_failed
    else:
        print('Executed', nb_tests, 'different shapes')
def get_basic_shapes():
    """Basic (img shape, kern shape, subsample, img stride, kern stride)
    test cases with unit subsample and strides."""
    unit = (1, 1)
    # (image shape, kernel shape) pairs: squares first, then
    # non-square images, then non-square kernels.
    pairs = [((1, 1, 1, 1), (1, 1, 1, 1)),
             ((1, 1, 2, 2), (1, 1, 2, 2)),
             ((1, 1, 3, 3), (1, 1, 2, 2)),
             ((1, 1, 2, 4), (1, 1, 2, 2)),
             ((1, 1, 3, 4), (1, 1, 2, 2)),
             ((1, 1, 4, 3), (1, 1, 2, 2)),
             ((1, 1, 4, 4), (1, 1, 3, 2)),
             ((1, 1, 4, 4), (1, 1, 2, 3))]
    return [(ishp, kshp, unit, unit, unit) for ishp, kshp in pairs]
def get_shapes(imshp=(1, 1), kshp=(1, 1), subsample=(1, 1),
               img_stride=(1, 1), kern_stride=(1, 1)):
    """All combinations where one or more of stack size, batch size and
    nkern is > 1, using the given image shape, kernel shape and
    subsample shape."""
    # ((batch, stack) image prefix, (nkern, stack) kernel prefix)
    prefixes = [((1, 2), (1, 2)),  # stack only
                ((3, 1), (1, 1)),  # batch only
                ((1, 1), (2, 1)),  # nkern only
                ((3, 1), (2, 1)),  # batch and nkern
                ((3, 2), (1, 2)),  # batch and stack
                ((1, 2), (2, 2)),  # stack and nkern
                ((2, 2), (2, 2)),  # batch, nkern and stack
                ((3, 2), (4, 2))]  # batch, nkern and stack
    return [(ipre + imshp, kpre + kshp, subsample, img_stride, kern_stride)
            for ipre, kpre in prefixes]
def get_shapes2(scales_img=(1, 1), scales_kern=(1, 1), subsample=(1, 1),
                img_stride=(1, 1), kern_stride=(1, 1)):
    """Stack/batch/nkern combinations over several base image/kernel
    sizes, each size scaled elementwise by ``scales_img`` and
    ``scales_kern``."""
    # Base (image, kernel) 2d sizes before scaling: squares first,
    # then non-square images, then non-square kernels.
    base_sizes = [((1, 1), (1, 1)),
                  ((2, 2), (2, 2)),
                  ((3, 3), (2, 2)),
                  ((4, 3), (2, 2)),
                  ((3, 4), (2, 2)),
                  ((4, 4), (3, 2)),
                  ((4, 4), (2, 3))]
    shapes = []
    for (ih, iw), (kh, kw) in base_sizes:
        shapes += get_shapes((ih * scales_img[0], iw * scales_img[1]),
                             (kh * scales_kern[0], kw * scales_kern[1]),
                             subsample, img_stride, kern_stride)
    return shapes
def get_valid_shapes():
    """Return the shape descriptions used by the 'valid'-mode tests.

    Each entry is (img shape, kern shape, subsample shape, img stride,
    kern stride).
    """
    # img shape, kern shape, subsample shape
    shapes = get_basic_shapes()
    shapes += get_shapes2()
    # test image stride
    shapes += get_shapes2(scales_img=(2, 2), img_stride=(1, 2))
    shapes += get_shapes2(scales_img=(2, 2), img_stride=(2, 1))
    shapes += get_shapes2(scales_img=(2, 2), img_stride=(2, 2))
    # negative strides reverse the image / kernel along both axes
    shapes += get_shapes2(scales_img=(2, 2), img_stride=(-1, -1))
    shapes += get_shapes2(scales_img=(2, 2), kern_stride=(-1, -1))
    # test subsample done in a separate fct
    shapes += [
        # other test
        ((2, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
        ((3, 2, 4, 4), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),
        ((4, 1, 10, 10), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
        ((1, 1, 4, 4), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
        ((4, 1, 10, 10), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
        ((4, 1, 10, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
        ((4, 1, 20, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
        ((3, 2, 8, 8), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize
        ((3, 2, 8, 6), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize, non-square image
        ((3, 2, 8, 6), (4, 2, 4, 3), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize, non-square image, non-square kern
        ((3, 2, 8, 6), (4, 2, 4, 6), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
        ((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1)),  # a big one
        ((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)),  # MNIST LeNET layer 1
        ((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1)),  # layer 1 backprop to weights
        ((60, 20, 28, 28), (10, 20, 5, 5), (1, 1), (2, 2), (1, 1)),  # added a test case that fail from test_nnet.py.test_conv_nnet2
        ((10, 5, 28, 28), (10, 5, 5, 5), (1, 1), (2, 2), (1, 1)),  # test precedent but reduced that triger the error
        # Test more than maxThreadsDim0
        ((2, 4, 13, 1050), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)),
        ((2, 4, 1050, 13), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)),
    ]
    shapes += [((60, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)),  # test_lenet_28 1 layers
               ((60, 20, 12, 12), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)),  # test_lenet_28 2 layers
               ((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)),  # test_lenet_28 bprop 1 full
               ((20, 60, 12, 12), (30, 60, 8, 8), (1, 1), (1, 1), (1, 1)),  # test_lenet_28 bprop 2 valid
               ((10, 1, 64, 64), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)),  # test_lenet_64 1 layers
               ((10, 20, 29, 29), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)),  # test_lenet_64 2 layers
               ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)),  # test_lenet_64 full
               ]
    return shapes
def test_valid():
    """Compare GPU 'valid'-mode convolution against the CPU reference
    for every shape returned by ``get_valid_shapes``."""
    seed_rng()
    # `ones` can be flipped on for debugging with deterministic data;
    # random data is used whenever it is off.
    ones = False
    random = not ones
    exec_conv([-1], get_valid_shapes(), 0, random, 'valid',
              print_=False, ones=ones, rtol=1.1e-5)
def test_full():
    """Compare GPU 'full'-mode convolution against the CPU reference,
    including cases where the kernel is bigger than the image."""
    seed_rng()
    shapes = get_basic_shapes()
    shapes += get_shapes2()
    # test image stride
    shapes += get_shapes2(scales_img=(2, 2), img_stride=(1, 2))
    shapes += get_shapes2(scales_img=(2, 2), img_stride=(2, 1))
    shapes += get_shapes2(scales_img=(2, 2), img_stride=(2, 2))
    shapes += get_shapes2(scales_img=(2, 2), img_stride=(-1, -1))
    shapes += get_shapes2(scales_img=(2, 2), kern_stride=(-1, -1))
    # test subsample done in a separate fct
    shapes += [
        # other test
        ((2, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
        ((3, 2, 4, 4), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),
        ((4, 1, 10, 10), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
        ((1, 1, 4, 4), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
        ((4, 1, 10, 10), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
        ((4, 1, 10, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
        ((4, 1, 20, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
        ((3, 2, 8, 8), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize
        ((3, 2, 8, 6), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize, non-square image
        ((3, 2, 8, 6), (4, 2, 4, 3), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize, non-square image, non-square kern
        ((3, 2, 8, 6), (4, 2, 4, 6), (1, 1), (1, 1), (1, 1)),  # stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
        ((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1)),  # a big one
        ((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)),  # MNIST LeNET layer 1
        ((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1)),  # layer 1 backprop to weights
        # other test
        ((3, 1, 1, 1), (2, 1, 5, 3), (1, 1), (1, 1), (1, 1)),  # kernel bigger then image
        ((3, 2, 1, 1), (4, 2, 1, 1), (1, 1), (1, 1), (1, 1)),
        ((3, 2, 4, 4), (4, 2, 2, 6), (1, 1), (1, 1), (1, 1)),
        ((3, 2, 4, 4), (4, 2, 8, 6), (1, 1), (1, 1), (1, 1)),  # kernel bigger then image
        ((4, 2, 10, 10), (3, 2, 2, 12), (1, 1), (1, 1), (1, 1)),
    ]
    shapes += [
        ((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)),  # test_lenet_28 bprop 1 full
        ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)),  # test_lenet_64 full
        # Test more than maxThreadsDim0
        ((2, 4, 13, 1050), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)),
        ((2, 4, 1050, 13), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)),
    ]
    version = [-1]
    verbose = 0
    random = True
    exec_conv(version, shapes, verbose, random, 'full')
def test_subsample():
    """Check 'valid' and 'full' convolution with non-unit subsampling."""
    seed_rng()
    # Hand-picked subsample cases, including subsample on a 1x1 output.
    shapes = [((1, 1, 1, 1), (1, 1, 1, 1), (1, 1), (1, 1), (1, 1)),
              ((1, 1, 1, 1), (1, 1, 1, 1), (2, 2), (1, 1), (1, 1)),
              ((4, 2, 10, 10), (3, 2, 2, 2), (1, 3), (1, 1), (1, 1)),
              ((4, 2, 10, 10), (3, 2, 2, 2), (3, 3), (1, 1), (1, 1)),
              ((4, 2, 10, 10), (3, 2, 2, 2), (3, 1), (1, 1), (1, 1))
              ]
    for sub in [(1, 1), (1, 2), (2, 1), (2, 2)]:
        shapes += get_shapes2(scales_img=(2, 2), subsample=sub)
    # `ones` can be flipped on for debugging with deterministic data.
    ones = False
    random = not ones
    for border_mode in ('valid', 'full'):
        exec_conv([-1], shapes, 0, random, border_mode,
                  print_=False, ones=ones)
class TestConv2DGPU(unittest.TestCase):
    """Tests that conv2d graphs are compiled down to GPU convolution ops."""
    # Ops accepted as a valid GPU implementation of the convolution.
    conv_ops = (GpuConv,
                dnn.DnnBase)
    def test_logical_shapes(self):
        """conv2d with logical (upsampled) shapes compiles to a GPU conv
        op and runs without error."""
        seed_rng()
        for stride in range(1, 4):
            kshp = (10, 2, 10, 10)
            featshp = (3, 10, 11, 11)
            a = tensor.ftensor4()
            A = tensor.ftensor4()
            # Need to transpose first two dimensions of kernel, and reverse
            # index kernel image dims (for correlation)
            kernel_rotated = tensor.transpose(A, axes=[1, 0, 2, 3])
            featshp_logical = (featshp[0], featshp[1], featshp[2] * stride,
                               featshp[3] * stride)
            kshp_rotated = (kshp[1], kshp[0], kshp[2], kshp[3])
            # print featshp, kshp_rotated, featshp_logical[1:], kshp[2:]
            image_estimate = tensor.nnet.conv2d(a, kernel_rotated,
                                                border_mode='full',
                                                image_shape=featshp,
                                                filter_shape=kshp_rotated,
                                                imshp_logical=featshp_logical[1:],
                                                kshp_logical=kshp[2:])
            func = theano.function([a, A], image_estimate, mode=mode_with_gpu)
            # theano.printing.debugprint(func,)
            # The optimized graph must contain at least one GPU conv op.
            assert any([isinstance(node.op, self.conv_ops)
                        for node in func.maker.fgraph.toposort()])
            a_in = numpy.random.randn(*featshp).astype("float32")
            A_in = numpy.random.randn(*kshp).astype("float32")
            func(a_in, A_in)
    def test_invalid_input_shape(self):
        """
        Tests that when the shape given at build time is not the same as
        run time we raise an error
        """
        seed_rng()
        verbose = 0
        random = True
        print_ = False
        ones = False
        if ones:
            random = False
        global mode_with_gpu
        mode_with_gpu_orig = mode_with_gpu
        try:
            # DebugMode is too slow here; temporarily swap in FAST_RUN
            # (restored in the finally clause).
            if theano.config.mode in ['DebugMode', 'DEBUG_MODE']:
                mode_with_gpu = theano.compile.mode.get_mode(
                    'FAST_RUN').including('gpu')
            for mode in ['valid', 'full']:
                for shapes in [((3, 2, 8, 8), (4, 2, 5, 5), (8, 8)),
                               ((3, 2, 8, 8), (4, 2, 5, 5), (5, 8)),
                               # We use only the number of columns.
                               ]:
                    self.assertRaises(ValueError, _params_allgood,
                                      shapes[0], shapes[1],
                                      verbose=verbose, random=random,
                                      mode=mode,
                                      print_=print_, ones=ones,
                                      compile_kshp=shapes[2])
        finally:
            mode_with_gpu = mode_with_gpu_orig
def benchmark():
    """Time GPU vs CPU convolution on LeNet-style shapes.

    Runs ``exec_conv`` with verbose=1 so Mflops numbers are printed for
    each shape, first in 'valid' mode then in 'full' mode.  Not run as
    part of the test suite; call it manually.
    """
    shapes_valid = [
        # test_lenet_28 shape
        ((20, 60, 12, 12), (30, 60, 8, 8), (1, 1), (1, 1), (1, 1)),  # valid
        ((60, 20, 12, 12), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)),  # valid
        ((60, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)),  # valid
        ((1, 60, 28, 28), (20, 60, 24, 24), (1, 1), (1, 1), (1, 1)),  # valid
        # test_lenet_32 shape
        ((20, 60, 14, 14), (30, 60, 10, 10), (1, 1), (1, 1), (1, 1)),  # valid
        ((60, 20, 14, 14), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)),  # valid
        ((60, 1, 32, 32), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)),  # valid
        ((1, 60, 32, 32), (20, 60, 28, 28), (1, 1), (1, 1), (1, 1)),  # valid
        # test_lenet_64 shape
        ((10, 20, 29, 29), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)),  # valid
        ((20, 10, 29, 29), (30, 10, 23, 23), (1, 1), (1, 1), (1, 1)),  # valid
        ((10, 1, 64, 64), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)),  # valid
        ((1, 10, 64, 64), (20, 10, 58, 58), (1, 1), (1, 1), (1, 1)),  # valid
        # test_lenet_108 shape
        ((10, 20, 51, 51), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)),  # valid
        ((20, 10, 51, 51), (30, 10, 45, 45), (1, 1), (1, 1), (1, 1)),  # valid
        ((10, 1, 108, 108), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)),  # valid
        ((1, 10, 108, 108), (20, 10, 102, 102), (1, 1), (1, 1), (1, 1)),  # valid
        # test_lenet_256 shape
        ((2, 20, 124, 124), (30, 20, 9, 9), (1, 1), (1, 1), (1, 1)),  # valid
        ((20, 2, 124, 124), (30, 2, 116, 116), (1, 1), (1, 1), (1, 1)),  # valid
        ((2, 1, 256, 256), (20, 1, 9, 9), (1, 1), (1, 1), (1, 1)),  # valid
        ((1, 2, 256, 256), (20, 2, 248, 248), (1, 1), (1, 1), (1, 1)),  # valid
    ]
    shapes_full = [
        # test_lenet_28 shape
        ((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)),  # full
        # test_lenet_32 shape
        ((60, 30, 10, 10), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)),  # full conv_full_patch_stack_padded' N=1
        # test_lenet_64 shape
        ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)),  # full conv_full_patch_stack_padded' N=3
        # test_lenet_108 shape
        ((10, 30, 45, 45), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)),  # full 'conv_full_patch_stack_padded' N=9
        # test_lenet_256 shape
        ((2, 30, 116, 116), (20, 30, 9, 9), (1, 1), (1, 1), (1, 1)),  # full conv_reference_full
    ]
    version = [-1]
    verbose = 1
    random = True
    exec_conv(version, shapes_valid, verbose, random, 'valid',
              print_=None, rtol=1e-3)
    exec_conv(version, shapes_full, verbose, random, 'full')
def test_stack_rows_segfault_070312():
    """Regression test: this exact configuration used to trigger an
    illegal GPU memory read (visible under cuda-memcheck)."""
    seed_rng()
    # 07/03/2012
    # Running this unittest with cuda-memcheck exposes an illegal read.
    # THEANO_FLAGS=device=gpu cuda-memcheck nosetests \
    # test_conv_cuda_ndarray.py:test_stack_rows_segfault_070312
    img = theano.shared(numpy.random.rand(1, 80, 96, 96).astype('float32'))
    kern = theano.shared(numpy.random.rand(1, 80, 9, 9).astype('float32'))
    out = theano.shared(numpy.random.rand(1, 2, 2, 3).astype('float32'))
    op = theano.tensor.nnet.conv.ConvOp(imshp=(80, 96, 96), kshp=(9, 9),
                                        nkern=1, bsize=1)
    f = theano.function([], [], updates=[(out, op(img, kern))], mode=mode_with_gpu)
    f()
......@@ -32,7 +32,7 @@ def test_local_remove_all_assert():
a = theano.tensor.opt.assert_op(x, theano.tensor.eq(x, 0).any())
# By default `unsafe` should not be there
f = theano.function([x], a, mode=mode_with_gpu)
f = theano.function([x], a, mode=mode_with_gpu.excluding('unsafe'))
topo = f.maker.fgraph.toposort()
a_op = [n for n in topo if isinstance(n.op, theano.tensor.opt.Assert)]
assert len(a_op) == 1
......
......@@ -16,6 +16,7 @@ from theano.tensor.nnet.blocksparse import (
from theano.tensor.nnet.abstract_conv import (AbstractConv2d,
AbstractConv2d_gradWeights,
AbstractConv2d_gradInputs)
from theano.tensor.nnet.abstract_conv import get_conv_output_shape
from theano.tensor.opt import register_specialize_device
from theano.tensor import TensorType
......@@ -199,12 +200,10 @@ def local_conv2d_gradweight_cpu(node):
# Determine gradient on kernels
assert len(op_imshp) == 4 and len(op_kshp) == 4
outshp = ConvOp.getOutputShape(op_imshp[2:],
op_kshp[2:], node.op.subsample,
node.op.border_mode)
fulloutshp = ConvOp.getOutputShape(op_imshp[2:],
op_kshp[2:], (1, 1),
node.op.border_mode)
outshp = get_conv_output_shape(op_imshp, op_kshp,
node.op.border_mode, node.op.subsample)[2:]
fulloutshp = get_conv_output_shape(op_imshp, op_kshp,
node.op.border_mode, (1, 1))[2:]
newimg = img.dimshuffle((1, 0, 2, 3))
newtopgrad = topgrad.dimshuffle((1, 0, 2, 3))
......@@ -307,12 +306,11 @@ def local_conv2d_gradinputs_cpu(node):
filters = kern.dimshuffle((1, 0, 2, 3))
filters = filters[:, :, ::-1, ::-1]
outshp = ConvOp.getOutputShape(op_imshp[2:],
op_kshp[2:], node.op.subsample,
node.op.border_mode)
fulloutshp = ConvOp.getOutputShape(op_imshp[2:],
op_kshp[2:], (1, 1),
node.op.border_mode)
outshp = get_conv_output_shape(op_imshp, op_kshp,
node.op.border_mode, node.op.subsample)[2:]
fulloutshp = get_conv_output_shape(op_imshp, op_kshp,
node.op.border_mode, (1, 1))[2:]
nkern = op_imshp[1]
imshp = (op_kshp[0], outshp[0], outshp[1])
imshp_logical = (op_kshp[0], fulloutshp[0], fulloutshp[1])
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论