Force instantiate kernel templates

0d5cffbe · Sean Lee · 89f584bc · 0d5cffbe · 0d5cffbe · 0d5cffbe
--- a/theano/sandbox/gpuarray/conv.cu
+++ b/theano/sandbox/gpuarray/conv.cu
@@ -203,11 +203,8 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
                  int, int, int, int,
                  int, int);

-#define CONV_PATCH_SPECIAL(kern_wid) \
-            if(threads.y==out_len) f=conv_patch<true,kern_wid,false>;\
-            else f=conv_patch<true,kern_wid,true>;
-
-        CONV_PATCH_SPECIAL(THEANO_KERN_WID);
+        if(threads.y==out_len) f=conv_patch_2;
+        else f=conv_patch_3;

         f<<< grid, threads, shared_size>>>
             (cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out),
@@ -267,41 +264,39 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
                  int, int, int, int,
                  int, int);

-#define CONV_PATCH_STACK_SPECIAL(kern_wid) \
-        if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true,true>;} \
-        else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true,true>;} \
-        else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true,true>;}\
-        else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true,true>;}\
-        else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true,true>;}\
-        else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true,true>;}\
-        else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true,true>;}\
-        else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true,true>;}\
-        else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false,true>;}\
-        else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false,true>;}\
-        else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false,true>;}\
-        else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false,true>;}\
-        else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false,true>;} \
-        else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false,true>;} \
-        else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false,true>;} \
-        else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false,true>;} \
-        else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true,false>;} \
-        else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true,false>;} \
-        else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true,false>;}\
-        else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true,false>;}\
-        else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true,false>;}\
-        else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true,false>;}\
-        else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true,false>;}\
-        else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true,false>;}\
-        else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false,false>;}\
-        else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false,false>;}\
-        else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false,false>;}\
-        else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false,false>;}\
-        else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false,false>;} \
-        else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false,false>;} \
-        else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false,false>;} \
-        else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false,false>;}
-
-        CONV_PATCH_STACK_SPECIAL(THEANO_KERN_WID);
+        if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_64;}
+        else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_65;}
+        else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_66;}
+        else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_67;}
+        else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_68;}
+        else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_69;}
+        else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_70;}
+        else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_71;}
+        else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_72;}
+        else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_73;}
+        else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_74;}
+        else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_75;}
+        else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_76;}
+        else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_77;}
+        else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_78;}
+        else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_79;}
+        else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_80;}
+        else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_81;}
+        else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_82;}
+        else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_83;}
+        else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_84;}
+        else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_85;}
+        else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_86;}
+        else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_87;}
+        else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_88;}
+        else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_89;}
+        else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_90;}
+        else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_91;}
+        else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_92;}
+        else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_93;}
+        else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_94;}
+        else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_95;}
+
        f<<< grid, threads, shared_size>>>
            (cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out),
              img_len, img_wid, kern_len, kern_wid, 
@@ -380,11 +375,9 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
                  int, int, int, int,
                  int, int);

-#define CONV_ROWS_SPECIAL(kern_wid) \
-        if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows<kern_wid, false>;\
-        else f = conv_rows<kern_wid, true>;\
+        if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_0;
+        else f = conv_rows_1;

-        CONV_ROWS_SPECIAL(THEANO_KERN_WID);
        f<<< grid, threads, shared_size >>>
            (cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out),
           img_len, img_wid, kern_len, kern_wid, nkern, nstack,
@@ -450,10 +443,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,

        if(!img_contiguous_2d || !kern_contiguous_2d) {
            //fprintf(stderr, "using false version\n");
-            f = conv_rows_stack<THEANO_KERN_WID, false>;
+            f = conv_rows_stack_0;
        } else {
            //fprintf(stderr, "using true version\n");
-            f = conv_rows_stack<THEANO_KERN_WID, true>;
+            f = conv_rows_stack_1;
        }

        f<<< grid, threads, shared_size >>>
@@ -535,13 +528,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
                  int, int, int, int,
                  int, int);

-#define CONV_ROWS_STACK2_SPECIAL(kern_wid) \
-        if((!img_contiguous_2d || !kern_contiguous_2d)&&version==9) f = conv_rows_stack2<kern_wid, false,true>;\
-        else if(version==9) f = conv_rows_stack2<kern_wid, true,true>;\
-        else if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_stack2<kern_wid, false, false>;\
-        else f = conv_rows_stack2<kern_wid, true, false>;
-
-        CONV_ROWS_STACK2_SPECIAL(THEANO_KERN_WID);
+        if((!img_contiguous_2d || !kern_contiguous_2d)&&version==9) f = conv_rows_stack2_1;
+        else if(version==9) f = conv_rows_stack2_3;
+        else if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_stack2_0;
+        else f = conv_rows_stack2_2;

        f<<< grid, threads, shared_size >>>
            (cuda_get_ptr(img),
@@ -663,24 +653,23 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,

            //printf("kern_flipped=%d, ccontig=%d, split=%d, full_kern=%d\n",kern_flipped,ccontig,split,full_kern);
            //We will always be split when we don't load the full kernel
-#define CONV_PATCH_STACK_REDUCE_SPECIAL(kern_wid) \
-                if     (kern_flipped  && ccontig  && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, true>;\
-                else if(kern_flipped  && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, true>;\
-                else if(kern_flipped  && ccontig  && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, true>;\
-                else if(kern_flipped  && !ccontig && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, true>;\
-                else if(!kern_flipped && ccontig  && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, true>;\
-                else if(!kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, true>;\
-                else if(!kern_flipped && ccontig  && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, true>;\
-                else if(!kern_flipped && !ccontig  && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, true>;\
-                /*else if(kern_flipped  && ccontig  && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, false>;*/\
-                /*else if(kern_flipped  && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, false>;*/\
-                else if(kern_flipped  && ccontig  && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, false>;\
-                else if(kern_flipped  && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, false>;\
-                /*else if(!kern_flipped && ccontig  && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, false>;*/\
-                /*else if(!kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, false>;*/\
-                else if(!kern_flipped && ccontig  && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, false>;\
-                else if(!kern_flipped && !ccontig  && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, false>;
-            CONV_PATCH_STACK_REDUCE_SPECIAL(THEANO_KERN_WID);
+
+            /* if(!kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce_0;*/
+            /*else*/ if(!kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce_1;
+            else if(!kern_flipped && !ccontig  && split && !full_kern) f=conv_patch_stack_reduce_2;
+            else if(!kern_flipped && !ccontig  && split && full_kern) f=conv_patch_stack_reduce_3;
+            /*else if(!kern_flipped && ccontig  && !split && !full_kern) f=conv_patch_stack_reduce_4;*/
+            else if(!kern_flipped && ccontig  && !split && full_kern) f=conv_patch_stack_reduce_5;
+            else if(!kern_flipped && ccontig  && split && !full_kern) f=conv_patch_stack_reduce_6;
+            else if(!kern_flipped && ccontig  && split && full_kern) f=conv_patch_stack_reduce_7;
+            /*else if(kern_flipped  && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce_8;*/
+            else if(kern_flipped  && !ccontig && !split && full_kern) f=conv_patch_stack_reduce_9;
+            else if(kern_flipped  && !ccontig && split && !full_kern) f=conv_patch_stack_reduce_10;
+            else if(kern_flipped  && !ccontig && split && full_kern) f=conv_patch_stack_reduce_11;
+            /*else if(kern_flipped  && ccontig  && !split && !full_kern) f=conv_patch_stack_reduce_12;*/
+            else if(kern_flipped  && ccontig  && !split && full_kern) f=conv_patch_stack_reduce_13;
+            else if(kern_flipped  && ccontig  && split && !full_kern) f=conv_patch_stack_reduce_14;
+            else if(kern_flipped  && ccontig  && split && full_kern) f=conv_patch_stack_reduce_15;

            f<<< grid, threads, shared_size>>>(cuda_get_ptr(img), kern_data_unflipped, cuda_get_ptr(out),
                                               img_len, img_wid, kern_len, kern_wid,
@@ -770,9 +759,9 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
        //std::cerr << "kerns " << nstack << " " << kern_len << "\n";
        //std::cerr << "n_reduce_buf/sizeof(float) " << n_reduce_buf / sizeof(float) << "\n";
        if(block_nstack==nstack)
-          f=conv_valid_row_reduce<false>;
+          f=conv_valid_row_reduce_0;
        else
-          f=conv_valid_row_reduce<true>;
+          f=conv_valid_row_reduce_1;
        f<<<n_blocks, n_threads, n_reduce_buf>>>(
                nbatch, nkern, PyGpuArray_DIMS(img)[1],
                img_len, img_wid,
@@ -1105,23 +1094,20 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
                  int, int, int, int,
                  int, int);

-#define CONV_FULL_PATCH_STACK_PADDED_SPECIAL(kern_wid) \
-             if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,false,false>;\
-        else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,true,false>;\
-        else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,false,true>;\
-        else if(version==3 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,false,false,false>;\
-        else if(version==4 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,true,false>;\
-        else if(version==5 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,false,true>;\
-        else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3) f=conv_full_patch_stack_padded<false,kern_wid,true,false,false>;\
-        else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4) f=conv_full_patch_stack_padded<false,kern_wid,true,true,false>;\
-        else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5) f=conv_full_patch_stack_padded<false,kern_wid,true,false,true>;\
-        else if(version==3) f=conv_full_patch_stack_padded<false,kern_wid,false,false,false>;\
-        else if(version==4) f=conv_full_patch_stack_padded<false,kern_wid,false,true,false>;\
-        else if(version==5) f=conv_full_patch_stack_padded<false,kern_wid,false,false,true>;\
+        if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3 && kern_flipped) f=conv_full_patch_stack_padded_12; // most specific tests first: a bare version==N test placed earlier would shadow every branch below it
+        else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4 && kern_flipped) f=conv_full_patch_stack_padded_14;
+        else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5 && kern_flipped) f=conv_full_patch_stack_padded_13;
+        else if(version==3 && kern_flipped) f=conv_full_patch_stack_padded_8;
+        else if(version==4 && kern_flipped)f=conv_full_patch_stack_padded_10;
+        else if(version==5 && kern_flipped)f=conv_full_patch_stack_padded_9;
+        else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3) f=conv_full_patch_stack_padded_4;
+        else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4) f=conv_full_patch_stack_padded_6;
+        else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5) f=conv_full_patch_stack_padded_5;
+        else if(version==3) f=conv_full_patch_stack_padded_0;
+        else if(version==4) f=conv_full_patch_stack_padded_2;
+        else if(version==5) f=conv_full_patch_stack_padded_1; // generic fallbacks last; anything else reaches the assert below
        else assert(false);

-        CONV_FULL_PATCH_STACK_PADDED_SPECIAL(THEANO_KERN_WID);
-
        f<<< grid, threads, shared_size>>>
            (cuda_get_ptr(img), kern_data_unflipped, cuda_get_ptr(out),
              img_len, img_wid, kern_len, kern_wid, nkern, nstack,
@@ -1225,9 +1211,7 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
        //typeof(conv_full_load_everything<0>) f = ;
        void (*f)(const float*, const float*, float*,
                  int, int, int, int, int, int,
-                  int, int, int, int, int, int, int, int) = conv_full_load_everything<0>;
-
-        f = conv_full_load_everything<THEANO_KERN_WID>;
+                  int, int, int, int, int, int, int, int) = conv_full_load_everything;

        f<<< grid, threads, shared_size>>>
            (cuda_get_ptr(img),
@@ -1284,10 +1268,10 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
                  int, int, int, int,
                  int, int, int, int);

-        if(img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack<true,true>;\
-        else if(img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack<true,false>;\
-        else if(!img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack<false,true>;\
-        else if(!img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack<false,false>;
+        if(!img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack_0;
+        else if(!img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack_1;
+        else if(img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack_2;
+        else if(img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack_3;

        f<<< grid, threads, shared_size>>>(
                cuda_get_ptr(img),

--- a/theano/sandbox/gpuarray/conv_full_kernel.cu
+++ b/theano/sandbox/gpuarray/conv_full_kernel.cu
@@ -122,7 +122,7 @@ conv_full_patch( const float* img, const float* kern, float* out,
 //template c_contiguous: if true, the img and kern have are column and row contiguous else we use the stride value from the param. The image need to be c_contiguous in the nbatch and nstack dimensions.

 template<bool img_c_contiguous_2d, bool kern_c_contiguous_2d>
-__global__ void
+__device__ inline void
 conv_full_patch_stack( const float* img, const float* kern, float* out,
                       int img_len, int img_wid,
                       int kern_len, int kern_wid, int nkern, int nstack,
@@ -182,6 +182,31 @@ conv_full_patch_stack( const float* img, const float* kern, float* out,
        out_row*out_wid+out_col] = sum;
 }

+extern "C" { // C linkage: each wrapper below is a kernel with a plain, unmangled name
+#define INSTANTIATE_CONV_FULL_PATCH_STACK(suffix, ...) \
+__global__ void \
+conv_full_patch_stack_##suffix( \
+    const float *img, const float *kern, float *out, \
+    int img_len, int img_wid, \
+    int kern_len, int kern_wid, int nkern, int nstack, \
+    int img_stride_col, int img_stride_row, \
+    int kern_stride_col, int kern_stride_row,  \
+    int kern_stride_stack, int kern_stride_nkern) \
+{ \
+    conv_full_patch_stack<__VA_ARGS__>( \
+        img, kern, out, img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
+        img_stride_col, img_stride_row, kern_stride_col, kern_stride_row,  \
+        kern_stride_stack, kern_stride_nkern); \
+}
+
+INSTANTIATE_CONV_FULL_PATCH_STACK(0, false, false) // template args: <img_c_contiguous_2d, kern_c_contiguous_2d>
+INSTANTIATE_CONV_FULL_PATCH_STACK(1, false, true)
+INSTANTIATE_CONV_FULL_PATCH_STACK(2, true, false)
+INSTANTIATE_CONV_FULL_PATCH_STACK(3, true, true)
+
+#undef INSTANTIATE_CONV_FULL_PATCH_STACK // helper is local to this block; name avoids the reserved "__" prefix
+}
+
 /**
 * As conv_patch_stack, but used for the full convolution by padding the image in shared memory.
 * I keep it separated from conv_patch as we take 19-20 register which is more than the 10/16 max for each thread and thus this could lower the occupency.
@@ -200,8 +225,8 @@ conv_full_patch_stack( const float* img, const float* kern, float* out,
 * template low_mem: if true, as split but with use less dynamic shared memory but use more registers.
 *          if you set split and low_mem to true, we will use the low_mem version!
 */
-template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool low_mem >
-__global__ void
+template<bool flipped_kern, bool c_contiguous, bool split, bool low_mem >
+__device__ inline void
 conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
                  const int img_len, const int img_wid,
                  const int kern_len, const int kern_wid,
@@ -257,7 +282,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
          const float* idx_kern=&d_kern[row*kern_wid];
          const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
          
-          convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
+          convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
        }
      }
      out[batch_id*out_wid*out_len*nkern+//the good batch
@@ -292,7 +317,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
              const float* idx_kern=&d_kern[row*kern_wid];
              const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
              
-              convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
+              convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
            }
          if(out_row<out_len)
            out[batch_id*out_wid*out_len*nkern+//the good batch
@@ -340,7 +365,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
            const float* idx_kern=&d_kern[row*kern_wid];
            const float* idx_in=&d_img[(row+out_row-out_row_iter*nb_rows)*img_wid_valid+out_col];
            
-            convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
+            convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
          }
        }
        if(out_row<out_len)
@@ -351,6 +376,42 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
    }
 }

+extern "C" { // C linkage: each wrapper below is a kernel with a plain, unmangled name
+#define INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(suffix, ...) \
+__global__ void \
+conv_full_patch_stack_padded_##suffix( \
+    const float *img, const float *kern, float *out, \
+    const int img_len, const int img_wid, \
+    const int kern_len, const int kern_wid, \
+    const int nkern, const int nstack, \
+    const int img_stride_col, const int img_stride_row, \
+    const int img_stride_stack, const int img_stride_batch, \
+    const int kern_stride_col, const int kern_stride_row, \
+    const int kern_stride_stack, const int kern_stride_nkern) \
+{ \
+    conv_full_patch_stack_padded<__VA_ARGS__>( \
+        img, kern, out, img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
+        img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
+        kern_stride_col, kern_stride_row, \
+        kern_stride_stack, kern_stride_nkern); \
+}
+
+INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(0, false, false, false, false) // template args: <flipped_kern, c_contiguous, split, low_mem>
+INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(1, false, false, false, true)
+INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(2, false, false, true, false) // suffixes 3, 7, 11, 15 (split and low_mem both true) are not instantiated
+INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(4, false, true, false, false)
+INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(5, false, true, false, true)
+INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(6, false, true, true, false)
+INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(8, true, false, false, false)
+INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(9, true, false, false, true)
+INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(10, true, false, true, false)
+INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(12, true, true, false, false)
+INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(13, true, true, false, true)
+INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(14, true, true, true, false)
+
+#undef INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED // helper is local to this block; name avoids the reserved "__" prefix
+}
+
 template <int i> __device__ float everything_dot(const float * x, const int sx, const float * y, const int sy) 
 { 
    return everything_dot<i/2>(x, sx, y, sy) + everything_dot<(i+1)/2>(x+sy*(i/2), sx, y+sy*(i/2), sy) ;
@@ -364,7 +425,6 @@ template <> __device__ float everything_dot<1>(const float * x, const int sx, co
 { 
    return x[0] * y[0];
 }
-template<int NSTACK>
 __global__ void
 conv_full_load_everything( const float* img, const float* kern, float* out,
                 int img_len, int img_wid,
@@ -423,9 +483,9 @@ conv_full_load_everything( const float* img, const float* kern, float* out,
            {
                int icol = out_col - kern_wid+1+col;
                if (icol < 0 || icol > img_wid) continue;
-                if (NSTACK > 0)
+                if (THEANO_KERN_WID > 0)
                {
-                    sum += everything_dot<NSTACK>(d_img + irow*img_wid + icol, img_len*img_wid,
+                    sum += everything_dot<THEANO_KERN_WID>(d_img + irow*img_wid + icol, img_len*img_wid,
                            d_kern + row*kern_wid+col, kern_len*kern_wid);
                }
                else

--- a/theano/sandbox/gpuarray/conv_kernel.cu
+++ b/theano/sandbox/gpuarray/conv_kernel.cu
@@ -182,12 +182,11 @@ template<> __device__ float convolutionRowNoFlip<0>(const float *data,
    return 0;
 }

-template<int KERN_WIDTH>
 __device__ void convolutionRowNoFlip(float& sum,
                                     const float *data,
                                     const float *kern, const int kern_wid){
-  if(KERN_WIDTH>0)
-    sum+=convolutionRowNoFlip<KERN_WIDTH>(data,kern);
+  if(THEANO_KERN_WID>0)
+    sum+=convolutionRowNoFlip<THEANO_KERN_WID>(data,kern);
  else
 #pragma unroll 8
    for (int col=0; col < kern_wid; col++) {//loop over col
@@ -219,8 +218,8 @@ __device__ void store_or_accumulate(float& dst,const float value ){
 *                 When true, allow for output image bigger then 512 pixel.
 *                 Use more registers.
 */
-template<bool flipped_kern, int KERN_WIDTH, bool split>
-__global__ void
+template<bool flipped_kern, bool split>
+__device__ inline void
 conv_patch( const float* img, const float* kern, float* out,
            int img_len, int img_wid, int kern_len, int kern_wid,
            int nkern, int nstack)
@@ -260,7 +259,7 @@ conv_patch( const float* img, const float* kern, float* out,
      for (int row=0; row < kern_len; row++) {//loop over row
        const float* idx_kern=&d_kern[row*kern_wid];
        const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
-        convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
+        convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
      }
      out[batch_id*out_wid*out_len*nkern+//the good batch
          blockIdx.y*out_wid*out_len+//the output image
@@ -271,7 +270,7 @@ conv_patch( const float* img, const float* kern, float* out,
        for (int row=0; row < kern_len; row++) {//loop over row
          const float* idx_kern=&d_kern[row*kern_wid];
          const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
-          convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
+          convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
        }
        out[batch_id*out_wid*out_len*nkern+//the good batch
            kern_id*out_wid*out_len+//the output image
@@ -280,6 +279,24 @@ conv_patch( const float* img, const float* kern, float* out,
    }
 }

+/* Non-template, C-linkage kernel entry points wrapping conv_patch<...>. */
+/* Suffix = template arguments (flipped_kern, split) read as a binary number. */
+extern "C" {
+#define CONV_PATCH_ENTRY(id, ...) \
+__global__ void conv_patch_##id( \
+    const float *img, const float *kern, float *out, \
+    int img_len, int img_wid, int kern_len, int kern_wid, \
+    int nkern, int nstack) \
+{ \
+    conv_patch<__VA_ARGS__>(img, kern, out, img_len, img_wid, \
+                            kern_len, kern_wid, nkern, nstack); \
+}
+CONV_PATCH_ENTRY(2, true, false)
+CONV_PATCH_ENTRY(3, true, true)
+#undef CONV_PATCH_ENTRY
+}
+
+
 /**
 * As conv_patch, but implement the stack in the kernel.
 * I keep it separated from conv_patch as we take more registers and this could lower the occupency.
@@ -295,15 +312,14 @@ conv_patch( const float* img, const float* kern, float* out,
 * dy: patch stride cols(1 for normal convolution)
 * template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
 * template accumulate: if true, we add the result, else we override the result
- * template KERN_WIDTH: if 0, will work for any kern_wid, else it specialyse to this kern_wid as an optimization
 * template img_c_contiguous_2d: if true, the img have are collon and row contiguous
 * template kern_c_contiguous_2d: if true, the kernel have are collon and row contiguous
 * template split: if true, each thread generate more than 1 output pixel, but use more registers.
 * template preload_full_kern: if true, we load the full kernel in shared memory, else, we load 1 row at a time.
 * template subsample: if false, remove some computation needed when dx or dy!=1.
 */
-template<bool flipped_kern, bool accumulate, int KERN_WIDTH, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample>
-__global__ void
+template<bool flipped_kern, bool accumulate, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample>
+__device__ inline void
 conv_patch_stack( const float* img, const float* kern, float* out,
                  int img_len, int img_wid, int kern_len, int kern_wid,
                  int out_len, int out_wid,
@@ -365,7 +381,7 @@ conv_patch_stack( const float* img, const float* kern, float* out,
          else
            idx_in=&d_img[(row+out_row)*img_wid+out_col];
          
-          convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
+          convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
        }
        __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
      }
@@ -425,7 +441,7 @@ conv_patch_stack( const float* img, const float* kern, float* out,
            //as we store the result of only the good thread.
            //This was with nvcc 3.0 on an GTX470 card.
            if(out_row<out_len)
-              convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
+              convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
          }
          __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
        }
@@ -440,6 +456,63 @@ conv_patch_stack( const float* img, const float* kern, float* out,

 }

+/* C-linkage kernel entry points, one per conv_patch_stack<...> variant. */
+/* Suffix = the seven template arguments, in order, read as a binary number. */
+extern "C" {
+#define CONV_PATCH_STACK_ENTRY(id, ...) \
+__global__ void conv_patch_stack_##id( \
+    const float *img, const float *kern, float *out, \
+    int img_len, int img_wid, int kern_len, int kern_wid, \
+    int out_len, int out_wid, int nkern, int nstack, \
+    int img_stride_col, int img_stride_row, \
+    int img_stride_stack, int img_stride_batch, \
+    int kern_stride_col, int kern_stride_row, \
+    int kern_stride_stack, int kern_stride_nkern, int dx, int dy) \
+{ \
+    conv_patch_stack<__VA_ARGS__>( \
+        img, kern, out, img_len, img_wid, kern_len, kern_wid, \
+        out_len, out_wid, nkern, nstack, \
+        img_stride_col, img_stride_row, \
+        img_stride_stack, img_stride_batch, \
+        kern_stride_col, kern_stride_row, \
+        kern_stride_stack, kern_stride_nkern, dx, dy); \
+}
+CONV_PATCH_STACK_ENTRY(64, true, false, false, false, false, false, false)
+CONV_PATCH_STACK_ENTRY(65, true, false, false, false, false, false, true)
+CONV_PATCH_STACK_ENTRY(66, true, false, false, false, false, true, false)
+CONV_PATCH_STACK_ENTRY(67, true, false, false, false, false, true, true)
+CONV_PATCH_STACK_ENTRY(68, true, false, false, false, true, false, false)
+CONV_PATCH_STACK_ENTRY(69, true, false, false, false, true, false, true)
+CONV_PATCH_STACK_ENTRY(70, true, false, false, false, true, true, false)
+CONV_PATCH_STACK_ENTRY(71, true, false, false, false, true, true, true)
+CONV_PATCH_STACK_ENTRY(72, true, false, false, true, false, false, false)
+CONV_PATCH_STACK_ENTRY(73, true, false, false, true, false, false, true)
+CONV_PATCH_STACK_ENTRY(74, true, false, false, true, false, true, false)
+CONV_PATCH_STACK_ENTRY(75, true, false, false, true, false, true, true)
+CONV_PATCH_STACK_ENTRY(76, true, false, false, true, true, false, false)
+CONV_PATCH_STACK_ENTRY(77, true, false, false, true, true, false, true)
+CONV_PATCH_STACK_ENTRY(78, true, false, false, true, true, true, false)
+CONV_PATCH_STACK_ENTRY(79, true, false, false, true, true, true, true)
+CONV_PATCH_STACK_ENTRY(80, true, false, true, false, false, false, false)
+CONV_PATCH_STACK_ENTRY(81, true, false, true, false, false, false, true)
+CONV_PATCH_STACK_ENTRY(82, true, false, true, false, false, true, false)
+CONV_PATCH_STACK_ENTRY(83, true, false, true, false, false, true, true)
+CONV_PATCH_STACK_ENTRY(84, true, false, true, false, true, false, false)
+CONV_PATCH_STACK_ENTRY(85, true, false, true, false, true, false, true)
+CONV_PATCH_STACK_ENTRY(86, true, false, true, false, true, true, false)
+CONV_PATCH_STACK_ENTRY(87, true, false, true, false, true, true, true)
+CONV_PATCH_STACK_ENTRY(88, true, false, true, true, false, false, false)
+CONV_PATCH_STACK_ENTRY(89, true, false, true, true, false, false, true)
+CONV_PATCH_STACK_ENTRY(90, true, false, true, true, false, true, false)
+CONV_PATCH_STACK_ENTRY(91, true, false, true, true, false, true, true)
+CONV_PATCH_STACK_ENTRY(92, true, false, true, true, true, false, false)
+CONV_PATCH_STACK_ENTRY(93, true, false, true, true, true, false, true)
+CONV_PATCH_STACK_ENTRY(94, true, false, true, true, true, true, false)
+CONV_PATCH_STACK_ENTRY(95, true, false, true, true, true, true, true)
+#undef CONV_PATCH_STACK_ENTRY
+}
+
+
 /**
 * As conv_patch_stack, but kern_len thread for each output pixel
 * I keep it separated as use more register.
@@ -454,8 +527,8 @@ conv_patch_stack( const float* img, const float* kern, float* out,
 * template img_contiguous: if true, the img have are collon and row contiguous
 * template preload_full_kern: work only when split is true. We don't load the full kernel at once, but we load ceil_intdiv(kern_len/nb_split) kernel row at a time
 */
-template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool preload_full_kern>
-__global__ void
+template<bool flipped_kern, bool c_contiguous, bool split, bool preload_full_kern>
+__device__ inline void
 conv_patch_stack_reduce( const float* img, const float* kern, float* out,
                  int img_len, int img_wid, int kern_len, int kern_wid,
                  int nkern, int nstack, int img_stride_col,int img_stride_row,
@@ -521,7 +594,7 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
          const float* idx_in=&d_img[(first_row+tz+out_row)*img_wid+out_col];
          float sum2 = 0;
          if(tz<len3)
-            convolutionRowNoFlip<KERN_WIDTH>(sum2,idx_in,idx_kern,kern_wid);
+            convolutionRowNoFlip(sum2,idx_in,idx_kern,kern_wid);
          sum+=sum2;
        }
      }else if(split){
@@ -531,7 +604,7 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
        for(int row=tz;row<kern_len;row+=blockDim.z){
          const float* idx_kern=&d_kern[row*kern_wid];
          const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
-          convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
+          convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
        }
      }else{
        int row = tz;//The row of the kernel.
@@ -540,7 +613,7 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
        load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
                       kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
        __syncthreads();
-        convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
+        convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
      }
        __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
    }
@@ -559,6 +632,45 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
    }
 }

+/* C-linkage kernel entry points wrapping conv_patch_stack_reduce<...>. */
+/* Suffix = (flipped_kern, c_contiguous, split, preload_full_kern) as binary. */
+/* Suffixes 0, 4, 8, 12 (split and preload_full_kern both false) stay disabled. */
+extern "C" {
+#define CONV_PATCH_STACK_REDUCE_ENTRY(id, ...) \
+__global__ void conv_patch_stack_reduce_##id( \
+    const float *img, const float *kern, float *out, \
+    int img_len, int img_wid, int kern_len, int kern_wid, \
+    int nkern, int nstack, int img_stride_col, int img_stride_row, \
+    int img_stride_stack, int img_stride_batch, \
+    int kern_stride_col, int kern_stride_row, \
+    int kern_stride_stack, int kern_stride_nkern) \
+{ \
+    conv_patch_stack_reduce<__VA_ARGS__>( \
+        img, kern, out, img_len, img_wid, kern_len, kern_wid, \
+        nkern, nstack, img_stride_col, img_stride_row, \
+        img_stride_stack, img_stride_batch, kern_stride_col, kern_stride_row, \
+        kern_stride_stack, kern_stride_nkern); \
+}
+/*CONV_PATCH_STACK_REDUCE_ENTRY(0, false, false, false, false)*/
+CONV_PATCH_STACK_REDUCE_ENTRY(1, false, false, false, true)
+CONV_PATCH_STACK_REDUCE_ENTRY(2, false, false, true, false)
+CONV_PATCH_STACK_REDUCE_ENTRY(3, false, false, true, true)
+/*CONV_PATCH_STACK_REDUCE_ENTRY(4, false, true, false, false)*/
+CONV_PATCH_STACK_REDUCE_ENTRY(5, false, true, false, true)
+CONV_PATCH_STACK_REDUCE_ENTRY(6, false, true, true, false)
+CONV_PATCH_STACK_REDUCE_ENTRY(7, false, true, true, true)
+/*CONV_PATCH_STACK_REDUCE_ENTRY(8, true, false, false, false)*/
+CONV_PATCH_STACK_REDUCE_ENTRY(9, true, false, false, true)
+CONV_PATCH_STACK_REDUCE_ENTRY(10, true, false, true, false)
+CONV_PATCH_STACK_REDUCE_ENTRY(11, true, false, true, true)
+/*CONV_PATCH_STACK_REDUCE_ENTRY(12, true, true, false, false)*/
+CONV_PATCH_STACK_REDUCE_ENTRY(13, true, true, false, true)
+CONV_PATCH_STACK_REDUCE_ENTRY(14, true, true, true, false)
+CONV_PATCH_STACK_REDUCE_ENTRY(15, true, true, true, true)
+#undef CONV_PATCH_STACK_REDUCE_ENTRY
+}
+
+
 /**
 * WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY
 * we store kern_len row of the image and the full kernel in the shared memory
@@ -570,8 +682,8 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
 * Diff with conv_patch: don't store the full image in the shared memory. 
 *    I.E. work for bigger image then conv_patch<split=true,...>.
 */
-template<int KERN_WIDTH, bool c_contiguous>
-__global__ void
+template<bool c_contiguous>
+__device__ inline void
 conv_rows( const float* img, const float* kern, float* out,
           int img_len, int img_wid, int kern_len, int kern_wid,
           int nkern, int nstack,
@@ -612,7 +724,7 @@ conv_rows( const float* img, const float* kern, float* out,
    for (int row=0; row < kern_len; row++) {//loop over row
      const float* idx_kern=&d_kern[row*kern_wid];
      const float* idx_in=&d_img[(row)*img_wid+out_col];
-      convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
+      convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
    }

    out[batch_id*out_wid*out_len*nkern+//the good batch
@@ -620,6 +732,31 @@ conv_rows( const float* img, const float* kern, float* out,
        out_row*out_wid+out_col] = sum;
 }

+/* Non-template, C-linkage kernel entry points wrapping conv_rows<c_contiguous>. */
+/* Suffix 0 passes c_contiguous=false, suffix 1 passes c_contiguous=true. */
+extern "C" {
+#define CONV_ROWS_ENTRY(id, ...) \
+__global__ void conv_rows_##id( \
+    const float *img, const float *kern, float *out, \
+    int img_len, int img_wid, int kern_len, int kern_wid, \
+    int nkern, int nstack, \
+    int img_stride_col, int img_stride_row, \
+    int img_stride_stack, int img_stride_batch, \
+    int kern_stride_col, int kern_stride_row, \
+    int kern_stride_stack, int kern_stride_nkern) \
+{ \
+    conv_rows<__VA_ARGS__>( \
+        img, kern, out, \
+        img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
+        img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
+        kern_stride_col, kern_stride_row, \
+        kern_stride_stack, kern_stride_nkern); \
+}
+CONV_ROWS_ENTRY(0, false)
+CONV_ROWS_ENTRY(1, true)
+#undef CONV_ROWS_ENTRY
+}
+
 /**
 * WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY
 * as conv_rows, but implement the stack. Separate as this use more register.
@@ -631,8 +768,8 @@ conv_rows( const float* img, const float* kern, float* out,
 * Diff with conv_patch: don't store the full image in the shared memory. 
 *    I.E. work for bigger image then conv_patch<split=true,...>.
 */
-template<int KERN_WIDTH, bool c_contiguous>
-__global__ void
+template<bool c_contiguous>
+__device__ inline void
 conv_rows_stack( const float* img, const float* kern, float* out,
                 const int img_len, const int img_wid, const int kern_len, const int kern_wid,
                 const int nkern, const int nstack,
@@ -708,7 +845,7 @@ conv_rows_stack( const float* img, const float* kern, float* out,
      for (int row=0; row < kern_len; row++) {//loop over row
        const float* idx_kern=&d_kern[row*kern_wid];
        const float* idx_in=&d_img[(row+shared_row)*img_wid+out_col];
-        convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
+        convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
      }
      __syncthreads();//to be sure all thread have finished before we modif the shared memory.
    }
@@ -718,6 +855,33 @@ conv_rows_stack( const float* img, const float* kern, float* out,
          out_row*out_wid+out_col] = sum;
 }

+/* C-linkage kernel entry points wrapping conv_rows_stack<c_contiguous>. */
+/* Suffix 0 passes c_contiguous=false, suffix 1 passes c_contiguous=true. */
+extern "C" {
+#define CONV_ROWS_STACK_ENTRY(id, ...) \
+__global__ void conv_rows_stack_##id( \
+    const float *img, const float *kern, float *out, \
+    const int img_len, const int img_wid, \
+    const int kern_len, const int kern_wid, \
+    const int nkern, const int nstack, \
+    const int img_stride_col, const int img_stride_row, \
+    const int img_stride_stack, const int img_stride_batch, \
+    const int kern_stride_col, const int kern_stride_row, \
+    const int kern_stride_stack, const int kern_stride_nkern) \
+{ \
+    conv_rows_stack<__VA_ARGS__>( \
+        img, kern, out, \
+        img_len, img_wid, kern_len, kern_wid, \
+        nkern, nstack, \
+        img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
+        kern_stride_col, kern_stride_row, \
+        kern_stride_stack, kern_stride_nkern); \
+}
+CONV_ROWS_STACK_ENTRY(0, false)
+CONV_ROWS_STACK_ENTRY(1, true)
+#undef CONV_ROWS_STACK_ENTRY
+}
+
 /**
 * WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY
 * as conv_rows_stack, but load only block_len of the image at a time and 1 or all kern row.
@@ -729,8 +893,8 @@ conv_rows_stack( const float* img, const float* kern, float* out,
 * Diff with conv_patch: don't store the full image and kernel in the shared memory. 
 *    I.E. work for bigger image then conv_patch<split=true,...>.
 */
-template<int KERN_WIDTH, bool c_contiguous, bool preload_full_kern>
-__global__ void
+template<bool c_contiguous, bool preload_full_kern>
+__device__ inline void
 conv_rows_stack2(const float* img, const float* kern, float* out,
                 const int img_len, const int img_wid, const int kern_len, const int kern_wid,
                 const int nkern, const int nstack,
@@ -804,7 +968,7 @@ conv_rows_stack2(const float* img, const float* kern, float* out,
          else idx_kern=d_kern;
          const float* idx_in=&d_img[((shared_row+row)%nb_rows)*img_wid+out_col];
          float sum_ =0.0f;
-          convolutionRowNoFlip<KERN_WIDTH>(sum_,idx_in,idx_kern,kern_wid);
+          convolutionRowNoFlip(sum_,idx_in,idx_kern,kern_wid);
          sum+=sum_;//We pass by an intermediate variable to have more precission.
        }
      }
@@ -816,6 +980,35 @@ conv_rows_stack2(const float* img, const float* kern, float* out,
          out_row*out_wid+out_col] = sum;
 }

+/* C-linkage kernel entry points wrapping conv_rows_stack2<...>. */
+/* Suffix = template arguments (c_contiguous, preload_full_kern) as binary. */
+extern "C" {
+#define CONV_ROWS_STACK2_ENTRY(id, ...) \
+__global__ void conv_rows_stack2_##id( \
+    const float *img, const float *kern, float *out, \
+    const int img_len, const int img_wid, \
+    const int kern_len, const int kern_wid, \
+    const int nkern, const int nstack, \
+    const int img_stride_col, const int img_stride_row, \
+    const int img_stride_stack, const int img_stride_batch, \
+    const int kern_stride_col, const int kern_stride_row, \
+    const int kern_stride_stack, const int kern_stride_nkern) \
+{ \
+    conv_rows_stack2<__VA_ARGS__>( \
+        img, kern, out, \
+        img_len, img_wid, kern_len, kern_wid, \
+        nkern, nstack, \
+        img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
+        kern_stride_col, kern_stride_row, \
+        kern_stride_stack, kern_stride_nkern); \
+}
+CONV_ROWS_STACK2_ENTRY(0, false, false)
+CONV_ROWS_STACK2_ENTRY(1, false, true)
+CONV_ROWS_STACK2_ENTRY(2, true, false)
+CONV_ROWS_STACK2_ENTRY(3, true, true)
+#undef CONV_ROWS_STACK2_ENTRY
+}
+
 /**
 * Implementation of 'valid' mode convolution that uses one block per output pixel, and uses a sum-reduce within each block to compute the
 * kernel-image inner-product in parallel.
@@ -826,7 +1019,7 @@ conv_rows_stack2(const float* img, const float* kern, float* out,
 * TODO: explain parameters, preconditions
 */
 template<bool stack_loop>
-__global__ void
+__device__ inline void
 conv_valid_row_reduce(int nB, int nK, int stacklen,
        int img_len, int img_wid, 
        int kern_len, int kern_wid,
@@ -911,6 +1104,32 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
    }
 }

+/* C-linkage kernel entry points wrapping conv_valid_row_reduce<stack_loop>. */
+/* Suffix 0 passes stack_loop=false, suffix 1 passes stack_loop=true. */
+extern "C" {
+#define CONV_VALID_ROW_REDUCE_ENTRY(id, ...) \
+__global__ void conv_valid_row_reduce_##id( \
+    int nB, int nK, int stacklen, \
+    int img_len, int img_wid, int kern_len, int kern_wid, \
+    int out_len, int out_wid, \
+    const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C, \
+    const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C, \
+    float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C, \
+    int subsample_rows, int subsample_cols, \
+    const int initial_reduce_boundary) \
+{ \
+    conv_valid_row_reduce<__VA_ARGS__>( \
+        nB, nK, stacklen, \
+        img_len, img_wid, kern_len, kern_wid, out_len, out_wid, \
+        img, img_str_B, img_str_S, img_str_R, img_str_C, \
+        kern, kern_str_K, kern_str_S, kern_str_R, kern_str_C, \
+        out, out_str_B, out_str_K, out_str_R, out_str_C, \
+        subsample_rows, subsample_cols, initial_reduce_boundary); \
+}
+CONV_VALID_ROW_REDUCE_ENTRY(0, false)
+CONV_VALID_ROW_REDUCE_ENTRY(1, true)
+#undef CONV_VALID_ROW_REDUCE_ENTRY
+}


 /**