提交 cf91f745 authored 作者: Frederic Bastien's avatar Frederic Bastien

A first optimized implementation of conv2d on the GPU with subsample. Works only for some shapes.

上级 899d98b6
...@@ -363,7 +363,7 @@ class GpuConv(Op): ...@@ -363,7 +363,7 @@ class GpuConv(Op):
return ['cuda_ndarray.cuh','<stdio.h>'] return ['cuda_ndarray.cuh','<stdio.h>']
def c_code_cache_version(self): def c_code_cache_version(self):
return (0,13) # raise this whenever modifying any of the support_code_files return (0,14) # raise this whenever modifying any of the support_code_files
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of these files # REMEMBER TO RAISE c_code_cache_version when changing any of these files
......
...@@ -163,8 +163,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -163,8 +163,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
if (!subsample && if (out_contiguous &&
out_contiguous &&
(version==1||version==3||version==11||version==12||version==-1) && (version==1||version==3||version==11||version==12||version==-1) &&
(version!=1 || out_size<512) &&//Maximum of 512 theads by block (version!=1 || out_size<512) &&//Maximum of 512 theads by block
out_wid<512 &&//Maximum of 512 theads by block out_wid<512 &&//Maximum of 512 theads by block
...@@ -187,36 +186,54 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -187,36 +186,54 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
int shared_size=(img_size + (preload_full_kernel?kern_size:kern_wid))*sizeof(float); int shared_size=(img_size + (preload_full_kernel?kern_size:kern_wid))*sizeof(float);
void (*f)(float*, float*, float*, void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int,
int, int); int, int);
#define CONV_PATCH_STACK_SPECIAL(kern_wid) \ #define CONV_PATCH_STACK_SPECIAL(kern_wid) \
if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true>;} \ if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true,true>;} \
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true>;} \ else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true,true>;} \
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true>;}\ else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true,true>;}\
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true>;}\ else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true>;}\ else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true>;}\ else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true>;}\ else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true>;}\ else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true,true>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false>;}\ else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false,true>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false>;}\ else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false,true>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false>;}\ else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false,true>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false>;}\ else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false,true>;}\
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false>;} \ else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false,true>;} \
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false>;} \ else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false,true>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false>;} \ else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false,true>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false>;} else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false,true>;} \
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true,false>;} \
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true,false>;} \
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true,false>;}\
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true,false>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false,false>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false,false>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false,false>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false,false>;}\
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false,false>;} \
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false,false>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false,false>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false,false>;}
CONV_PATCH_STACK_SPECIAL(THEANO_KERN_WID); CONV_PATCH_STACK_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size>>> f<<< grid, threads, shared_size>>>
(img->devdata, kern->devdata, out->devdata, (img->devdata, kern->devdata, out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack, img_len, img_wid, kern_len, kern_wid,
out_len, out_wid, nkern, nstack,
img_stride_col, img_stride_row, img_stride_stack, img_stride_col, img_stride_row, img_stride_stack,
img_stride_batch, kern_stride_col, kern_stride_row, img_stride_batch, kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern); kern_stride_stack, kern_stride_nkern, subsample_rows, subsample_cols);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
...@@ -226,13 +243,15 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -226,13 +243,15 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i," "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i,"
" kern_flipped=true, accumulate=false, kern_width=%i, img_c_contiguous_2d=%i," " kern_flipped=true, accumulate=false, kern_width=%i, img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i, preload_full_kernel=%i\n", " kern_c_contiguous_2d=%i, nb_split=%i, preload_full_kernel=%i,",
" subsample_rows=%i, subsample_cols=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y,
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d, THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split, preload_full_kernel); nb_split, preload_full_kernel, subsample_rows, subsample_cols);
if (verbose) fprintf(stderr, if (verbose) fprintf(stderr,
"INFO: used 'conv_patch_stack' version with nb_split=%i and preload_full_kernel=%i\n", "INFO: used 'conv_patch_stack' version with nb_split=%i and preload_full_kernel=%i,"
nb_split,preload_full_kernel); " subsample_rows=%i, subsample_cols=%i\n",
nb_split,preload_full_kernel, subsample_rows, subsample_cols);
work_complete = true; work_complete = true;
} }
else else
...@@ -240,10 +259,11 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -240,10 +259,11 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (verbose) if (verbose)
fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i," fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i,"
" kern_flipped=true, accumulate=false, kern_width=%i, img_c_contiguous_2d=%i," " kern_flipped=true, accumulate=false, kern_width=%i, img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i, preload_full_kernel=%i\n", " kern_c_contiguous_2d=%i, nb_split=%i, preload_full_kernel=%i,",
" subsample_rows=%i, subsample_cols=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y,
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d, THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split, preload_full_kernel); nb_split, preload_full_kernel, subsample_rows, subsample_cols);
if (verbose) fprintf(stderr, "INFO: impl 'conv_patch_stack' failed (%s), trying next implementation\n", if (verbose) fprintf(stderr, "INFO: impl 'conv_patch_stack' failed (%s), trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
......
...@@ -280,6 +280,8 @@ conv_patch( float* img, float* kern, float* out, ...@@ -280,6 +280,8 @@ conv_patch( float* img, float* kern, float* out,
* *
* nkern: the number of kernel, used to compute the output image to store the result * nkern: the number of kernel, used to compute the output image to store the result
* nstack: the size of the stack, used to compute the image to load. * nstack: the size of the stack, used to compute the image to load.
* dx: patch stride rows(1 for normal convolution)
* dy: patch stride cols(1 for normal convolution)
* template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't * template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
* template accumulate: if true, we add the result, else we override the result * template accumulate: if true, we add the result, else we override the result
* template KERN_WIDTH: if 0, will work for any kern_wid, else it specialyse to this kern_wid as an optimization * template KERN_WIDTH: if 0, will work for any kern_wid, else it specialyse to this kern_wid as an optimization
...@@ -287,19 +289,19 @@ conv_patch( float* img, float* kern, float* out, ...@@ -287,19 +289,19 @@ conv_patch( float* img, float* kern, float* out,
* template kern_c_contiguous_2d: if true, the kernel have are collon and row contiguous * template kern_c_contiguous_2d: if true, the kernel have are collon and row contiguous
* template split: if true, each thread generate more then 1 output pixel, but use more registers. * template split: if true, each thread generate more then 1 output pixel, but use more registers.
* template preload_full_kern: if true, we load the full kernel in shared memory, else, we load 1 row at a time. * template preload_full_kern: if true, we load the full kernel in shared memory, else, we load 1 row at a time.
* template subsample: if false, remove some computation needed when dx or dy!=1.
*/ */
template<bool flipped_kern, bool accumulate, int KERN_WIDTH, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern> template<bool flipped_kern, bool accumulate, int KERN_WIDTH, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample>
__global__ void __global__ void
conv_patch_stack( float* img, float* kern, float* out, conv_patch_stack( float* img, float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid, int img_len, int img_wid, int kern_len, int kern_wid,
int out_len, int out_wid,
int nkern, int nstack, int img_stride_col,int img_stride_row, int nkern, int nstack, int img_stride_col,int img_stride_row,
int img_stride_stack, int img_stride_batch, int img_stride_stack, int img_stride_batch,
int kern_stride_col, int kern_stride_row, int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_nkern) int kern_stride_stack, int kern_stride_nkern, int dx, int dy)
{ {
int __shared__ out_len, out_wid, nb_thread_id; int __shared__ nb_thread_id;
out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x; nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[]; extern __shared__ float s_data[];
...@@ -346,7 +348,11 @@ conv_patch_stack( float* img, float* kern, float* out, ...@@ -346,7 +348,11 @@ conv_patch_stack( float* img, float* kern, float* out,
const float* idx_kern; const float* idx_kern;
if(preload_full_kern) idx_kern=&d_kern[row*kern_wid]; if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
else idx_kern=d_kern; else idx_kern=d_kern;
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col]; const float* idx_in;
if(subsample)
idx_in=&d_img[(row+out_row*dx)*img_wid+out_col*dy];
else
idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
} }
...@@ -368,7 +374,7 @@ conv_patch_stack( float* img, float* kern, float* out, ...@@ -368,7 +374,7 @@ conv_patch_stack( float* img, float* kern, float* out,
//TODO: inverse the out_row and stack loop to don't load the date as frequently! //TODO: inverse the out_row and stack loop to don't load the date as frequently!
//TODO: do this happen elsewhere? //TODO: do this happen elsewhere?
for(int out_row=ty;out_row<out_len_max;out_row+=blockDim.y){ for(;out_row<out_len_max;out_row+=blockDim.y){
float sum = 0.0f; float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++){ for (int stack = 0;stack<nstack;stack++){
//TODO: load only the part of the image needed or put the partial result in shared memory //TODO: load only the part of the image needed or put the partial result in shared memory
...@@ -397,7 +403,11 @@ conv_patch_stack( float* img, float* kern, float* out, ...@@ -397,7 +403,11 @@ conv_patch_stack( float* img, float* kern, float* out,
const float* idx_kern; const float* idx_kern;
if(preload_full_kern) idx_kern=&d_kern[row*kern_wid]; if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
else idx_kern=d_kern; else idx_kern=d_kern;
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col]; const float* idx_in;
if(subsample)
idx_in=&d_img[(row+out_row*dx)*img_wid+out_col*dy];
else
idx_in=&d_img[(row+out_row)*img_wid+out_col];
//if needed as on Fermi as reading out of bound index from shared memory generate an error. //if needed as on Fermi as reading out of bound index from shared memory generate an error.
//Not needed on generation before as they worked anyway. Removing the if generate the good code //Not needed on generation before as they worked anyway. Removing the if generate the good code
......
...@@ -282,8 +282,7 @@ def get_valid_shapes(): ...@@ -282,8 +282,7 @@ def get_valid_shapes():
shapes += get_shapes2(scales_img=(2,2),img_stride=(-1,-1)) shapes += get_shapes2(scales_img=(2,2),img_stride=(-1,-1))
shapes += get_shapes2(scales_img=(2,2),kern_stride=(-1,-1)) shapes += get_shapes2(scales_img=(2,2),kern_stride=(-1,-1))
#test subsample #test subsample done in a separate fct
shapes += get_shapes2(scales_img=(2,2),subsample=(2,2))
shapes += [ shapes += [
#other test #other test
...@@ -502,8 +501,7 @@ def test_full(): ...@@ -502,8 +501,7 @@ def test_full():
shapes += get_shapes2(scales_img=(2,2),img_stride=(-1,-1)) shapes += get_shapes2(scales_img=(2,2),img_stride=(-1,-1))
shapes += get_shapes2(scales_img=(2,2),kern_stride=(-1,-1)) shapes += get_shapes2(scales_img=(2,2),kern_stride=(-1,-1))
#test subsample #test subsample done in a separate fct
shapes += get_shapes2(scales_img=(2,2),subsample=(2,2))
shapes += [ shapes += [
#other test #other test
...@@ -552,22 +550,32 @@ def test_full(): ...@@ -552,22 +550,32 @@ def test_full():
def test_subsample(): def test_subsample():
# implement when # implement when
shapes = [ shapes = [
((1, 1, 1, 1), (1, 1, 1, 1), (1,1)) ((1, 1, 1, 1), (1, 1, 1, 1), (1,1), (1,1), (1,1))
, ((1, 1, 1, 1), (1, 1, 1, 1), (2,2)) , ((1, 1, 1, 1), (1, 1, 1, 1), (2,2), (1,1), (1,1))
, ((4, 2, 10, 10), (3, 2, 2, 2), (1, 3)) , ((4, 2, 10, 10), (3, 2, 2, 2), (1, 3), (1,1), (1,1))
, ((4, 2, 10, 10), (3, 2, 2, 2), (3, 3)) , ((4, 2, 10, 10), (3, 2, 2, 2), (3, 3), (1,1), (1,1))
, ((4, 2, 10, 10), (3, 2, 2, 2), (3, 1)) , ((4, 2, 10, 10), (3, 2, 2, 2), (3, 1), (1,1), (1,1))
] ]
all_good = True shapes += get_shapes2(scales_img=(2,2),subsample=(1,1))
shapes += get_shapes2(scales_img=(2,2),subsample=(1,2))
_params_allgood_header() shapes += get_shapes2(scales_img=(2,2),subsample=(2,1))
for ishape, kshape, ds in shapes: shapes += get_shapes2(scales_img=(2,2),subsample=(2,2))
if not _params_allgood(ishape, kshape, 'full', subsample=ds):
all_good = False #We put only the version that implement the subsample to make the test faster.
if not _params_allgood(ishape, kshape, 'valid', subsample=ds): version_valid = [-2,-1,1,3,11,12]
all_good = False version_full = [-2,-1]
assert all_good verbose = 0
random = True
print_ = False
ones = False
if ones:
random = False
#test
random = False
exec_conv(version_valid, shapes, verbose, random, 'valid', print_=print_, ones=ones)
exec_conv(version_full, shapes, verbose, random, 'full', print_=print_, ones=ones)
## See #616 ## See #616
#def test_logical_shapes(): #def test_logical_shapes():
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论