提交 cf91f745 authored 作者: Frederic Bastien's avatar Frederic Bastien

A first optimized implementation of conv2d with subsample. Works only for some shapes.

上级 899d98b6
...@@ -363,7 +363,7 @@ class GpuConv(Op): ...@@ -363,7 +363,7 @@ class GpuConv(Op):
return ['cuda_ndarray.cuh','<stdio.h>'] return ['cuda_ndarray.cuh','<stdio.h>']
def c_code_cache_version(self): def c_code_cache_version(self):
return (0,13) # raise this whenever modifying any of the support_code_files return (0,14) # raise this whenever modifying any of the support_code_files
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of these files # REMEMBER TO RAISE c_code_cache_version when changing any of these files
......
...@@ -280,6 +280,8 @@ conv_patch( float* img, float* kern, float* out, ...@@ -280,6 +280,8 @@ conv_patch( float* img, float* kern, float* out,
* *
* nkern: the number of kernel, used to compute the output image to store the result * nkern: the number of kernel, used to compute the output image to store the result
* nstack: the size of the stack, used to compute the image to load. * nstack: the size of the stack, used to compute the image to load.
* dx: patch stride rows(1 for normal convolution)
* dy: patch stride cols(1 for normal convolution)
* template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't * template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
* template accumulate: if true, we add the result, else we override the result * template accumulate: if true, we add the result, else we override the result
* template KERN_WIDTH: if 0, will work for any kern_wid, else it specialyse to this kern_wid as an optimization * template KERN_WIDTH: if 0, will work for any kern_wid, else it specialyse to this kern_wid as an optimization
...@@ -287,19 +289,19 @@ conv_patch( float* img, float* kern, float* out, ...@@ -287,19 +289,19 @@ conv_patch( float* img, float* kern, float* out,
* template kern_c_contiguous_2d: if true, the kernel have are collon and row contiguous * template kern_c_contiguous_2d: if true, the kernel have are collon and row contiguous
* template split: if true, each thread generate more then 1 output pixel, but use more registers. * template split: if true, each thread generate more then 1 output pixel, but use more registers.
* template preload_full_kern: if true, we load the full kernel in shared memory, else, we load 1 row at a time. * template preload_full_kern: if true, we load the full kernel in shared memory, else, we load 1 row at a time.
* template subsample: if false, remove some computation needed when dx or dy!=1.
*/ */
template<bool flipped_kern, bool accumulate, int KERN_WIDTH, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern> template<bool flipped_kern, bool accumulate, int KERN_WIDTH, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample>
__global__ void __global__ void
conv_patch_stack( float* img, float* kern, float* out, conv_patch_stack( float* img, float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid, int img_len, int img_wid, int kern_len, int kern_wid,
int out_len, int out_wid,
int nkern, int nstack, int img_stride_col,int img_stride_row, int nkern, int nstack, int img_stride_col,int img_stride_row,
int img_stride_stack, int img_stride_batch, int img_stride_stack, int img_stride_batch,
int kern_stride_col, int kern_stride_row, int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_nkern) int kern_stride_stack, int kern_stride_nkern, int dx, int dy)
{ {
int __shared__ out_len, out_wid, nb_thread_id; int __shared__ nb_thread_id;
out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x; nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[]; extern __shared__ float s_data[];
...@@ -346,7 +348,11 @@ conv_patch_stack( float* img, float* kern, float* out, ...@@ -346,7 +348,11 @@ conv_patch_stack( float* img, float* kern, float* out,
const float* idx_kern; const float* idx_kern;
if(preload_full_kern) idx_kern=&d_kern[row*kern_wid]; if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
else idx_kern=d_kern; else idx_kern=d_kern;
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col]; const float* idx_in;
if(subsample)
idx_in=&d_img[(row+out_row*dx)*img_wid+out_col*dy];
else
idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
} }
...@@ -368,7 +374,7 @@ conv_patch_stack( float* img, float* kern, float* out, ...@@ -368,7 +374,7 @@ conv_patch_stack( float* img, float* kern, float* out,
//TODO: inverse the out_row and stack loop to don't load the date as frequently! //TODO: inverse the out_row and stack loop to don't load the date as frequently!
//TODO: do this happen elsewhere? //TODO: do this happen elsewhere?
for(int out_row=ty;out_row<out_len_max;out_row+=blockDim.y){ for(;out_row<out_len_max;out_row+=blockDim.y){
float sum = 0.0f; float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++){ for (int stack = 0;stack<nstack;stack++){
//TODO: load only the part of the image needed or put the partial result in shared memory //TODO: load only the part of the image needed or put the partial result in shared memory
...@@ -397,7 +403,11 @@ conv_patch_stack( float* img, float* kern, float* out, ...@@ -397,7 +403,11 @@ conv_patch_stack( float* img, float* kern, float* out,
const float* idx_kern; const float* idx_kern;
if(preload_full_kern) idx_kern=&d_kern[row*kern_wid]; if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
else idx_kern=d_kern; else idx_kern=d_kern;
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col]; const float* idx_in;
if(subsample)
idx_in=&d_img[(row+out_row*dx)*img_wid+out_col*dy];
else
idx_in=&d_img[(row+out_row)*img_wid+out_col];
//if needed as on Fermi as reading out of bound index from shared memory generate an error. //if needed as on Fermi as reading out of bound index from shared memory generate an error.
//Not needed on generation before as they worked anyway. Removing the if generate the good code //Not needed on generation before as they worked anyway. Removing the if generate the good code
......
...@@ -282,8 +282,7 @@ def get_valid_shapes(): ...@@ -282,8 +282,7 @@ def get_valid_shapes():
shapes += get_shapes2(scales_img=(2,2),img_stride=(-1,-1)) shapes += get_shapes2(scales_img=(2,2),img_stride=(-1,-1))
shapes += get_shapes2(scales_img=(2,2),kern_stride=(-1,-1)) shapes += get_shapes2(scales_img=(2,2),kern_stride=(-1,-1))
#test subsample #test subsample done in a separate fct
shapes += get_shapes2(scales_img=(2,2),subsample=(2,2))
shapes += [ shapes += [
#other test #other test
...@@ -502,8 +501,7 @@ def test_full(): ...@@ -502,8 +501,7 @@ def test_full():
shapes += get_shapes2(scales_img=(2,2),img_stride=(-1,-1)) shapes += get_shapes2(scales_img=(2,2),img_stride=(-1,-1))
shapes += get_shapes2(scales_img=(2,2),kern_stride=(-1,-1)) shapes += get_shapes2(scales_img=(2,2),kern_stride=(-1,-1))
#test subsample #test subsample done in a separate fct
shapes += get_shapes2(scales_img=(2,2),subsample=(2,2))
shapes += [ shapes += [
#other test #other test
...@@ -552,22 +550,32 @@ def test_full(): ...@@ -552,22 +550,32 @@ def test_full():
def test_subsample(): def test_subsample():
# implement when # implement when
shapes = [ shapes = [
((1, 1, 1, 1), (1, 1, 1, 1), (1,1)) ((1, 1, 1, 1), (1, 1, 1, 1), (1,1), (1,1), (1,1))
, ((1, 1, 1, 1), (1, 1, 1, 1), (2,2)) , ((1, 1, 1, 1), (1, 1, 1, 1), (2,2), (1,1), (1,1))
, ((4, 2, 10, 10), (3, 2, 2, 2), (1, 3)) , ((4, 2, 10, 10), (3, 2, 2, 2), (1, 3), (1,1), (1,1))
, ((4, 2, 10, 10), (3, 2, 2, 2), (3, 3)) , ((4, 2, 10, 10), (3, 2, 2, 2), (3, 3), (1,1), (1,1))
, ((4, 2, 10, 10), (3, 2, 2, 2), (3, 1)) , ((4, 2, 10, 10), (3, 2, 2, 2), (3, 1), (1,1), (1,1))
] ]
all_good = True shapes += get_shapes2(scales_img=(2,2),subsample=(1,1))
shapes += get_shapes2(scales_img=(2,2),subsample=(1,2))
_params_allgood_header() shapes += get_shapes2(scales_img=(2,2),subsample=(2,1))
for ishape, kshape, ds in shapes: shapes += get_shapes2(scales_img=(2,2),subsample=(2,2))
if not _params_allgood(ishape, kshape, 'full', subsample=ds):
all_good = False #We put only the version that implement the subsample to make the test faster.
if not _params_allgood(ishape, kshape, 'valid', subsample=ds): version_valid = [-2,-1,1,3,11,12]
all_good = False version_full = [-2,-1]
assert all_good verbose = 0
random = True
print_ = False
ones = False
if ones:
random = False
#test
random = False
exec_conv(version_valid, shapes, verbose, random, 'valid', print_=print_, ones=ones)
exec_conv(version_full, shapes, verbose, random, 'full', print_=print_, ones=ones)
## See #616 ## See #616
#def test_logical_shapes(): #def test_logical_shapes():
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论