Merge pull request #5165 from gvtulder/f-fix-gpuarray-corr-threads

Correct blocks/threads for gpuarray CorrMM and Corr3DMM

Merge pull request #5165 from gvtulder/f-fix-gpuarray-corr-threads
0e44e828 · Frédéric Bastien · GitHub · 8be4a819 · b09ec230 · 0e44e828
--- a/theano/gpuarray/blas.py
+++ b/theano/gpuarray/blas.py
@@ -490,7 +490,7 @@ class BaseGpuCorrMM(CGpuKernelBase, BlasOp):
    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
-        return (0, 1)
+        return (0, 2)
    def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None):
        """
@@ -1029,7 +1029,7 @@ class BaseGpuCorr3dMM(CGpuKernelBase, BlasOp):
    def c_code_cache_version(self):
        # raise this whenever modifying any of the support_code_files
-        return (0, 1)
+        return (0, 2)
    def c_code_helper(self, bottom, weights, top, direction, sub,
                      height=None, width=None, depth=None):

--- a/theano/gpuarray/corr3d_gemm.c
+++ b/theano/gpuarray/corr3d_gemm.c
@@ -263,7 +263,7 @@ int im3d2col(const size_t max_threads_dim,
  GpuKernel *kernel;
  if(dilation_h != 1 || dilation_w != 1 || dilation_d != 1){
    err = dilated_im3d2col_kernel_call(
-      1, &threads_per_block, &n_blocks, 0,
+      1, &n_blocks, &threads_per_block, 0,
      num_kernels, data_im, data_im_offset, height, width, depth,
      kernel_h, kernel_w, kernel_d, dilation_h, dilation_w, dilation_d,
      pad_h, pad_w, pad_d, stride_h, stride_w, stride_d, height_col,
@@ -276,7 +276,7 @@ int im3d2col(const size_t max_threads_dim,
  }
  else{
    err = im3d2col_kernel_call(
-      1, &threads_per_block, &n_blocks, 0,
+      1, &n_blocks, &threads_per_block, 0,
      num_kernels, data_im, data_im_offset, height, width, depth,
      kernel_h, kernel_w, kernel_d, pad_h, pad_w, pad_d,
      stride_h, stride_w, stride_d, height_col, width_col, depth_col,
@@ -311,7 +311,7 @@ int col2im3d(const size_t max_threads_dim, gpudata * data_col, const size_t chan
  int err;
  if(dilation_h != 1 || dilation_w != 1 || dilation_d != 1){
    err = dilated_col2im3d_kernel_call(
-      1, &threads_per_block, &n_blocks, 0,
+      1, &n_blocks, &threads_per_block, 0,
      num_kernels, data_col, height, width, depth, channels, patch_h, patch_w,
      patch_d, dilation_h, dilation_w, dilation_d, pad_h, pad_w, pad_d,
      stride_h, stride_w, stride_d, height_col, width_col, depth_col,
@@ -324,7 +324,7 @@ int col2im3d(const size_t max_threads_dim, gpudata * data_col, const size_t chan
  }
  else{
    err = col2im3d_kernel_call(
-      1, &threads_per_block, &n_blocks, 0,
+      1, &n_blocks, &threads_per_block, 0,
      num_kernels, data_col, height, width, depth, channels, patch_h, patch_w,
      patch_d, pad_h, pad_w, pad_d, stride_h, stride_w, stride_d,
      height_col, width_col, depth_col, data_im, data_im_offset);

--- a/theano/gpuarray/corr_gemm.c
+++ b/theano/gpuarray/corr_gemm.c
@@ -215,7 +215,7 @@ int im2col(const size_t max_threads_dim,
  GpuKernel *kernel;
  if(dilation_h != 1 || dilation_w != 1){
    err = dilated_im2col_kernel_call(
-      1, &threads_per_block, &n_blocks, 0,
+      1, &n_blocks, &threads_per_block, 0,
      num_kernels, data_im, data_im_offset, height, width, kernel_h, kernel_w,
      dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w, height_col,
      width_col, data_col);
@@ -227,7 +227,7 @@ int im2col(const size_t max_threads_dim,
  }
  else{
    err = im2col_kernel_call(
-      1, &threads_per_block, &n_blocks, 0,
+      1, &n_blocks, &threads_per_block, 0,
      num_kernels, data_im, data_im_offset, height, width, kernel_h, kernel_w,
      pad_h, pad_w, stride_h, stride_w, height_col,
      width_col, data_col);
@@ -257,7 +257,7 @@ int col2im(const size_t max_threads_dim, gpudata * data_col, const size_t channe
  int err;
  if(dilation_h != 1 || dilation_w != 1){
    err = dilated_col2im_kernel_call(
-      1, &threads_per_block, &n_blocks, 0,
+      1, &n_blocks, &threads_per_block, 0,
      num_kernels, data_col, height, width, channels, patch_h, patch_w,
      dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w,
      height_col, width_col, data_im, data_im_offset);
@@ -269,7 +269,7 @@ int col2im(const size_t max_threads_dim, gpudata * data_col, const size_t channe
  }
  else{
    err = col2im_kernel_call(
-      1, &threads_per_block, &n_blocks, 0,
+      1, &n_blocks, &threads_per_block, 0,
      num_kernels, data_col, height, width, channels, patch_h, patch_w,
      pad_h, pad_w, stride_h, stride_w,
      height_col, width_col, data_im, data_im_offset);

--- a/theano/gpuarray/tests/test_gemmcorr.py
+++ b/theano/gpuarray/tests/test_gemmcorr.py
@@ -206,3 +206,13 @@ class TestCorrMM(unittest.TestCase):
        self.run_gradinput(inputs_shape=(16, 15, 12, 10),
                           filters_shape=(10, 6, 12, 1),
                           subsample=(3, 1))
+    def test_large_input(self):
+        # This tests the number-of-threads computation
+        # by making (channels * height) > (max_threads_dim ** 2).
+        # (See also pull request #5165.)
+        self.run_conv_valid(inputs_shape=(1, 1024, 3, 1024),
+                            filters_shape=(1, 1, 1, 1024),
+                            verify_grad=False)
+        self.run_gradinput(inputs_shape=(1, 1024, 3, 1),
+                           filters_shape=(1, 1, 1, 1024))
--- a/theano/gpuarray/tests/test_gemmcorr3d.py
+++ b/theano/gpuarray/tests/test_gemmcorr3d.py
@@ -207,3 +207,13 @@ class TestCorr3dMM(unittest.TestCase):
        self.run_gradinput(inputs_shape=(16, 15, 12, 12, 10),
                           filters_shape=(10, 6, 12, 4, 1),
                           subsample=(3, 1, 2))
+    def test_large_input(self):
+        # This tests the number-of-threads computation
+        # by making (channels * height) > (max_threads_dim ** 2).
+        # (See also pull request #5165.)
+        self.run_conv_valid(inputs_shape=(1, 1024, 3, 3, 1024),
+                            filters_shape=(1, 1, 1, 1, 1024),
+                            verify_grad=False)
+        self.run_gradinput(inputs_shape=(1, 1024, 3, 3, 1),
+                           filters_shape=(1, 1, 1, 1, 1024))