提交 b09ec230 作者: Gijs van Tulder

Correct blocks/threads for gpuarray CorrMM.

Number of blocks and number of threads were swapped.
上级 1af6f15d
...@@ -490,7 +490,7 @@ class BaseGpuCorrMM(CGpuKernelBase, BlasOp): ...@@ -490,7 +490,7 @@ class BaseGpuCorrMM(CGpuKernelBase, BlasOp):
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (0, 1) return (0, 2)
def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None): def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None):
""" """
...@@ -1029,7 +1029,7 @@ class BaseGpuCorr3dMM(CGpuKernelBase, BlasOp): ...@@ -1029,7 +1029,7 @@ class BaseGpuCorr3dMM(CGpuKernelBase, BlasOp):
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (0, 1) return (0, 2)
def c_code_helper(self, bottom, weights, top, direction, sub, def c_code_helper(self, bottom, weights, top, direction, sub,
height=None, width=None, depth=None): height=None, width=None, depth=None):
......
...@@ -263,7 +263,7 @@ int im3d2col(const size_t max_threads_dim, ...@@ -263,7 +263,7 @@ int im3d2col(const size_t max_threads_dim,
GpuKernel *kernel; GpuKernel *kernel;
if(dilation_h != 1 || dilation_w != 1 || dilation_d != 1){ if(dilation_h != 1 || dilation_w != 1 || dilation_d != 1){
err = dilated_im3d2col_kernel_call( err = dilated_im3d2col_kernel_call(
1, &threads_per_block, &n_blocks, 0, 1, &n_blocks, &threads_per_block, 0,
num_kernels, data_im, data_im_offset, height, width, depth, num_kernels, data_im, data_im_offset, height, width, depth,
kernel_h, kernel_w, kernel_d, dilation_h, dilation_w, dilation_d, kernel_h, kernel_w, kernel_d, dilation_h, dilation_w, dilation_d,
pad_h, pad_w, pad_d, stride_h, stride_w, stride_d, height_col, pad_h, pad_w, pad_d, stride_h, stride_w, stride_d, height_col,
...@@ -276,7 +276,7 @@ int im3d2col(const size_t max_threads_dim, ...@@ -276,7 +276,7 @@ int im3d2col(const size_t max_threads_dim,
} }
else{ else{
err = im3d2col_kernel_call( err = im3d2col_kernel_call(
1, &threads_per_block, &n_blocks, 0, 1, &n_blocks, &threads_per_block, 0,
num_kernels, data_im, data_im_offset, height, width, depth, num_kernels, data_im, data_im_offset, height, width, depth,
kernel_h, kernel_w, kernel_d, pad_h, pad_w, pad_d, kernel_h, kernel_w, kernel_d, pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d, height_col, width_col, depth_col, stride_h, stride_w, stride_d, height_col, width_col, depth_col,
...@@ -311,7 +311,7 @@ int col2im3d(const size_t max_threads_dim, gpudata * data_col, const size_t chan ...@@ -311,7 +311,7 @@ int col2im3d(const size_t max_threads_dim, gpudata * data_col, const size_t chan
int err; int err;
if(dilation_h != 1 || dilation_w != 1 || dilation_d != 1){ if(dilation_h != 1 || dilation_w != 1 || dilation_d != 1){
err = dilated_col2im3d_kernel_call( err = dilated_col2im3d_kernel_call(
1, &threads_per_block, &n_blocks, 0, 1, &n_blocks, &threads_per_block, 0,
num_kernels, data_col, height, width, depth, channels, patch_h, patch_w, num_kernels, data_col, height, width, depth, channels, patch_h, patch_w,
patch_d, dilation_h, dilation_w, dilation_d, pad_h, pad_w, pad_d, patch_d, dilation_h, dilation_w, dilation_d, pad_h, pad_w, pad_d,
stride_h, stride_w, stride_d, height_col, width_col, depth_col, stride_h, stride_w, stride_d, height_col, width_col, depth_col,
...@@ -324,7 +324,7 @@ int col2im3d(const size_t max_threads_dim, gpudata * data_col, const size_t chan ...@@ -324,7 +324,7 @@ int col2im3d(const size_t max_threads_dim, gpudata * data_col, const size_t chan
} }
else{ else{
err = col2im3d_kernel_call( err = col2im3d_kernel_call(
1, &threads_per_block, &n_blocks, 0, 1, &n_blocks, &threads_per_block, 0,
num_kernels, data_col, height, width, depth, channels, patch_h, patch_w, num_kernels, data_col, height, width, depth, channels, patch_h, patch_w,
patch_d, pad_h, pad_w, pad_d, stride_h, stride_w, stride_d, patch_d, pad_h, pad_w, pad_d, stride_h, stride_w, stride_d,
height_col, width_col, depth_col, data_im, data_im_offset); height_col, width_col, depth_col, data_im, data_im_offset);
......
...@@ -215,7 +215,7 @@ int im2col(const size_t max_threads_dim, ...@@ -215,7 +215,7 @@ int im2col(const size_t max_threads_dim,
GpuKernel *kernel; GpuKernel *kernel;
if(dilation_h != 1 || dilation_w != 1){ if(dilation_h != 1 || dilation_w != 1){
err = dilated_im2col_kernel_call( err = dilated_im2col_kernel_call(
1, &threads_per_block, &n_blocks, 0, 1, &n_blocks, &threads_per_block, 0,
num_kernels, data_im, data_im_offset, height, width, kernel_h, kernel_w, num_kernels, data_im, data_im_offset, height, width, kernel_h, kernel_w,
dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w, height_col, dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w, height_col,
width_col, data_col); width_col, data_col);
...@@ -227,7 +227,7 @@ int im2col(const size_t max_threads_dim, ...@@ -227,7 +227,7 @@ int im2col(const size_t max_threads_dim,
} }
else{ else{
err = im2col_kernel_call( err = im2col_kernel_call(
1, &threads_per_block, &n_blocks, 0, 1, &n_blocks, &threads_per_block, 0,
num_kernels, data_im, data_im_offset, height, width, kernel_h, kernel_w, num_kernels, data_im, data_im_offset, height, width, kernel_h, kernel_w,
pad_h, pad_w, stride_h, stride_w, height_col, pad_h, pad_w, stride_h, stride_w, height_col,
width_col, data_col); width_col, data_col);
...@@ -257,7 +257,7 @@ int col2im(const size_t max_threads_dim, gpudata * data_col, const size_t channe ...@@ -257,7 +257,7 @@ int col2im(const size_t max_threads_dim, gpudata * data_col, const size_t channe
int err; int err;
if(dilation_h != 1 || dilation_w != 1){ if(dilation_h != 1 || dilation_w != 1){
err = dilated_col2im_kernel_call( err = dilated_col2im_kernel_call(
1, &threads_per_block, &n_blocks, 0, 1, &n_blocks, &threads_per_block, 0,
num_kernels, data_col, height, width, channels, patch_h, patch_w, num_kernels, data_col, height, width, channels, patch_h, patch_w,
dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w,
height_col, width_col, data_im, data_im_offset); height_col, width_col, data_im, data_im_offset);
...@@ -269,7 +269,7 @@ int col2im(const size_t max_threads_dim, gpudata * data_col, const size_t channe ...@@ -269,7 +269,7 @@ int col2im(const size_t max_threads_dim, gpudata * data_col, const size_t channe
} }
else{ else{
err = col2im_kernel_call( err = col2im_kernel_call(
1, &threads_per_block, &n_blocks, 0, 1, &n_blocks, &threads_per_block, 0,
num_kernels, data_col, height, width, channels, patch_h, patch_w, num_kernels, data_col, height, width, channels, patch_h, patch_w,
pad_h, pad_w, stride_h, stride_w, pad_h, pad_w, stride_h, stride_w,
height_col, width_col, data_im, data_im_offset); height_col, width_col, data_im, data_im_offset);
......
...@@ -206,3 +206,13 @@ class TestCorrMM(unittest.TestCase): ...@@ -206,3 +206,13 @@ class TestCorrMM(unittest.TestCase):
self.run_gradinput(inputs_shape=(16, 15, 12, 10), self.run_gradinput(inputs_shape=(16, 15, 12, 10),
filters_shape=(10, 6, 12, 1), filters_shape=(10, 6, 12, 1),
subsample=(3, 1)) subsample=(3, 1))
def test_large_input(self):
# This tests the number-of-threads computation
# by making (channels * height) > (max_threads_dim ** 2).
# (See also issue #5165.)
self.run_conv_valid(inputs_shape=(1, 1024, 3, 1024),
filters_shape=(1, 1, 1, 1024),
verify_grad=False)
self.run_gradinput(inputs_shape=(1, 1024, 3, 1),
filters_shape=(1, 1, 1, 1024))
...@@ -207,3 +207,13 @@ class TestCorr3dMM(unittest.TestCase): ...@@ -207,3 +207,13 @@ class TestCorr3dMM(unittest.TestCase):
self.run_gradinput(inputs_shape=(16, 15, 12, 12, 10), self.run_gradinput(inputs_shape=(16, 15, 12, 12, 10),
filters_shape=(10, 6, 12, 4, 1), filters_shape=(10, 6, 12, 4, 1),
subsample=(3, 1, 2)) subsample=(3, 1, 2))
def test_large_input(self):
# This tests the number-of-threads computation
# by making (channels * height) > (max_threads_dim ** 2).
# (See also issue #5165.)
self.run_conv_valid(inputs_shape=(1, 1024, 3, 3, 1024),
filters_shape=(1, 1, 1, 1, 1024),
verify_grad=False)
self.run_gradinput(inputs_shape=(1, 1024, 3, 3, 1),
filters_shape=(1, 1, 1, 1, 1024))
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论