提交 cf91f745 authored 作者: Frederic Bastien's avatar Frederic Bastien

A first optimized implementation of conv2d with subsample. Works only for some shapes.

上级 899d98b6
...@@ -363,7 +363,7 @@ class GpuConv(Op): ...@@ -363,7 +363,7 @@ class GpuConv(Op):
return ['cuda_ndarray.cuh','<stdio.h>'] return ['cuda_ndarray.cuh','<stdio.h>']
def c_code_cache_version(self): def c_code_cache_version(self):
return (0,13) # raise this whenever modifying any of the support_code_files return (0,14) # raise this whenever modifying any of the support_code_files
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of these files # REMEMBER TO RAISE c_code_cache_version when changing any of these files
......
...@@ -280,6 +280,8 @@ conv_patch( float* img, float* kern, float* out, ...@@ -280,6 +280,8 @@ conv_patch( float* img, float* kern, float* out,
* *
* nkern: the number of kernel, used to compute the output image to store the result * nkern: the number of kernel, used to compute the output image to store the result
* nstack: the size of the stack, used to compute the image to load. * nstack: the size of the stack, used to compute the image to load.
* dx: patch stride rows(1 for normal convolution)
* dy: patch stride cols(1 for normal convolution)
* template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't * template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
* template accumulate: if true, we add the result, else we override the result * template accumulate: if true, we add the result, else we override the result
* template KERN_WIDTH: if 0, will work for any kern_wid, else it specialyse to this kern_wid as an optimization * template KERN_WIDTH: if 0, will work for any kern_wid, else it specialyse to this kern_wid as an optimization
...@@ -287,19 +289,19 @@ conv_patch( float* img, float* kern, float* out, ...@@ -287,19 +289,19 @@ conv_patch( float* img, float* kern, float* out,
* template kern_c_contiguous_2d: if true, the kernel have are collon and row contiguous * template kern_c_contiguous_2d: if true, the kernel have are collon and row contiguous
* template split: if true, each thread generate more then 1 output pixel, but use more registers. * template split: if true, each thread generate more then 1 output pixel, but use more registers.
* template preload_full_kern: if true, we load the full kernel in shared memory, else, we load 1 row at a time. * template preload_full_kern: if true, we load the full kernel in shared memory, else, we load 1 row at a time.
* template subsample: if false, remove some computation needed when dx or dy!=1.
*/ */
template<bool flipped_kern, bool accumulate, int KERN_WIDTH, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern> template<bool flipped_kern, bool accumulate, int KERN_WIDTH, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample>
__global__ void __global__ void
conv_patch_stack( float* img, float* kern, float* out, conv_patch_stack( float* img, float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid, int img_len, int img_wid, int kern_len, int kern_wid,
int out_len, int out_wid,
int nkern, int nstack, int img_stride_col,int img_stride_row, int nkern, int nstack, int img_stride_col,int img_stride_row,
int img_stride_stack, int img_stride_batch, int img_stride_stack, int img_stride_batch,
int kern_stride_col, int kern_stride_row, int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_nkern) int kern_stride_stack, int kern_stride_nkern, int dx, int dy)
{ {
int __shared__ out_len, out_wid, nb_thread_id; int __shared__ nb_thread_id;
out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x; nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[]; extern __shared__ float s_data[];
...@@ -346,7 +348,11 @@ conv_patch_stack( float* img, float* kern, float* out, ...@@ -346,7 +348,11 @@ conv_patch_stack( float* img, float* kern, float* out,
const float* idx_kern; const float* idx_kern;
if(preload_full_kern) idx_kern=&d_kern[row*kern_wid]; if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
else idx_kern=d_kern; else idx_kern=d_kern;
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col]; const float* idx_in;
if(subsample)
idx_in=&d_img[(row+out_row*dx)*img_wid+out_col*dy];
else
idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
} }
...@@ -368,7 +374,7 @@ conv_patch_stack( float* img, float* kern, float* out, ...@@ -368,7 +374,7 @@ conv_patch_stack( float* img, float* kern, float* out,
//TODO: inverse the out_row and stack loop to don't load the date as frequently! //TODO: inverse the out_row and stack loop to don't load the date as frequently!
//TODO: do this happen elsewhere? //TODO: do this happen elsewhere?
for(int out_row=ty;out_row<out_len_max;out_row+=blockDim.y){ for(;out_row<out_len_max;out_row+=blockDim.y){
float sum = 0.0f; float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++){ for (int stack = 0;stack<nstack;stack++){
//TODO: load only the part of the image needed or put the partial result in shared memory //TODO: load only the part of the image needed or put the partial result in shared memory
...@@ -397,7 +403,11 @@ conv_patch_stack( float* img, float* kern, float* out, ...@@ -397,7 +403,11 @@ conv_patch_stack( float* img, float* kern, float* out,
const float* idx_kern; const float* idx_kern;
if(preload_full_kern) idx_kern=&d_kern[row*kern_wid]; if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
else idx_kern=d_kern; else idx_kern=d_kern;
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col]; const float* idx_in;
if(subsample)
idx_in=&d_img[(row+out_row*dx)*img_wid+out_col*dy];
else
idx_in=&d_img[(row+out_row)*img_wid+out_col];
//if needed as on Fermi as reading out of bound index from shared memory generate an error. //if needed as on Fermi as reading out of bound index from shared memory generate an error.
//Not needed on generation before as they worked anyway. Removing the if generate the good code //Not needed on generation before as they worked anyway. Removing the if generate the good code
......
...@@ -282,8 +282,7 @@ def get_valid_shapes(): ...@@ -282,8 +282,7 @@ def get_valid_shapes():
shapes += get_shapes2(scales_img=(2,2),img_stride=(-1,-1)) shapes += get_shapes2(scales_img=(2,2),img_stride=(-1,-1))
shapes += get_shapes2(scales_img=(2,2),kern_stride=(-1,-1)) shapes += get_shapes2(scales_img=(2,2),kern_stride=(-1,-1))
#test subsample #test subsample done in a separate fct
shapes += get_shapes2(scales_img=(2,2),subsample=(2,2))
shapes += [ shapes += [
#other test #other test
...@@ -502,8 +501,7 @@ def test_full(): ...@@ -502,8 +501,7 @@ def test_full():
shapes += get_shapes2(scales_img=(2,2),img_stride=(-1,-1)) shapes += get_shapes2(scales_img=(2,2),img_stride=(-1,-1))
shapes += get_shapes2(scales_img=(2,2),kern_stride=(-1,-1)) shapes += get_shapes2(scales_img=(2,2),kern_stride=(-1,-1))
#test subsample #test subsample done in a separate fct
shapes += get_shapes2(scales_img=(2,2),subsample=(2,2))
shapes += [ shapes += [
#other test #other test
...@@ -552,22 +550,32 @@ def test_full(): ...@@ -552,22 +550,32 @@ def test_full():
def test_subsample(): def test_subsample():
# implement when # implement when
shapes = [ shapes = [
((1, 1, 1, 1), (1, 1, 1, 1), (1,1)) ((1, 1, 1, 1), (1, 1, 1, 1), (1,1), (1,1), (1,1))
, ((1, 1, 1, 1), (1, 1, 1, 1), (2,2)) , ((1, 1, 1, 1), (1, 1, 1, 1), (2,2), (1,1), (1,1))
, ((4, 2, 10, 10), (3, 2, 2, 2), (1, 3)) , ((4, 2, 10, 10), (3, 2, 2, 2), (1, 3), (1,1), (1,1))
, ((4, 2, 10, 10), (3, 2, 2, 2), (3, 3)) , ((4, 2, 10, 10), (3, 2, 2, 2), (3, 3), (1,1), (1,1))
, ((4, 2, 10, 10), (3, 2, 2, 2), (3, 1)) , ((4, 2, 10, 10), (3, 2, 2, 2), (3, 1), (1,1), (1,1))
] ]
all_good = True shapes += get_shapes2(scales_img=(2,2),subsample=(1,1))
shapes += get_shapes2(scales_img=(2,2),subsample=(1,2))
_params_allgood_header() shapes += get_shapes2(scales_img=(2,2),subsample=(2,1))
for ishape, kshape, ds in shapes: shapes += get_shapes2(scales_img=(2,2),subsample=(2,2))
if not _params_allgood(ishape, kshape, 'full', subsample=ds):
all_good = False #We put only the version that implement the subsample to make the test faster.
if not _params_allgood(ishape, kshape, 'valid', subsample=ds): version_valid = [-2,-1,1,3,11,12]
all_good = False version_full = [-2,-1]
assert all_good verbose = 0
random = True
print_ = False
ones = False
if ones:
random = False
#test
random = False
exec_conv(version_valid, shapes, verbose, random, 'valid', print_=print_, ones=ones)
exec_conv(version_full, shapes, verbose, random, 'full', print_=print_, ones=ones)
## See #616 ## See #616
#def test_logical_shapes(): #def test_logical_shapes():
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论