提交 0d5cffbe authored 作者: Sean Lee's avatar Sean Lee

Force instantiate kernel templates

上级 89f584bc
...@@ -203,11 +203,8 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -203,11 +203,8 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
int, int, int, int, int, int, int, int,
int, int); int, int);
#define CONV_PATCH_SPECIAL(kern_wid) \ if(threads.y==out_len) f=conv_patch_2;
if(threads.y==out_len) f=conv_patch<true,kern_wid,false>;\ else f=conv_patch_3;
else f=conv_patch<true,kern_wid,true>;
CONV_PATCH_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size>>> f<<< grid, threads, shared_size>>>
(cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out), (cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out),
...@@ -267,41 +264,39 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -267,41 +264,39 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
int, int, int, int, int, int, int, int,
int, int); int, int);
#define CONV_PATCH_STACK_SPECIAL(kern_wid) \ if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_64;}
if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true,true>;} \ else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_65;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true,true>;} \ else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_66;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true,true>;}\ else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_67;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true,true>;}\ else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_68;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true,true>;}\ else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_69;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true,true>;}\ else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_70;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true,true>;}\ else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_71;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true,true>;}\ else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_72;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false,true>;}\ else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_73;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false,true>;}\ else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_74;}
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false,true>;}\ else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_75;}
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false,true>;}\ else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_76;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false,true>;} \ else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_77;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false,true>;} \ else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_78;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false,true>;} \ else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_79;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false,true>;} \ else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_80;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true,false>;} \ else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_81;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true,false>;} \ else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_82;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true,false>;}\ else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_83;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true,false>;}\ else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_84;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true,false>;}\ else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_85;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true,false>;}\ else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_86;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true,false>;}\ else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_87;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true,false>;}\ else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_88;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false,false>;}\ else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_89;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false,false>;}\ else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_90;}
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false,false>;}\ else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_91;}
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false,false>;}\ else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_92;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false,false>;} \ else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_93;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false,false>;} \ else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_94;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false,false>;} \ else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_95;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false,false>;}
CONV_PATCH_STACK_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size>>> f<<< grid, threads, shared_size>>>
(cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out), (cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out),
img_len, img_wid, kern_len, kern_wid, img_len, img_wid, kern_len, kern_wid,
...@@ -380,11 +375,9 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -380,11 +375,9 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
int, int, int, int, int, int, int, int,
int, int); int, int);
#define CONV_ROWS_SPECIAL(kern_wid) \ if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_0;
if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows<kern_wid, false>;\ else f = conv_rows_1;
else f = conv_rows<kern_wid, true>;\
CONV_ROWS_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size >>> f<<< grid, threads, shared_size >>>
(cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out), (cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out),
img_len, img_wid, kern_len, kern_wid, nkern, nstack, img_len, img_wid, kern_len, kern_wid, nkern, nstack,
...@@ -450,10 +443,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -450,10 +443,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
if(!img_contiguous_2d || !kern_contiguous_2d) { if(!img_contiguous_2d || !kern_contiguous_2d) {
//fprintf(stderr, "using false version\n"); //fprintf(stderr, "using false version\n");
f = conv_rows_stack<THEANO_KERN_WID, false>; f = conv_rows_stack_0;
} else { } else {
//fprintf(stderr, "using true version\n"); //fprintf(stderr, "using true version\n");
f = conv_rows_stack<THEANO_KERN_WID, true>; f = conv_rows_stack_1;
} }
f<<< grid, threads, shared_size >>> f<<< grid, threads, shared_size >>>
...@@ -535,13 +528,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -535,13 +528,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
int, int, int, int, int, int, int, int,
int, int); int, int);
#define CONV_ROWS_STACK2_SPECIAL(kern_wid) \ if((!img_contiguous_2d || !kern_contiguous_2d)&&version==9) f = conv_rows_stack2_1;
if((!img_contiguous_2d || !kern_contiguous_2d)&&version==9) f = conv_rows_stack2<kern_wid, false,true>;\ else if(version==9) f = conv_rows_stack2_3;
else if(version==9) f = conv_rows_stack2<kern_wid, true,true>;\ else if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_stack2_0;
else if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_stack2<kern_wid, false, false>;\ else f = conv_rows_stack2_2;
else f = conv_rows_stack2<kern_wid, true, false>;
CONV_ROWS_STACK2_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size >>> f<<< grid, threads, shared_size >>>
(cuda_get_ptr(img), (cuda_get_ptr(img),
...@@ -663,24 +653,23 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -663,24 +653,23 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//printf("kern_flipped=%d, ccontig=%d, split=%d, full_kern=%d\n",kern_flipped,ccontig,split,full_kern); //printf("kern_flipped=%d, ccontig=%d, split=%d, full_kern=%d\n",kern_flipped,ccontig,split,full_kern);
//We will always be split when we don't load the full kernel //We will always be split when we don't load the full kernel
#define CONV_PATCH_STACK_REDUCE_SPECIAL(kern_wid) \
if (kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, true>;\ /* if(!kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce_0;*/
else if(kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, true>;\ /*else*/ if(!kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce_1;
else if(kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, true>;\ else if(!kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce_2;
else if(kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, true>;\ else if(!kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce_3;
else if(!kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, true>;\ /*else if(!kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce_4;*/
else if(!kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, true>;\ else if(!kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce_5;
else if(!kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, true>;\ else if(!kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce_6;
else if(!kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, true>;\ else if(!kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce_7;
/*else if(kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, false>;*/\ /*else if(kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce_8;*/
/*else if(kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, false>;*/\ else if(kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce_9;
else if(kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, false>;\ else if(kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce_10;
else if(kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, false>;\ else if(kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce_11;
/*else if(!kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, false>;*/\ /*else if(kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce_12;*/
/*else if(!kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, false>;*/\ else if(kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce_13;
else if(!kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, false>;\ else if(kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce_14;
else if(!kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, false>; else if(kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce_15;
CONV_PATCH_STACK_REDUCE_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size>>>(cuda_get_ptr(img), kern_data_unflipped, cuda_get_ptr(out), f<<< grid, threads, shared_size>>>(cuda_get_ptr(img), kern_data_unflipped, cuda_get_ptr(out),
img_len, img_wid, kern_len, kern_wid, img_len, img_wid, kern_len, kern_wid,
...@@ -770,9 +759,9 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img, ...@@ -770,9 +759,9 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//std::cerr << "kerns " << nstack << " " << kern_len << "\n"; //std::cerr << "kerns " << nstack << " " << kern_len << "\n";
//std::cerr << "n_reduce_buf/sizeof(float) " << n_reduce_buf / sizeof(float) << "\n"; //std::cerr << "n_reduce_buf/sizeof(float) " << n_reduce_buf / sizeof(float) << "\n";
if(block_nstack==nstack) if(block_nstack==nstack)
f=conv_valid_row_reduce<false>; f=conv_valid_row_reduce_0;
else else
f=conv_valid_row_reduce<true>; f=conv_valid_row_reduce_1;
f<<<n_blocks, n_threads, n_reduce_buf>>>( f<<<n_blocks, n_threads, n_reduce_buf>>>(
nbatch, nkern, PyGpuArray_DIMS(img)[1], nbatch, nkern, PyGpuArray_DIMS(img)[1],
img_len, img_wid, img_len, img_wid,
...@@ -1105,23 +1094,20 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1105,23 +1094,20 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
int, int, int, int, int, int, int, int,
int, int); int, int);
#define CONV_FULL_PATCH_STACK_PADDED_SPECIAL(kern_wid) \ if(version==3) f=conv_full_patch_stack_padded_0;
if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,false,false>;\ else if(version==5) f=conv_full_patch_stack_padded_1;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,true,false>;\ else if(version==4) f=conv_full_patch_stack_padded_2;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,false,true>;\ else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3) f=conv_full_patch_stack_padded_4;
else if(version==3 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,false,false,false>;\ else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5) f=conv_full_patch_stack_padded_5;
else if(version==4 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,true,false>;\ else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4) f=conv_full_patch_stack_padded_6;
else if(version==5 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,false,true>;\ else if(version==3 && kern_flipped) f=conv_full_patch_stack_padded_8;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3) f=conv_full_patch_stack_padded<false,kern_wid,true,false,false>;\ else if(version==5 && kern_flipped)f=conv_full_patch_stack_padded_9;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4) f=conv_full_patch_stack_padded<false,kern_wid,true,true,false>;\ else if(version==4 && kern_flipped)f=conv_full_patch_stack_padded_10;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5) f=conv_full_patch_stack_padded<false,kern_wid,true,false,true>;\ else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3 && kern_flipped) f=conv_full_patch_stack_padded_12;
else if(version==3) f=conv_full_patch_stack_padded<false,kern_wid,false,false,false>;\ else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5 && kern_flipped) f=conv_full_patch_stack_padded_13;
else if(version==4) f=conv_full_patch_stack_padded<false,kern_wid,false,true,false>;\ else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4 && kern_flipped) f=conv_full_patch_stack_padded_14;
else if(version==5) f=conv_full_patch_stack_padded<false,kern_wid,false,false,true>;\
else assert(false); else assert(false);
CONV_FULL_PATCH_STACK_PADDED_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size>>> f<<< grid, threads, shared_size>>>
(cuda_get_ptr(img), kern_data_unflipped, cuda_get_ptr(out), (cuda_get_ptr(img), kern_data_unflipped, cuda_get_ptr(out),
img_len, img_wid, kern_len, kern_wid, nkern, nstack, img_len, img_wid, kern_len, kern_wid, nkern, nstack,
...@@ -1225,9 +1211,7 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1225,9 +1211,7 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
//typeof(conv_full_load_everything<0>) f = ; //typeof(conv_full_load_everything<0>) f = ;
void (*f)(const float*, const float*, float*, void (*f)(const float*, const float*, float*,
int, int, int, int, int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int) = conv_full_load_everything<0>; int, int, int, int, int, int, int, int) = conv_full_load_everything;
f = conv_full_load_everything<THEANO_KERN_WID>;
f<<< grid, threads, shared_size>>> f<<< grid, threads, shared_size>>>
(cuda_get_ptr(img), (cuda_get_ptr(img),
...@@ -1284,10 +1268,10 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern, ...@@ -1284,10 +1268,10 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
int, int, int, int, int, int, int, int,
int, int, int, int); int, int, int, int);
if(img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack<true,true>;\ if(!img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack_0;
else if(img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack<true,false>;\ else if(!img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack_1;
else if(!img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack<false,true>;\ else if(img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack_2;
else if(!img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack<false,false>; else if(img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack_3;
f<<< grid, threads, shared_size>>>( f<<< grid, threads, shared_size>>>(
cuda_get_ptr(img), cuda_get_ptr(img),
......
...@@ -122,7 +122,7 @@ conv_full_patch( const float* img, const float* kern, float* out, ...@@ -122,7 +122,7 @@ conv_full_patch( const float* img, const float* kern, float* out,
//template c_contiguous: if true, the img and kern have are column and row contiguous else we use the stride value from the param. The image need to be c_contiguous in the nbatch and nstack dimensions. //template c_contiguous: if true, the img and kern have are column and row contiguous else we use the stride value from the param. The image need to be c_contiguous in the nbatch and nstack dimensions.
template<bool img_c_contiguous_2d, bool kern_c_contiguous_2d> template<bool img_c_contiguous_2d, bool kern_c_contiguous_2d>
__global__ void __device__ inline void
conv_full_patch_stack( const float* img, const float* kern, float* out, conv_full_patch_stack( const float* img, const float* kern, float* out,
int img_len, int img_wid, int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack, int kern_len, int kern_wid, int nkern, int nstack,
...@@ -182,6 +182,31 @@ conv_full_patch_stack( const float* img, const float* kern, float* out, ...@@ -182,6 +182,31 @@ conv_full_patch_stack( const float* img, const float* kern, float* out,
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
} }
extern "C" {
#define __INSTANTIATE_CONV_FULL_PATCH_STACK(suffix, ...) \
__global__ void \
conv_full_patch_stack_##suffix( \
const float *img, const float *kern, float *out, \
int img_len, int img_wid, \
int kern_len, int kern_wid, int nkern, int nstack, \
int img_stride_col, int img_stride_row, \
int kern_stride_col, int kern_stride_row, \
int kern_stride_stack, int kern_stride_nkern) \
{ \
conv_full_patch_stack<__VA_ARGS__>( \
img, kern, out, img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img_stride_col, img_stride_row, kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \
}
__INSTANTIATE_CONV_FULL_PATCH_STACK(0, false, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK(1, false, true)
__INSTANTIATE_CONV_FULL_PATCH_STACK(2, true, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK(3, true, true)
#undef __INSTANTIATE_CONV_FULL_PATCH_STACK
}
/** /**
* As conv_patch_stack, but used for the full convolution by padding the image in shared memory. * As conv_patch_stack, but used for the full convolution by padding the image in shared memory.
* I keep it separated from conv_patch as we take 19-20 register which is more than the 10/16 max for each thread and thus this could lower the occupency. * I keep it separated from conv_patch as we take 19-20 register which is more than the 10/16 max for each thread and thus this could lower the occupency.
...@@ -200,8 +225,8 @@ conv_full_patch_stack( const float* img, const float* kern, float* out, ...@@ -200,8 +225,8 @@ conv_full_patch_stack( const float* img, const float* kern, float* out,
* template low_mem: if true, as split but with use less dynamic shared memory but use more registers. * template low_mem: if true, as split but with use less dynamic shared memory but use more registers.
* if you set split and low_mem to true, we will use the low_mem version! * if you set split and low_mem to true, we will use the low_mem version!
*/ */
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool low_mem > template<bool flipped_kern, bool c_contiguous, bool split, bool low_mem >
__global__ void __device__ inline void
conv_full_patch_stack_padded( const float* img, const float* kern, float* out, conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const int img_len, const int img_wid, const int img_len, const int img_wid,
const int kern_len, const int kern_wid, const int kern_len, const int kern_wid,
...@@ -257,7 +282,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out, ...@@ -257,7 +282,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col]; const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid); convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
} }
} }
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
...@@ -292,7 +317,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out, ...@@ -292,7 +317,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col]; const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid); convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
} }
if(out_row<out_len) if(out_row<out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
...@@ -340,7 +365,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out, ...@@ -340,7 +365,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row-out_row_iter*nb_rows)*img_wid_valid+out_col]; const float* idx_in=&d_img[(row+out_row-out_row_iter*nb_rows)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid); convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
} }
} }
if(out_row<out_len) if(out_row<out_len)
...@@ -351,6 +376,42 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out, ...@@ -351,6 +376,42 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
} }
} }
extern "C" {
#define __INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(suffix, ...) \
__global__ void \
conv_full_patch_stack_padded_##suffix( \
const float *img, const float *kern, float *out, \
const int img_len, const int img_wid, \
const int kern_len, const int kern_wid, \
const int nkern, const int nstack, \
const int img_stride_col, const int img_stride_row, \
const int img_stride_stack, const int img_stride_batch, \
const int kern_stride_col, const int kern_stride_row, \
const int kern_stride_stack, const int kern_stride_nkern) \
{ \
conv_full_patch_stack_padded<__VA_ARGS__>( \
img, kern, out, img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
kern_stride_col, kern_stride_row, \
kern_stride_stack, kern_stride_nkern); \
}
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(0, false, false, false, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(1, false, false, false, true)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(2, false, false, true, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(4, false, true, false, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(5, false, true, false, true)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(6, false, true, true, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(8, true, false, false, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(9, true, false, false, true)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(10, true, false, true, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(12, true, true, false, false)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(13, true, true, false, true)
__INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(14, true, true, true, false)
#undef __INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED
}
template <int i> __device__ float everything_dot(const float * x, const int sx, const float * y, const int sy) template <int i> __device__ float everything_dot(const float * x, const int sx, const float * y, const int sy)
{ {
return everything_dot<i/2>(x, sx, y, sy) + everything_dot<(i+1)/2>(x+sy*(i/2), sx, y+sy*(i/2), sy) ; return everything_dot<i/2>(x, sx, y, sy) + everything_dot<(i+1)/2>(x+sy*(i/2), sx, y+sy*(i/2), sy) ;
...@@ -364,7 +425,6 @@ template <> __device__ float everything_dot<1>(const float * x, const int sx, co ...@@ -364,7 +425,6 @@ template <> __device__ float everything_dot<1>(const float * x, const int sx, co
{ {
return x[0] * y[0]; return x[0] * y[0];
} }
template<int NSTACK>
__global__ void __global__ void
conv_full_load_everything( const float* img, const float* kern, float* out, conv_full_load_everything( const float* img, const float* kern, float* out,
int img_len, int img_wid, int img_len, int img_wid,
...@@ -423,9 +483,9 @@ conv_full_load_everything( const float* img, const float* kern, float* out, ...@@ -423,9 +483,9 @@ conv_full_load_everything( const float* img, const float* kern, float* out,
{ {
int icol = out_col - kern_wid+1+col; int icol = out_col - kern_wid+1+col;
if (icol < 0 || icol > img_wid) continue; if (icol < 0 || icol > img_wid) continue;
if (NSTACK > 0) if (THEANO_KERN_WID > 0)
{ {
sum += everything_dot<NSTACK>(d_img + irow*img_wid + icol, img_len*img_wid, sum += everything_dot<THEANO_KERN_WID>(d_img + irow*img_wid + icol, img_len*img_wid,
d_kern + row*kern_wid+col, kern_len*kern_wid); d_kern + row*kern_wid+col, kern_len*kern_wid);
} }
else else
......
...@@ -182,12 +182,11 @@ template<> __device__ float convolutionRowNoFlip<0>(const float *data, ...@@ -182,12 +182,11 @@ template<> __device__ float convolutionRowNoFlip<0>(const float *data,
return 0; return 0;
} }
template<int KERN_WIDTH>
__device__ void convolutionRowNoFlip(float& sum, __device__ void convolutionRowNoFlip(float& sum,
const float *data, const float *data,
const float *kern, const int kern_wid){ const float *kern, const int kern_wid){
if(KERN_WIDTH>0) if(THEANO_KERN_WID>0)
sum+=convolutionRowNoFlip<KERN_WIDTH>(data,kern); sum+=convolutionRowNoFlip<THEANO_KERN_WID>(data,kern);
else else
#pragma unroll 8 #pragma unroll 8
for (int col=0; col < kern_wid; col++) {//loop over col for (int col=0; col < kern_wid; col++) {//loop over col
...@@ -219,8 +218,8 @@ __device__ void store_or_accumulate(float& dst,const float value ){ ...@@ -219,8 +218,8 @@ __device__ void store_or_accumulate(float& dst,const float value ){
* When true, allow for output image bigger then 512 pixel. * When true, allow for output image bigger then 512 pixel.
* Use more registers. * Use more registers.
*/ */
template<bool flipped_kern, int KERN_WIDTH, bool split> template<bool flipped_kern, bool split>
__global__ void __device__ inline void
conv_patch( const float* img, const float* kern, float* out, conv_patch( const float* img, const float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid, int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack) int nkern, int nstack)
...@@ -260,7 +259,7 @@ conv_patch( const float* img, const float* kern, float* out, ...@@ -260,7 +259,7 @@ conv_patch( const float* img, const float* kern, float* out,
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col]; const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
} }
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
blockIdx.y*out_wid*out_len+//the output image blockIdx.y*out_wid*out_len+//the output image
...@@ -271,7 +270,7 @@ conv_patch( const float* img, const float* kern, float* out, ...@@ -271,7 +270,7 @@ conv_patch( const float* img, const float* kern, float* out,
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col]; const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
} }
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image kern_id*out_wid*out_len+//the output image
...@@ -280,6 +279,24 @@ conv_patch( const float* img, const float* kern, float* out, ...@@ -280,6 +279,24 @@ conv_patch( const float* img, const float* kern, float* out,
} }
} }
/* Force-instantiation wrappers for the conv_patch<> template kernel.
 * extern "C" gives each needed instantiation an unmangled symbol, so the
 * host dispatch code in this file can select a concrete kernel by name
 * (e.g. f = conv_patch_2) and launch it through a function pointer.
 * Suffix -> template args <flipped_kern, split>:
 *   conv_patch_2 = conv_patch<true, false>
 *   conv_patch_3 = conv_patch<true, true>
 * (i.e. the suffix is the flags read as a 2-bit number, flipped_kern high.)
 */
extern "C" {

/* Helper macro renamed from __INSTANTIATE_CONV_PATCH: identifiers that
 * begin with a double underscore are reserved for the implementation in
 * C++, so user code must not define them. */
#define INSTANTIATE_CONV_PATCH(suffix, ...) \
__global__ void \
conv_patch_##suffix(const float *img, const float *kern, float *out, \
                    int img_len, int img_wid, int kern_len, int kern_wid, \
                    int nkern, int nstack) \
{ \
    conv_patch<__VA_ARGS__>(img, kern, out, img_len, img_wid, kern_len, \
                            kern_wid, nkern, nstack); \
}

INSTANTIATE_CONV_PATCH(2, true, false)
INSTANTIATE_CONV_PATCH(3, true, true)

#undef INSTANTIATE_CONV_PATCH
}
/** /**
* As conv_patch, but implement the stack in the kernel. * As conv_patch, but implement the stack in the kernel.
* I keep it separated from conv_patch as we take more registers and this could lower the occupency. * I keep it separated from conv_patch as we take more registers and this could lower the occupency.
...@@ -295,15 +312,14 @@ conv_patch( const float* img, const float* kern, float* out, ...@@ -295,15 +312,14 @@ conv_patch( const float* img, const float* kern, float* out,
* dy: patch stride cols(1 for normal convolution) * dy: patch stride cols(1 for normal convolution)
* template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't * template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
* template accumulate: if true, we add the result, else we override the result * template accumulate: if true, we add the result, else we override the result
* template KERN_WIDTH: if 0, will work for any kern_wid, else it specialyse to this kern_wid as an optimization
* template img_c_contiguous_2d: if true, the img have are collon and row contiguous * template img_c_contiguous_2d: if true, the img have are collon and row contiguous
* template kern_c_contiguous_2d: if true, the kernel have are collon and row contiguous * template kern_c_contiguous_2d: if true, the kernel have are collon and row contiguous
* template split: if true, each thread generate more than 1 output pixel, but use more registers. * template split: if true, each thread generate more than 1 output pixel, but use more registers.
* template preload_full_kern: if true, we load the full kernel in shared memory, else, we load 1 row at a time. * template preload_full_kern: if true, we load the full kernel in shared memory, else, we load 1 row at a time.
* template subsample: if false, remove some computation needed when dx or dy!=1. * template subsample: if false, remove some computation needed when dx or dy!=1.
*/ */
template<bool flipped_kern, bool accumulate, int KERN_WIDTH, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample> template<bool flipped_kern, bool accumulate, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample>
__global__ void __device__ inline void
conv_patch_stack( const float* img, const float* kern, float* out, conv_patch_stack( const float* img, const float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid, int img_len, int img_wid, int kern_len, int kern_wid,
int out_len, int out_wid, int out_len, int out_wid,
...@@ -365,7 +381,7 @@ conv_patch_stack( const float* img, const float* kern, float* out, ...@@ -365,7 +381,7 @@ conv_patch_stack( const float* img, const float* kern, float* out,
else else
idx_in=&d_img[(row+out_row)*img_wid+out_col]; idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
} }
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
} }
...@@ -425,7 +441,7 @@ conv_patch_stack( const float* img, const float* kern, float* out, ...@@ -425,7 +441,7 @@ conv_patch_stack( const float* img, const float* kern, float* out,
//as we store the result of only the good thread. //as we store the result of only the good thread.
//This was with nvcc 3.0 on an GTX470 card. //This was with nvcc 3.0 on an GTX470 card.
if(out_row<out_len) if(out_row<out_len)
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
} }
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
} }
...@@ -440,6 +456,63 @@ conv_patch_stack( const float* img, const float* kern, float* out, ...@@ -440,6 +456,63 @@ conv_patch_stack( const float* img, const float* kern, float* out,
} }
/* Force-instantiation wrappers for the conv_patch_stack<> template kernel.
 * extern "C" gives each instantiation an unmangled symbol that the host
 * dispatch code selects by name (e.g. f = conv_patch_stack_64).
 * Suffix encoding: the 7 boolean template arguments
 *   <flipped_kern, accumulate, img_c_contiguous_2d, kern_c_contiguous_2d,
 *    split, preload_full_kern, subsample>
 * read as a 7-bit number, flipped_kern being the high bit.  All variants
 * here have flipped_kern=true and accumulate=false, hence the 64..95 range
 * covering every combination of the low five flags.
 */
extern "C" {

/* Renamed from __INSTANTIATE_CONV_PATCH_STACK: double-underscore
 * identifiers are reserved for the implementation in C++. */
#define INSTANTIATE_CONV_PATCH_STACK(suffix, ...) \
__global__ void \
conv_patch_stack_##suffix(const float *img, const float *kern, float *out, \
                          int img_len, int img_wid, int kern_len, int kern_wid, \
                          int out_len, int out_wid, int nkern, int nstack, \
                          int img_stride_col, int img_stride_row, \
                          int img_stride_stack, int img_stride_batch, \
                          int kern_stride_col, int kern_stride_row, \
                          int kern_stride_stack, int kern_stride_nkern, \
                          int dx, int dy) \
{ \
    conv_patch_stack<__VA_ARGS__>( \
        img, kern, out, img_len, img_wid, kern_len, kern_wid, out_len, \
        out_wid, nkern, nstack, img_stride_col, img_stride_row, \
        img_stride_stack, img_stride_batch, \
        kern_stride_col, kern_stride_row, \
        kern_stride_stack, kern_stride_nkern, dx, dy); \
}

INSTANTIATE_CONV_PATCH_STACK(64, true, false, false, false, false, false, false)
INSTANTIATE_CONV_PATCH_STACK(65, true, false, false, false, false, false, true)
INSTANTIATE_CONV_PATCH_STACK(66, true, false, false, false, false, true, false)
INSTANTIATE_CONV_PATCH_STACK(67, true, false, false, false, false, true, true)
INSTANTIATE_CONV_PATCH_STACK(68, true, false, false, false, true, false, false)
INSTANTIATE_CONV_PATCH_STACK(69, true, false, false, false, true, false, true)
INSTANTIATE_CONV_PATCH_STACK(70, true, false, false, false, true, true, false)
INSTANTIATE_CONV_PATCH_STACK(71, true, false, false, false, true, true, true)
INSTANTIATE_CONV_PATCH_STACK(72, true, false, false, true, false, false, false)
INSTANTIATE_CONV_PATCH_STACK(73, true, false, false, true, false, false, true)
INSTANTIATE_CONV_PATCH_STACK(74, true, false, false, true, false, true, false)
INSTANTIATE_CONV_PATCH_STACK(75, true, false, false, true, false, true, true)
INSTANTIATE_CONV_PATCH_STACK(76, true, false, false, true, true, false, false)
INSTANTIATE_CONV_PATCH_STACK(77, true, false, false, true, true, false, true)
INSTANTIATE_CONV_PATCH_STACK(78, true, false, false, true, true, true, false)
INSTANTIATE_CONV_PATCH_STACK(79, true, false, false, true, true, true, true)
INSTANTIATE_CONV_PATCH_STACK(80, true, false, true, false, false, false, false)
INSTANTIATE_CONV_PATCH_STACK(81, true, false, true, false, false, false, true)
INSTANTIATE_CONV_PATCH_STACK(82, true, false, true, false, false, true, false)
INSTANTIATE_CONV_PATCH_STACK(83, true, false, true, false, false, true, true)
INSTANTIATE_CONV_PATCH_STACK(84, true, false, true, false, true, false, false)
INSTANTIATE_CONV_PATCH_STACK(85, true, false, true, false, true, false, true)
INSTANTIATE_CONV_PATCH_STACK(86, true, false, true, false, true, true, false)
INSTANTIATE_CONV_PATCH_STACK(87, true, false, true, false, true, true, true)
INSTANTIATE_CONV_PATCH_STACK(88, true, false, true, true, false, false, false)
INSTANTIATE_CONV_PATCH_STACK(89, true, false, true, true, false, false, true)
INSTANTIATE_CONV_PATCH_STACK(90, true, false, true, true, false, true, false)
INSTANTIATE_CONV_PATCH_STACK(91, true, false, true, true, false, true, true)
INSTANTIATE_CONV_PATCH_STACK(92, true, false, true, true, true, false, false)
INSTANTIATE_CONV_PATCH_STACK(93, true, false, true, true, true, false, true)
INSTANTIATE_CONV_PATCH_STACK(94, true, false, true, true, true, true, false)
INSTANTIATE_CONV_PATCH_STACK(95, true, false, true, true, true, true, true)

#undef INSTANTIATE_CONV_PATCH_STACK
}
/** /**
* As conv_patch_stack, but kern_len thread for each output pixel * As conv_patch_stack, but kern_len thread for each output pixel
* I keep it separated as use more register. * I keep it separated as use more register.
...@@ -454,8 +527,8 @@ conv_patch_stack( const float* img, const float* kern, float* out, ...@@ -454,8 +527,8 @@ conv_patch_stack( const float* img, const float* kern, float* out,
* template img_contiguous: if true, the img have are collon and row contiguous * template img_contiguous: if true, the img have are collon and row contiguous
* template preload_full_kern: work only when split is true. We don't load the full kernel at once, but we load ceil_intdiv(kern_len/nb_split) kernel row at a time * template preload_full_kern: work only when split is true. We don't load the full kernel at once, but we load ceil_intdiv(kern_len/nb_split) kernel row at a time
*/ */
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool preload_full_kern> template<bool flipped_kern, bool c_contiguous, bool split, bool preload_full_kern>
__global__ void __device__ inline void
conv_patch_stack_reduce( const float* img, const float* kern, float* out, conv_patch_stack_reduce( const float* img, const float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid, int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack, int img_stride_col,int img_stride_row, int nkern, int nstack, int img_stride_col,int img_stride_row,
...@@ -521,7 +594,7 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out, ...@@ -521,7 +594,7 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
const float* idx_in=&d_img[(first_row+tz+out_row)*img_wid+out_col]; const float* idx_in=&d_img[(first_row+tz+out_row)*img_wid+out_col];
float sum2 = 0; float sum2 = 0;
if(tz<len3) if(tz<len3)
convolutionRowNoFlip<KERN_WIDTH>(sum2,idx_in,idx_kern,kern_wid); convolutionRowNoFlip(sum2,idx_in,idx_kern,kern_wid);
sum+=sum2; sum+=sum2;
} }
}else if(split){ }else if(split){
...@@ -531,7 +604,7 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out, ...@@ -531,7 +604,7 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
for(int row=tz;row<kern_len;row+=blockDim.z){ for(int row=tz;row<kern_len;row+=blockDim.z){
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col]; const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
} }
}else{ }else{
int row = tz;//The row of the kernel. int row = tz;//The row of the kernel.
...@@ -540,7 +613,7 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out, ...@@ -540,7 +613,7 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len, load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous); kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads(); __syncthreads();
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
} }
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
} }
...@@ -559,6 +632,45 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out, ...@@ -559,6 +632,45 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
} }
} }
/* Force-instantiation wrappers for the conv_patch_stack_reduce<> template
 * kernel, exported with unmangled C names for the host dispatch code.
 * Suffix encoding: the 4 boolean template arguments
 *   <flipped_kern, c_contiguous, split, preload_full_kern>
 * read as a 4-bit number, flipped_kern being the high bit.
 * The commented-out suffixes (0, 4, 8, 12) are exactly the combinations
 * with split=false and preload_full_kern=false — presumably never chosen
 * by the host dispatch, so they are not compiled in (TODO confirm against
 * the dispatch code).
 */
extern "C" {

/* Renamed from __INSTANTIATE_CONV_PATCH_STACK_REDUCE: double-underscore
 * identifiers are reserved for the implementation in C++.  The stray
 * trailing ';' after each invocation was also dropped: a macro invocation
 * already expands to a full function definition, and the extra empty
 * declarations were inconsistent with the other instantiation blocks. */
#define INSTANTIATE_CONV_PATCH_STACK_REDUCE(suffix, ...) \
__global__ void \
conv_patch_stack_reduce_##suffix( \
    const float *img, const float *kern, float *out, \
    int img_len, int img_wid, int kern_len, int kern_wid, \
    int nkern, int nstack, int img_stride_col, int img_stride_row, \
    int img_stride_stack, int img_stride_batch, \
    int kern_stride_col, int kern_stride_row, \
    int kern_stride_stack, int kern_stride_nkern) \
{ \
    conv_patch_stack_reduce<__VA_ARGS__>( \
        img, kern, out, img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
        img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
        kern_stride_col, kern_stride_row, \
        kern_stride_stack, kern_stride_nkern); \
}

/*INSTANTIATE_CONV_PATCH_STACK_REDUCE(0, false, false, false, false)*/
INSTANTIATE_CONV_PATCH_STACK_REDUCE(1, false, false, false, true)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(2, false, false, true, false)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(3, false, false, true, true)
/*INSTANTIATE_CONV_PATCH_STACK_REDUCE(4, false, true, false, false)*/
INSTANTIATE_CONV_PATCH_STACK_REDUCE(5, false, true, false, true)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(6, false, true, true, false)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(7, false, true, true, true)
/*INSTANTIATE_CONV_PATCH_STACK_REDUCE(8, true, false, false, false)*/
INSTANTIATE_CONV_PATCH_STACK_REDUCE(9, true, false, false, true)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(10, true, false, true, false)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(11, true, false, true, true)
/*INSTANTIATE_CONV_PATCH_STACK_REDUCE(12, true, true, false, false)*/
INSTANTIATE_CONV_PATCH_STACK_REDUCE(13, true, true, false, true)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(14, true, true, true, false)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(15, true, true, true, true)

#undef INSTANTIATE_CONV_PATCH_STACK_REDUCE
}
/** /**
* WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY * WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY
* we store kern_len row of the image and the full kernel in the shared memory * we store kern_len row of the image and the full kernel in the shared memory
...@@ -570,8 +682,8 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out, ...@@ -570,8 +682,8 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
* Diff with conv_patch: don't store the full image in the shared memory. * Diff with conv_patch: don't store the full image in the shared memory.
* I.E. work for bigger image then conv_patch<split=true,...>. * I.E. work for bigger image then conv_patch<split=true,...>.
*/ */
template<int KERN_WIDTH, bool c_contiguous> template<bool c_contiguous>
__global__ void __device__ inline void
conv_rows( const float* img, const float* kern, float* out, conv_rows( const float* img, const float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid, int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack, int nkern, int nstack,
...@@ -612,7 +724,7 @@ conv_rows( const float* img, const float* kern, float* out, ...@@ -612,7 +724,7 @@ conv_rows( const float* img, const float* kern, float* out,
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row)*img_wid+out_col]; const float* idx_in=&d_img[(row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
} }
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
...@@ -620,6 +732,31 @@ conv_rows( const float* img, const float* kern, float* out, ...@@ -620,6 +732,31 @@ conv_rows( const float* img, const float* kern, float* out,
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
} }
/* Force-instantiation wrappers for the conv_rows<> template kernel,
 * exported with unmangled C names for the host dispatch code.
 * Suffix -> template arg <c_contiguous>:
 *   conv_rows_0 = conv_rows<false>, conv_rows_1 = conv_rows<true>.
 */
extern "C" {

/* Renamed from __INSTANTIATE_CONV_ROWS: double-underscore identifiers
 * are reserved for the implementation in C++. */
#define INSTANTIATE_CONV_ROWS(suffix, ...) \
__global__ void \
conv_rows_##suffix(const float *img, const float *kern, float *out, \
                   int img_len, int img_wid, int kern_len, int kern_wid, \
                   int nkern, int nstack, \
                   int img_stride_col, int img_stride_row, \
                   int img_stride_stack, int img_stride_batch, \
                   int kern_stride_col, int kern_stride_row, \
                   int kern_stride_stack, int kern_stride_nkern) \
{ \
    conv_rows<__VA_ARGS__>( \
        img, kern, out, img_len, img_wid, kern_len, kern_wid, \
        nkern, nstack, img_stride_col, img_stride_row, \
        img_stride_stack, img_stride_batch, \
        kern_stride_col, kern_stride_row, \
        kern_stride_stack, kern_stride_nkern); \
}

INSTANTIATE_CONV_ROWS(0, false)
INSTANTIATE_CONV_ROWS(1, true)

#undef INSTANTIATE_CONV_ROWS
}
}
/** /**
* WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY * WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY
* as conv_rows, but implement the stack. Separate as this use more register. * as conv_rows, but implement the stack. Separate as this use more register.
...@@ -631,8 +768,8 @@ conv_rows( const float* img, const float* kern, float* out, ...@@ -631,8 +768,8 @@ conv_rows( const float* img, const float* kern, float* out,
* Diff with conv_patch: don't store the full image in the shared memory. * Diff with conv_patch: don't store the full image in the shared memory.
* I.E. work for bigger image then conv_patch<split=true,...>. * I.E. work for bigger image then conv_patch<split=true,...>.
*/ */
template<int KERN_WIDTH, bool c_contiguous> template<bool c_contiguous>
__global__ void __device__ inline void
conv_rows_stack( const float* img, const float* kern, float* out, conv_rows_stack( const float* img, const float* kern, float* out,
const int img_len, const int img_wid, const int kern_len, const int kern_wid, const int img_len, const int img_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack, const int nkern, const int nstack,
...@@ -708,7 +845,7 @@ conv_rows_stack( const float* img, const float* kern, float* out, ...@@ -708,7 +845,7 @@ conv_rows_stack( const float* img, const float* kern, float* out,
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+shared_row)*img_wid+out_col]; const float* idx_in=&d_img[(row+shared_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
} }
__syncthreads();//to be sure all thread have finished before we modif the shared memory. __syncthreads();//to be sure all thread have finished before we modif the shared memory.
} }
...@@ -718,6 +855,33 @@ conv_rows_stack( const float* img, const float* kern, float* out, ...@@ -718,6 +855,33 @@ conv_rows_stack( const float* img, const float* kern, float* out,
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
} }
/* Force-instantiation wrappers for the conv_rows_stack<> template kernel,
 * exported with unmangled C names for the host dispatch code.
 * Suffix -> template arg <c_contiguous>:
 *   conv_rows_stack_0 = conv_rows_stack<false>,
 *   conv_rows_stack_1 = conv_rows_stack<true>.
 */
extern "C" {

/* Renamed from __INSTANTIATE_CONV_ROWS_STACK: double-underscore
 * identifiers are reserved for the implementation in C++. */
#define INSTANTIATE_CONV_ROWS_STACK(suffix, ...) \
__global__ void \
conv_rows_stack_##suffix( \
    const float *img, const float *kern, float *out, \
    const int img_len, const int img_wid, \
    const int kern_len, const int kern_wid, \
    const int nkern, const int nstack, \
    const int img_stride_col, const int img_stride_row, \
    const int img_stride_stack, const int img_stride_batch, \
    const int kern_stride_col, const int kern_stride_row, \
    const int kern_stride_stack, const int kern_stride_nkern) \
{ \
    conv_rows_stack<__VA_ARGS__>( \
        img, kern, out, img_len, img_wid, kern_len, kern_wid, \
        nkern, nstack, img_stride_col, img_stride_row, \
        img_stride_stack, img_stride_batch, \
        kern_stride_col, kern_stride_row, \
        kern_stride_stack, kern_stride_nkern); \
}

INSTANTIATE_CONV_ROWS_STACK(0, false)
INSTANTIATE_CONV_ROWS_STACK(1, true)

#undef INSTANTIATE_CONV_ROWS_STACK
}
/** /**
* WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY * WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY
* as conv_rows_stack, but load only block_len of the image at a time and 1 or all kern row. * as conv_rows_stack, but load only block_len of the image at a time and 1 or all kern row.
...@@ -729,8 +893,8 @@ conv_rows_stack( const float* img, const float* kern, float* out, ...@@ -729,8 +893,8 @@ conv_rows_stack( const float* img, const float* kern, float* out,
* Diff with conv_patch: don't store the full image and kernel in the shared memory. * Diff with conv_patch: don't store the full image and kernel in the shared memory.
* I.E. work for bigger image then conv_patch<split=true,...>. * I.E. work for bigger image then conv_patch<split=true,...>.
*/ */
template<int KERN_WIDTH, bool c_contiguous, bool preload_full_kern> template<bool c_contiguous, bool preload_full_kern>
__global__ void __device__ inline void
conv_rows_stack2(const float* img, const float* kern, float* out, conv_rows_stack2(const float* img, const float* kern, float* out,
const int img_len, const int img_wid, const int kern_len, const int kern_wid, const int img_len, const int img_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack, const int nkern, const int nstack,
...@@ -804,7 +968,7 @@ conv_rows_stack2(const float* img, const float* kern, float* out, ...@@ -804,7 +968,7 @@ conv_rows_stack2(const float* img, const float* kern, float* out,
else idx_kern=d_kern; else idx_kern=d_kern;
const float* idx_in=&d_img[((shared_row+row)%nb_rows)*img_wid+out_col]; const float* idx_in=&d_img[((shared_row+row)%nb_rows)*img_wid+out_col];
float sum_ =0.0f; float sum_ =0.0f;
convolutionRowNoFlip<KERN_WIDTH>(sum_,idx_in,idx_kern,kern_wid); convolutionRowNoFlip(sum_,idx_in,idx_kern,kern_wid);
sum+=sum_;//We pass by an intermediate variable to have more precission. sum+=sum_;//We pass by an intermediate variable to have more precission.
} }
} }
...@@ -816,6 +980,35 @@ conv_rows_stack2(const float* img, const float* kern, float* out, ...@@ -816,6 +980,35 @@ conv_rows_stack2(const float* img, const float* kern, float* out,
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
} }
/* Force-instantiation wrappers for the conv_rows_stack2<> template kernel,
 * exported with unmangled C names for the host dispatch code.
 * Suffix encoding: the 2 boolean template arguments
 *   <c_contiguous, preload_full_kern>
 * read as a 2-bit number, c_contiguous being the high bit (0..3).
 */
extern "C" {

/* Renamed from __INSTANTIATE_CONV_ROWS_STACK2: double-underscore
 * identifiers are reserved for the implementation in C++. */
#define INSTANTIATE_CONV_ROWS_STACK2(suffix, ...) \
__global__ void \
conv_rows_stack2_##suffix( \
    const float *img, const float *kern, float *out, \
    const int img_len, const int img_wid, \
    const int kern_len, const int kern_wid, \
    const int nkern, const int nstack, \
    const int img_stride_col, const int img_stride_row, \
    const int img_stride_stack, const int img_stride_batch, \
    const int kern_stride_col, const int kern_stride_row, \
    const int kern_stride_stack, const int kern_stride_nkern) \
{ \
    conv_rows_stack2<__VA_ARGS__>( \
        img, kern, out, img_len, img_wid, \
        kern_len, kern_wid, nkern, nstack, \
        img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
        kern_stride_col, kern_stride_row, \
        kern_stride_stack, kern_stride_nkern); \
}

INSTANTIATE_CONV_ROWS_STACK2(0, false, false)
INSTANTIATE_CONV_ROWS_STACK2(1, false, true)
INSTANTIATE_CONV_ROWS_STACK2(2, true, false)
INSTANTIATE_CONV_ROWS_STACK2(3, true, true)

#undef INSTANTIATE_CONV_ROWS_STACK2
}
/** /**
* Implementation of 'valid' mode convolution that uses one block per output pixel, and uses a sum-reduce within each block to compute the * Implementation of 'valid' mode convolution that uses one block per output pixel, and uses a sum-reduce within each block to compute the
* kernel-image inner-product in parallel. * kernel-image inner-product in parallel.
...@@ -826,7 +1019,7 @@ conv_rows_stack2(const float* img, const float* kern, float* out, ...@@ -826,7 +1019,7 @@ conv_rows_stack2(const float* img, const float* kern, float* out,
* TODO: explain parameters, preconditions * TODO: explain parameters, preconditions
*/ */
template<bool stack_loop> template<bool stack_loop>
__global__ void __device__ inline void
conv_valid_row_reduce(int nB, int nK, int stacklen, conv_valid_row_reduce(int nB, int nK, int stacklen,
int img_len, int img_wid, int img_len, int img_wid,
int kern_len, int kern_wid, int kern_len, int kern_wid,
...@@ -911,6 +1104,32 @@ conv_valid_row_reduce(int nB, int nK, int stacklen, ...@@ -911,6 +1104,32 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
} }
} }
/* Force-instantiation wrappers for the conv_valid_row_reduce<> template
 * kernel, exported with unmangled C names for the host dispatch code.
 * Suffix -> template arg <stack_loop>:
 *   conv_valid_row_reduce_0 = conv_valid_row_reduce<false>,
 *   conv_valid_row_reduce_1 = conv_valid_row_reduce<true>.
 */
extern "C" {

/* Renamed from __INSTANTIATE_CONV_VALID_ROW_REDUCE: double-underscore
 * identifiers are reserved for the implementation in C++. */
#define INSTANTIATE_CONV_VALID_ROW_REDUCE(suffix, ...) \
__global__ void \
conv_valid_row_reduce_##suffix( \
    int nB, int nK, int stacklen, int img_len, int img_wid, \
    int kern_len, int kern_wid, int out_len, int out_wid, \
    const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C, \
    const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C, \
    float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C, \
    int subsample_rows, int subsample_cols, \
    const int initial_reduce_boundary) \
{ \
    conv_valid_row_reduce<__VA_ARGS__>( \
        nB, nK, stacklen, img_len, img_wid, \
        kern_len, kern_wid, out_len, out_wid, \
        img, img_str_B, img_str_S, img_str_R, img_str_C, \
        kern, kern_str_K, kern_str_S, kern_str_R, kern_str_C, \
        out, out_str_B, out_str_K, out_str_R, out_str_C, \
        subsample_rows, subsample_cols, initial_reduce_boundary); \
}

INSTANTIATE_CONV_VALID_ROW_REDUCE(0, false)
INSTANTIATE_CONV_VALID_ROW_REDUCE(1, true)

#undef INSTANTIATE_CONV_VALID_ROW_REDUCE
}
/** /**
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论