提交 0d5cffbe authored 作者: Sean Lee's avatar Sean Lee

Force instantiate kernel templates

上级 89f584bc
......@@ -203,11 +203,8 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
int, int, int, int,
int, int);
#define CONV_PATCH_SPECIAL(kern_wid) \
if(threads.y==out_len) f=conv_patch<true,kern_wid,false>;\
else f=conv_patch<true,kern_wid,true>;
CONV_PATCH_SPECIAL(THEANO_KERN_WID);
if(threads.y==out_len) f=conv_patch_2;
else f=conv_patch_3;
f<<< grid, threads, shared_size>>>
(cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out),
......@@ -267,41 +264,39 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
int, int, int, int,
int, int);
#define CONV_PATCH_STACK_SPECIAL(kern_wid) \
if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true,true>;} \
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true,true>;} \
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true,true>;}\
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true,true>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false,true>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false,true>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false,true>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false,true>;}\
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false,true>;} \
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false,true>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false,true>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false,true>;} \
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true,false>;} \
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true,false>;} \
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true,false>;}\
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true,false>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false,false>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false,false>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false,false>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false,false>;}\
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false,false>;} \
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false,false>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false,false>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false,false>;}
CONV_PATCH_STACK_SPECIAL(THEANO_KERN_WID);
if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_64;}
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_65;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_66;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_67;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_68;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_69;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_70;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_71;}
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_72;}
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_73;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_74;}
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_75;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_76;}
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_77;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_78;}
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_79;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_80;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_81;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_82;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_83;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_84;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_85;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack_86;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack_87;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_88;}
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_89;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_90;}
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_91;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_92;}
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_93;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack_94;}
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack_95;}
f<<< grid, threads, shared_size>>>
(cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out),
img_len, img_wid, kern_len, kern_wid,
......@@ -380,11 +375,9 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
int, int, int, int,
int, int);
#define CONV_ROWS_SPECIAL(kern_wid) \
if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows<kern_wid, false>;\
else f = conv_rows<kern_wid, true>;\
if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_0;
else f = conv_rows_1;
CONV_ROWS_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size >>>
(cuda_get_ptr(img), cuda_get_ptr(kern), cuda_get_ptr(out),
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
......@@ -450,10 +443,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
if(!img_contiguous_2d || !kern_contiguous_2d) {
//fprintf(stderr, "using false version\n");
f = conv_rows_stack<THEANO_KERN_WID, false>;
f = conv_rows_stack_0;
} else {
//fprintf(stderr, "using true version\n");
f = conv_rows_stack<THEANO_KERN_WID, true>;
f = conv_rows_stack_1;
}
f<<< grid, threads, shared_size >>>
......@@ -535,13 +528,10 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
int, int, int, int,
int, int);
#define CONV_ROWS_STACK2_SPECIAL(kern_wid) \
if((!img_contiguous_2d || !kern_contiguous_2d)&&version==9) f = conv_rows_stack2<kern_wid, false,true>;\
else if(version==9) f = conv_rows_stack2<kern_wid, true,true>;\
else if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_stack2<kern_wid, false, false>;\
else f = conv_rows_stack2<kern_wid, true, false>;
CONV_ROWS_STACK2_SPECIAL(THEANO_KERN_WID);
if((!img_contiguous_2d || !kern_contiguous_2d)&&version==9) f = conv_rows_stack2_1;
else if(version==9) f = conv_rows_stack2_3;
else if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_stack2_0;
else f = conv_rows_stack2_2;
f<<< grid, threads, shared_size >>>
(cuda_get_ptr(img),
......@@ -663,24 +653,23 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//printf("kern_flipped=%d, ccontig=%d, split=%d, full_kern=%d\n",kern_flipped,ccontig,split,full_kern);
//We will always be split when we don't load the full kernel
#define CONV_PATCH_STACK_REDUCE_SPECIAL(kern_wid) \
if (kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, true>;\
else if(kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, true>;\
else if(kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, true>;\
else if(kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, true>;\
else if(!kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, true>;\
else if(!kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, true>;\
else if(!kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, true>;\
else if(!kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, true>;\
/*else if(kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, false>;*/\
/*else if(kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, false>;*/\
else if(kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, false>;\
else if(kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, false>;\
/*else if(!kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, false>;*/\
/*else if(!kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, false>;*/\
else if(!kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, false>;\
else if(!kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, false>;
CONV_PATCH_STACK_REDUCE_SPECIAL(THEANO_KERN_WID);
/* if(!kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce_0;*/
/*else*/ if(!kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce_1;
else if(!kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce_2;
else if(!kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce_3;
/*else if(!kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce_4;*/
else if(!kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce_5;
else if(!kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce_6;
else if(!kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce_7;
/*else if(kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce_8;*/
else if(kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce_9;
else if(kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce_10;
else if(kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce_11;
/*else if(kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce_12;*/
else if(kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce_13;
else if(kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce_14;
else if(kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce_15;
f<<< grid, threads, shared_size>>>(cuda_get_ptr(img), kern_data_unflipped, cuda_get_ptr(out),
img_len, img_wid, kern_len, kern_wid,
......@@ -770,9 +759,9 @@ PyGpuArray_conv_valid(const PyGpuArrayObject *img,
//std::cerr << "kerns " << nstack << " " << kern_len << "\n";
//std::cerr << "n_reduce_buf/sizeof(float) " << n_reduce_buf / sizeof(float) << "\n";
if(block_nstack==nstack)
f=conv_valid_row_reduce<false>;
f=conv_valid_row_reduce_0;
else
f=conv_valid_row_reduce<true>;
f=conv_valid_row_reduce_1;
f<<<n_blocks, n_threads, n_reduce_buf>>>(
nbatch, nkern, PyGpuArray_DIMS(img)[1],
img_len, img_wid,
......@@ -1105,23 +1094,20 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
int, int, int, int,
int, int);
#define CONV_FULL_PATCH_STACK_PADDED_SPECIAL(kern_wid) \
if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,false,false>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,true,false>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,false,true>;\
else if(version==3 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,false,false,false>;\
else if(version==4 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,true,false>;\
else if(version==5 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,false,true>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3) f=conv_full_patch_stack_padded<false,kern_wid,true,false,false>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4) f=conv_full_patch_stack_padded<false,kern_wid,true,true,false>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5) f=conv_full_patch_stack_padded<false,kern_wid,true,false,true>;\
else if(version==3) f=conv_full_patch_stack_padded<false,kern_wid,false,false,false>;\
else if(version==4) f=conv_full_patch_stack_padded<false,kern_wid,false,true,false>;\
else if(version==5) f=conv_full_patch_stack_padded<false,kern_wid,false,false,true>;\
if(version==3) f=conv_full_patch_stack_padded_0;
else if(version==5) f=conv_full_patch_stack_padded_1;
else if(version==4) f=conv_full_patch_stack_padded_2;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3) f=conv_full_patch_stack_padded_4;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5) f=conv_full_patch_stack_padded_5;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4) f=conv_full_patch_stack_padded_6;
else if(version==3 && kern_flipped) f=conv_full_patch_stack_padded_8;
else if(version==5 && kern_flipped)f=conv_full_patch_stack_padded_9;
else if(version==4 && kern_flipped)f=conv_full_patch_stack_padded_10;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3 && kern_flipped) f=conv_full_patch_stack_padded_12;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5 && kern_flipped) f=conv_full_patch_stack_padded_13;
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4 && kern_flipped) f=conv_full_patch_stack_padded_14;
else assert(false);
CONV_FULL_PATCH_STACK_PADDED_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size>>>
(cuda_get_ptr(img), kern_data_unflipped, cuda_get_ptr(out),
img_len, img_wid, kern_len, kern_wid, nkern, nstack,
......@@ -1225,9 +1211,7 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
//typeof(conv_full_load_everything<0>) f = ;
void (*f)(const float*, const float*, float*,
int, int, int, int, int, int,
int, int, int, int, int, int, int, int) = conv_full_load_everything<0>;
f = conv_full_load_everything<THEANO_KERN_WID>;
int, int, int, int, int, int, int, int) = conv_full_load_everything;
f<<< grid, threads, shared_size>>>
(cuda_get_ptr(img),
......@@ -1284,10 +1268,10 @@ PyGpuArray_conv_full(const PyGpuArrayObject *img, const PyGpuArrayObject * kern,
int, int, int, int,
int, int, int, int);
if(img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack<true,true>;\
else if(img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack<true,false>;\
else if(!img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack<false,true>;\
else if(!img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack<false,false>;
if(!img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack_0;
else if(!img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack_1;
else if(img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack_2;
else if(img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack_3;
f<<< grid, threads, shared_size>>>(
cuda_get_ptr(img),
......
......@@ -122,7 +122,7 @@ conv_full_patch( const float* img, const float* kern, float* out,
//template c_contiguous: if true, the img and kern have are column and row contiguous else we use the stride value from the param. The image need to be c_contiguous in the nbatch and nstack dimensions.
template<bool img_c_contiguous_2d, bool kern_c_contiguous_2d>
__global__ void
__device__ inline void
conv_full_patch_stack( const float* img, const float* kern, float* out,
int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack,
......@@ -182,6 +182,31 @@ conv_full_patch_stack( const float* img, const float* kern, float* out,
out_row*out_wid+out_col] = sum;
}
extern "C" {
// Force-instantiate conv_full_patch_stack<img_c_contiguous_2d,
// kern_c_contiguous_2d> as __global__ kernels with stable, unmangled
// C-linkage names (conv_full_patch_stack_<suffix>) so the host dispatch
// code can select a specialization at runtime by name.
//
// Suffix encoding: suffix = img_c_contiguous_2d*2 + kern_c_contiguous_2d.
//
// NOTE: the macro was renamed from __INSTANTIATE_CONV_FULL_PATCH_STACK:
// identifiers containing a double underscore are reserved for the
// implementation in C++ ([lex.name]). The macro is #undef'd below, so the
// rename is local to this block; generated kernel names are unchanged.
#define INSTANTIATE_CONV_FULL_PATCH_STACK(suffix, ...) \
__global__ void \
conv_full_patch_stack_##suffix( \
        const float *img, const float *kern, float *out, \
        int img_len, int img_wid, \
        int kern_len, int kern_wid, int nkern, int nstack, \
        int img_stride_col, int img_stride_row, \
        int kern_stride_col, int kern_stride_row, \
        int kern_stride_stack, int kern_stride_nkern) \
{ \
    conv_full_patch_stack<__VA_ARGS__>( \
        img, kern, out, img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
        img_stride_col, img_stride_row, kern_stride_col, kern_stride_row, \
        kern_stride_stack, kern_stride_nkern); \
}

//                                  suffix  img_contig  kern_contig
INSTANTIATE_CONV_FULL_PATCH_STACK(0,        false,      false)
INSTANTIATE_CONV_FULL_PATCH_STACK(1,        false,      true)
INSTANTIATE_CONV_FULL_PATCH_STACK(2,        true,       false)
INSTANTIATE_CONV_FULL_PATCH_STACK(3,        true,       true)

#undef INSTANTIATE_CONV_FULL_PATCH_STACK
}
/**
* As conv_patch_stack, but used for the full convolution by padding the image in shared memory.
* I keep it separated from conv_patch as we take 19-20 register which is more than the 10/16 max for each thread and thus this could lower the occupency.
......@@ -200,8 +225,8 @@ conv_full_patch_stack( const float* img, const float* kern, float* out,
* template low_mem: if true, as split but with use less dynamic shared memory but use more registers.
* if you set split and low_mem to true, we will use the low_mem version!
*/
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool low_mem >
__global__ void
template<bool flipped_kern, bool c_contiguous, bool split, bool low_mem >
__device__ inline void
conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const int img_len, const int img_wid,
const int kern_len, const int kern_wid,
......@@ -257,7 +282,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
}
}
out[batch_id*out_wid*out_len*nkern+//the good batch
......@@ -292,7 +317,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
}
if(out_row<out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch
......@@ -340,7 +365,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row-out_row_iter*nb_rows)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
}
}
if(out_row<out_len)
......@@ -351,6 +376,42 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
}
}
extern "C" {
// Force-instantiate conv_full_patch_stack_padded<flipped_kern,
// c_contiguous, split, low_mem> as __global__ kernels with stable,
// unmangled C-linkage names (conv_full_patch_stack_padded_<suffix>) so the
// host dispatch code can select a specialization at runtime by name.
//
// Suffix encoding (bit field):
//   suffix = flipped_kern*8 + c_contiguous*4 + split*2 + low_mem.
// Combinations with both split and low_mem set (suffixes 3, 7, 11, 15) are
// intentionally not instantiated: per the kernel's documentation, when both
// are requested the low_mem variant is used instead.
//
// NOTE: the macro was renamed from __INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED:
// identifiers containing a double underscore are reserved for the
// implementation in C++ ([lex.name]). The macro is #undef'd below, so the
// rename is local to this block; generated kernel names are unchanged.
#define INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(suffix, ...) \
__global__ void \
conv_full_patch_stack_padded_##suffix( \
        const float *img, const float *kern, float *out, \
        const int img_len, const int img_wid, \
        const int kern_len, const int kern_wid, \
        const int nkern, const int nstack, \
        const int img_stride_col, const int img_stride_row, \
        const int img_stride_stack, const int img_stride_batch, \
        const int kern_stride_col, const int kern_stride_row, \
        const int kern_stride_stack, const int kern_stride_nkern) \
{ \
    conv_full_patch_stack_padded<__VA_ARGS__>( \
        img, kern, out, img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
        img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
        kern_stride_col, kern_stride_row, \
        kern_stride_stack, kern_stride_nkern); \
}

//                                         suffix  flipped  contig  split  low_mem
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(0,        false,   false,  false, false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(1,        false,   false,  false, true)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(2,        false,   false,  true,  false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(4,        false,   true,   false, false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(5,        false,   true,   false, true)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(6,        false,   true,   true,  false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(8,        true,    false,  false, false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(9,        true,    false,  false, true)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(10,       true,    false,  true,  false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(12,       true,    true,   false, false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(13,       true,    true,   false, true)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(14,       true,    true,   true,  false)

#undef INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED
}
template <int i> __device__ float everything_dot(const float * x, const int sx, const float * y, const int sy)
{
return everything_dot<i/2>(x, sx, y, sy) + everything_dot<(i+1)/2>(x+sy*(i/2), sx, y+sy*(i/2), sy) ;
......@@ -364,7 +425,6 @@ template <> __device__ float everything_dot<1>(const float * x, const int sx, co
{
return x[0] * y[0];
}
template<int NSTACK>
__global__ void
conv_full_load_everything( const float* img, const float* kern, float* out,
int img_len, int img_wid,
......@@ -423,9 +483,9 @@ conv_full_load_everything( const float* img, const float* kern, float* out,
{
int icol = out_col - kern_wid+1+col;
if (icol < 0 || icol > img_wid) continue;
if (NSTACK > 0)
if (THEANO_KERN_WID > 0)
{
sum += everything_dot<NSTACK>(d_img + irow*img_wid + icol, img_len*img_wid,
sum += everything_dot<THEANO_KERN_WID>(d_img + irow*img_wid + icol, img_len*img_wid,
d_kern + row*kern_wid+col, kern_len*kern_wid);
}
else
......
......@@ -182,12 +182,11 @@ template<> __device__ float convolutionRowNoFlip<0>(const float *data,
return 0;
}
template<int KERN_WIDTH>
__device__ void convolutionRowNoFlip(float& sum,
const float *data,
const float *kern, const int kern_wid){
if(KERN_WIDTH>0)
sum+=convolutionRowNoFlip<KERN_WIDTH>(data,kern);
if(THEANO_KERN_WID>0)
sum+=convolutionRowNoFlip<THEANO_KERN_WID>(data,kern);
else
#pragma unroll 8
for (int col=0; col < kern_wid; col++) {//loop over col
......@@ -219,8 +218,8 @@ __device__ void store_or_accumulate(float& dst,const float value ){
* When true, allow for output image bigger then 512 pixel.
* Use more registers.
*/
template<bool flipped_kern, int KERN_WIDTH, bool split>
__global__ void
template<bool flipped_kern, bool split>
__device__ inline void
conv_patch( const float* img, const float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack)
......@@ -260,7 +259,7 @@ conv_patch( const float* img, const float* kern, float* out,
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
out[batch_id*out_wid*out_len*nkern+//the good batch
blockIdx.y*out_wid*out_len+//the output image
......@@ -271,7 +270,7 @@ conv_patch( const float* img, const float* kern, float* out,
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image
......@@ -280,6 +279,24 @@ conv_patch( const float* img, const float* kern, float* out,
}
}
extern "C" {
// Force-instantiate conv_patch<flipped_kern, split> as __global__ kernels
// with stable, unmangled C-linkage names (conv_patch_<suffix>) so the host
// dispatch code can select a specialization at runtime by name.
//
// Suffix encoding: suffix = flipped_kern*2 + split. Only the flipped-kernel
// variants (suffixes 2 and 3) are instantiated here, matching what the host
// dispatch code selects between.
//
// NOTE: the macro was renamed from __INSTANTIATE_CONV_PATCH: identifiers
// containing a double underscore are reserved for the implementation in
// C++ ([lex.name]). The macro is #undef'd below, so the rename is local to
// this block; generated kernel names are unchanged.
#define INSTANTIATE_CONV_PATCH(suffix, ...) \
__global__ void \
conv_patch_##suffix(const float *img, const float *kern, float *out, \
                    int img_len, int img_wid, int kern_len, int kern_wid, \
                    int nkern, int nstack) \
{ \
    conv_patch<__VA_ARGS__>(img, kern, out, img_len, img_wid, kern_len, \
                            kern_wid, nkern, nstack); \
}

//                     suffix  flipped  split
INSTANTIATE_CONV_PATCH(2,      true,    false)
INSTANTIATE_CONV_PATCH(3,      true,    true)

#undef INSTANTIATE_CONV_PATCH
}
/**
* As conv_patch, but implement the stack in the kernel.
* I keep it separated from conv_patch as we take more registers and this could lower the occupency.
......@@ -295,15 +312,14 @@ conv_patch( const float* img, const float* kern, float* out,
* dy: patch stride cols(1 for normal convolution)
* template flipped_kern: if true, we "flip" the kernel as in a real convolution, else we don't
* template accumulate: if true, we add the result, else we override the result
* template KERN_WIDTH: if 0, will work for any kern_wid, else it specialyse to this kern_wid as an optimization
* template img_c_contiguous_2d: if true, the img have are collon and row contiguous
* template kern_c_contiguous_2d: if true, the kernel have are collon and row contiguous
* template split: if true, each thread generate more than 1 output pixel, but use more registers.
* template preload_full_kern: if true, we load the full kernel in shared memory, else, we load 1 row at a time.
* template subsample: if false, remove some computation needed when dx or dy!=1.
*/
template<bool flipped_kern, bool accumulate, int KERN_WIDTH, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample>
__global__ void
template<bool flipped_kern, bool accumulate, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample>
__device__ inline void
conv_patch_stack( const float* img, const float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid,
int out_len, int out_wid,
......@@ -365,7 +381,7 @@ conv_patch_stack( const float* img, const float* kern, float* out,
else
idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
}
......@@ -425,7 +441,7 @@ conv_patch_stack( const float* img, const float* kern, float* out,
//as we store the result of only the good thread.
//This was with nvcc 3.0 on an GTX470 card.
if(out_row<out_len)
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
}
......@@ -440,6 +456,63 @@ conv_patch_stack( const float* img, const float* kern, float* out,
}
extern "C" {
/* Explicitly instantiated __global__ entry points for the conv_patch_stack
 * device template, given C linkage so each kernel can be looked up by name
 * at module-load time.
 * The numeric suffix is the bit encoding of the seven boolean template
 * arguments, most-significant bit first:
 *   (flipped_kern, accumulate, img_c_contiguous_2d, kern_c_contiguous_2d,
 *    split, preload_full_kern, subsample)
 * so suffixes 64-95 cover flipped_kern=true, accumulate=false with every
 * combination of the remaining five flags.
 * NOTE: the macro name avoids a leading double underscore — identifiers
 * starting with "__" are reserved for the C++ implementation. */
#define INSTANTIATE_CONV_PATCH_STACK(suffix, ...) \
__global__ void \
conv_patch_stack_##suffix(const float *img, const float *kern, float *out, \
                          int img_len, int img_wid, int kern_len, int kern_wid, \
                          int out_len, int out_wid, int nkern, int nstack, \
                          int img_stride_col, int img_stride_row, \
                          int img_stride_stack, int img_stride_batch, \
                          int kern_stride_col, int kern_stride_row, \
                          int kern_stride_stack, int kern_stride_nkern, \
                          int dx, int dy) \
{ \
    conv_patch_stack<__VA_ARGS__>( \
        img, kern, out, img_len, img_wid, kern_len, kern_wid, out_len, \
        out_wid, nkern, nstack, img_stride_col, img_stride_row, \
        img_stride_stack, img_stride_batch, \
        kern_stride_col, kern_stride_row, \
        kern_stride_stack, kern_stride_nkern, dx, dy); \
}
INSTANTIATE_CONV_PATCH_STACK(64, true, false, false, false, false, false, false)
INSTANTIATE_CONV_PATCH_STACK(65, true, false, false, false, false, false, true)
INSTANTIATE_CONV_PATCH_STACK(66, true, false, false, false, false, true, false)
INSTANTIATE_CONV_PATCH_STACK(67, true, false, false, false, false, true, true)
INSTANTIATE_CONV_PATCH_STACK(68, true, false, false, false, true, false, false)
INSTANTIATE_CONV_PATCH_STACK(69, true, false, false, false, true, false, true)
INSTANTIATE_CONV_PATCH_STACK(70, true, false, false, false, true, true, false)
INSTANTIATE_CONV_PATCH_STACK(71, true, false, false, false, true, true, true)
INSTANTIATE_CONV_PATCH_STACK(72, true, false, false, true, false, false, false)
INSTANTIATE_CONV_PATCH_STACK(73, true, false, false, true, false, false, true)
INSTANTIATE_CONV_PATCH_STACK(74, true, false, false, true, false, true, false)
INSTANTIATE_CONV_PATCH_STACK(75, true, false, false, true, false, true, true)
INSTANTIATE_CONV_PATCH_STACK(76, true, false, false, true, true, false, false)
INSTANTIATE_CONV_PATCH_STACK(77, true, false, false, true, true, false, true)
INSTANTIATE_CONV_PATCH_STACK(78, true, false, false, true, true, true, false)
INSTANTIATE_CONV_PATCH_STACK(79, true, false, false, true, true, true, true)
INSTANTIATE_CONV_PATCH_STACK(80, true, false, true, false, false, false, false)
INSTANTIATE_CONV_PATCH_STACK(81, true, false, true, false, false, false, true)
INSTANTIATE_CONV_PATCH_STACK(82, true, false, true, false, false, true, false)
INSTANTIATE_CONV_PATCH_STACK(83, true, false, true, false, false, true, true)
INSTANTIATE_CONV_PATCH_STACK(84, true, false, true, false, true, false, false)
INSTANTIATE_CONV_PATCH_STACK(85, true, false, true, false, true, false, true)
INSTANTIATE_CONV_PATCH_STACK(86, true, false, true, false, true, true, false)
INSTANTIATE_CONV_PATCH_STACK(87, true, false, true, false, true, true, true)
INSTANTIATE_CONV_PATCH_STACK(88, true, false, true, true, false, false, false)
INSTANTIATE_CONV_PATCH_STACK(89, true, false, true, true, false, false, true)
INSTANTIATE_CONV_PATCH_STACK(90, true, false, true, true, false, true, false)
INSTANTIATE_CONV_PATCH_STACK(91, true, false, true, true, false, true, true)
INSTANTIATE_CONV_PATCH_STACK(92, true, false, true, true, true, false, false)
INSTANTIATE_CONV_PATCH_STACK(93, true, false, true, true, true, false, true)
INSTANTIATE_CONV_PATCH_STACK(94, true, false, true, true, true, true, false)
INSTANTIATE_CONV_PATCH_STACK(95, true, false, true, true, true, true, true)
#undef INSTANTIATE_CONV_PATCH_STACK
}
/**
* As conv_patch_stack, but kern_len thread for each output pixel
* I keep it separated as use more register.
......@@ -454,8 +527,8 @@ conv_patch_stack( const float* img, const float* kern, float* out,
* template c_contiguous: if true, the image is column- and row-contiguous
* template preload_full_kern: work only when split is true. We don't load the full kernel at once, but we load ceil_intdiv(kern_len/nb_split) kernel row at a time
*/
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool preload_full_kern>
__global__ void
template<bool flipped_kern, bool c_contiguous, bool split, bool preload_full_kern>
__device__ inline void
conv_patch_stack_reduce( const float* img, const float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack, int img_stride_col,int img_stride_row,
......@@ -521,7 +594,7 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
const float* idx_in=&d_img[(first_row+tz+out_row)*img_wid+out_col];
float sum2 = 0;
if(tz<len3)
convolutionRowNoFlip<KERN_WIDTH>(sum2,idx_in,idx_kern,kern_wid);
convolutionRowNoFlip(sum2,idx_in,idx_kern,kern_wid);
sum+=sum2;
}
}else if(split){
......@@ -531,7 +604,7 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
for(int row=tz;row<kern_len;row+=blockDim.z){
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
}else{
int row = tz;//The row of the kernel.
......@@ -540,7 +613,7 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
}
......@@ -559,6 +632,45 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
}
}
extern "C" {
/* Explicitly instantiated __global__ entry points for the
 * conv_patch_stack_reduce device template, with C linkage for lookup by
 * name.  The numeric suffix is the bit encoding of the four boolean
 * template arguments, most-significant bit first:
 *   (flipped_kern, c_contiguous, split, preload_full_kern)
 * The combinations with split=false and preload_full_kern=false are left
 * uninstantiated (commented out below).
 * NOTE: the macro name avoids a leading double underscore (reserved for
 * the implementation), and the stray trailing semicolons after the
 * macro-generated function definitions were dropped — empty declarations
 * at namespace scope are nonstandard before C++11 and the sibling
 * instantiation blocks do not use them. */
#define INSTANTIATE_CONV_PATCH_STACK_REDUCE(suffix, ...) \
__global__ void \
conv_patch_stack_reduce_##suffix( \
    const float *img, const float *kern, float *out, \
    int img_len, int img_wid, int kern_len, int kern_wid, \
    int nkern, int nstack, int img_stride_col, int img_stride_row, \
    int img_stride_stack, int img_stride_batch, \
    int kern_stride_col, int kern_stride_row, \
    int kern_stride_stack, int kern_stride_nkern) \
{ \
    conv_patch_stack_reduce<__VA_ARGS__>( \
        img, kern, out, img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
        img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
        kern_stride_col, kern_stride_row, \
        kern_stride_stack, kern_stride_nkern); \
}
/*INSTANTIATE_CONV_PATCH_STACK_REDUCE(0, false, false, false, false)*/
INSTANTIATE_CONV_PATCH_STACK_REDUCE(1, false, false, false, true)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(2, false, false, true, false)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(3, false, false, true, true)
/*INSTANTIATE_CONV_PATCH_STACK_REDUCE(4, false, true, false, false)*/
INSTANTIATE_CONV_PATCH_STACK_REDUCE(5, false, true, false, true)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(6, false, true, true, false)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(7, false, true, true, true)
/*INSTANTIATE_CONV_PATCH_STACK_REDUCE(8, true, false, false, false)*/
INSTANTIATE_CONV_PATCH_STACK_REDUCE(9, true, false, false, true)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(10, true, false, true, false)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(11, true, false, true, true)
/*INSTANTIATE_CONV_PATCH_STACK_REDUCE(12, true, true, false, false)*/
INSTANTIATE_CONV_PATCH_STACK_REDUCE(13, true, true, false, true)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(14, true, true, true, false)
INSTANTIATE_CONV_PATCH_STACK_REDUCE(15, true, true, true, true)
#undef INSTANTIATE_CONV_PATCH_STACK_REDUCE
}
/**
* WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY
* we store kern_len row of the image and the full kernel in the shared memory
......@@ -570,8 +682,8 @@ conv_patch_stack_reduce( const float* img, const float* kern, float* out,
* Diff with conv_patch: don't store the full image in the shared memory.
* I.E. work for bigger image then conv_patch<split=true,...>.
*/
template<int KERN_WIDTH, bool c_contiguous>
__global__ void
template<bool c_contiguous>
__device__ inline void
conv_rows( const float* img, const float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack,
......@@ -612,7 +724,7 @@ conv_rows( const float* img, const float* kern, float* out,
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
out[batch_id*out_wid*out_len*nkern+//the good batch
......@@ -620,6 +732,31 @@ conv_rows( const float* img, const float* kern, float* out,
out_row*out_wid+out_col] = sum;
}
extern "C" {
/* Explicitly instantiated __global__ entry points for the conv_rows device
 * template, with C linkage for lookup by name.  The suffix encodes the
 * single boolean template argument: 0 -> c_contiguous=false,
 * 1 -> c_contiguous=true.
 * NOTE: the macro name avoids a leading double underscore — such
 * identifiers are reserved for the C++ implementation. */
#define INSTANTIATE_CONV_ROWS(suffix, ...) \
__global__ void \
conv_rows_##suffix(const float *img, const float *kern, float *out, \
                   int img_len, int img_wid, int kern_len, int kern_wid, \
                   int nkern, int nstack, \
                   int img_stride_col, int img_stride_row, \
                   int img_stride_stack, int img_stride_batch, \
                   int kern_stride_col, int kern_stride_row, \
                   int kern_stride_stack, int kern_stride_nkern) \
{ \
    conv_rows<__VA_ARGS__>( \
        img, kern, out, img_len, img_wid, kern_len, kern_wid, \
        nkern, nstack, img_stride_col, img_stride_row, \
        img_stride_stack, img_stride_batch, \
        kern_stride_col, kern_stride_row, \
        kern_stride_stack, kern_stride_nkern); \
}
INSTANTIATE_CONV_ROWS(0, false)
INSTANTIATE_CONV_ROWS(1, true)
#undef INSTANTIATE_CONV_ROWS
}
/**
* WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY
* as conv_rows, but implement the stack. Separate as this use more register.
......@@ -631,8 +768,8 @@ conv_rows( const float* img, const float* kern, float* out,
* Diff with conv_patch: don't store the full image in the shared memory.
* I.E. work for bigger image then conv_patch<split=true,...>.
*/
template<int KERN_WIDTH, bool c_contiguous>
__global__ void
template<bool c_contiguous>
__device__ inline void
conv_rows_stack( const float* img, const float* kern, float* out,
const int img_len, const int img_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack,
......@@ -708,7 +845,7 @@ conv_rows_stack( const float* img, const float* kern, float* out,
for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+shared_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
convolutionRowNoFlip(sum,idx_in,idx_kern,kern_wid);
}
__syncthreads();//to be sure all thread have finished before we modif the shared memory.
}
......@@ -718,6 +855,33 @@ conv_rows_stack( const float* img, const float* kern, float* out,
out_row*out_wid+out_col] = sum;
}
extern "C" {
/* Explicitly instantiated __global__ entry points for the conv_rows_stack
 * device template, with C linkage for lookup by name.  The suffix encodes
 * the single boolean template argument: 0 -> c_contiguous=false,
 * 1 -> c_contiguous=true.
 * NOTE: the macro name avoids a leading double underscore — such
 * identifiers are reserved for the C++ implementation. */
#define INSTANTIATE_CONV_ROWS_STACK(suffix, ...) \
__global__ void \
conv_rows_stack_##suffix( \
    const float *img, const float *kern, float *out, \
    const int img_len, const int img_wid, \
    const int kern_len, const int kern_wid, \
    const int nkern, const int nstack, \
    const int img_stride_col, const int img_stride_row, \
    const int img_stride_stack, const int img_stride_batch, \
    const int kern_stride_col, const int kern_stride_row, \
    const int kern_stride_stack, const int kern_stride_nkern) \
{ \
    conv_rows_stack<__VA_ARGS__>( \
        img, kern, out, img_len, img_wid, kern_len, kern_wid, \
        nkern, nstack, img_stride_col, img_stride_row, \
        img_stride_stack, img_stride_batch, \
        kern_stride_col, kern_stride_row, \
        kern_stride_stack, kern_stride_nkern); \
}
INSTANTIATE_CONV_ROWS_STACK(0, false)
INSTANTIATE_CONV_ROWS_STACK(1, true)
#undef INSTANTIATE_CONV_ROWS_STACK
}
/**
* WORK FOR IMAGE THAT DON'T FIT IN SHARED MEMORY
* as conv_rows_stack, but load only block_len of the image at a time and 1 or all kern row.
......@@ -729,8 +893,8 @@ conv_rows_stack( const float* img, const float* kern, float* out,
* Diff with conv_patch: don't store the full image and kernel in the shared memory.
* I.E. work for bigger image then conv_patch<split=true,...>.
*/
template<int KERN_WIDTH, bool c_contiguous, bool preload_full_kern>
__global__ void
template<bool c_contiguous, bool preload_full_kern>
__device__ inline void
conv_rows_stack2(const float* img, const float* kern, float* out,
const int img_len, const int img_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack,
......@@ -804,7 +968,7 @@ conv_rows_stack2(const float* img, const float* kern, float* out,
else idx_kern=d_kern;
const float* idx_in=&d_img[((shared_row+row)%nb_rows)*img_wid+out_col];
float sum_ =0.0f;
convolutionRowNoFlip<KERN_WIDTH>(sum_,idx_in,idx_kern,kern_wid);
convolutionRowNoFlip(sum_,idx_in,idx_kern,kern_wid);
sum+=sum_;//We pass by an intermediate variable to have more precission.
}
}
......@@ -816,6 +980,35 @@ conv_rows_stack2(const float* img, const float* kern, float* out,
out_row*out_wid+out_col] = sum;
}
extern "C" {
/* Explicitly instantiated __global__ entry points for the conv_rows_stack2
 * device template, with C linkage for lookup by name.  The suffix is the
 * bit encoding of the two boolean template arguments, most-significant
 * bit first: (c_contiguous, preload_full_kern).
 * NOTE: the macro name avoids a leading double underscore — such
 * identifiers are reserved for the C++ implementation. */
#define INSTANTIATE_CONV_ROWS_STACK2(suffix, ...) \
__global__ void \
conv_rows_stack2_##suffix( \
    const float *img, const float *kern, float *out, \
    const int img_len, const int img_wid, \
    const int kern_len, const int kern_wid, \
    const int nkern, const int nstack, \
    const int img_stride_col, const int img_stride_row, \
    const int img_stride_stack, const int img_stride_batch, \
    const int kern_stride_col, const int kern_stride_row, \
    const int kern_stride_stack, const int kern_stride_nkern) \
{ \
    conv_rows_stack2<__VA_ARGS__>( \
        img, kern, out, img_len, img_wid, \
        kern_len, kern_wid, nkern, nstack, \
        img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
        kern_stride_col, kern_stride_row, \
        kern_stride_stack, kern_stride_nkern); \
}
INSTANTIATE_CONV_ROWS_STACK2(0, false, false)
INSTANTIATE_CONV_ROWS_STACK2(1, false, true)
INSTANTIATE_CONV_ROWS_STACK2(2, true, false)
INSTANTIATE_CONV_ROWS_STACK2(3, true, true)
#undef INSTANTIATE_CONV_ROWS_STACK2
}
/**
* Implementation of 'valid' mode convolution that uses one block per output pixel, and uses a sum-reduce within each block to compute the
* kernel-image inner-product in parallel.
......@@ -826,7 +1019,7 @@ conv_rows_stack2(const float* img, const float* kern, float* out,
* TODO: explain parameters, preconditions
*/
template<bool stack_loop>
__global__ void
__device__ inline void
conv_valid_row_reduce(int nB, int nK, int stacklen,
int img_len, int img_wid,
int kern_len, int kern_wid,
......@@ -911,6 +1104,32 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
}
}
extern "C" {
/* Explicitly instantiated __global__ entry points for the
 * conv_valid_row_reduce device template, with C linkage for lookup by
 * name.  The suffix encodes the single boolean template argument:
 * 0 -> stack_loop=false, 1 -> stack_loop=true.
 * NOTE: the macro name avoids a leading double underscore — such
 * identifiers are reserved for the C++ implementation. */
#define INSTANTIATE_CONV_VALID_ROW_REDUCE(suffix, ...) \
__global__ void \
conv_valid_row_reduce_##suffix( \
    int nB, int nK, int stacklen, int img_len, int img_wid, \
    int kern_len, int kern_wid, int out_len, int out_wid, \
    const float *img, int img_str_B, int img_str_S, int img_str_R, int img_str_C, \
    const float *kern, int kern_str_K, int kern_str_S, int kern_str_R, int kern_str_C, \
    float *out, int out_str_B, int out_str_K, int out_str_R, int out_str_C, \
    int subsample_rows, int subsample_cols, \
    const int initial_reduce_boundary) \
{ \
    conv_valid_row_reduce<__VA_ARGS__>( \
        nB, nK, stacklen, img_len, img_wid, \
        kern_len, kern_wid, out_len, out_wid, \
        img, img_str_B, img_str_S, img_str_R, img_str_C, \
        kern, kern_str_K, kern_str_S, kern_str_R, kern_str_C, \
        out, out_str_B, out_str_K, out_str_R, out_str_C, \
        subsample_rows, subsample_cols, initial_reduce_boundary); \
}
INSTANTIATE_CONV_VALID_ROW_REDUCE(0, false)
INSTANTIATE_CONV_VALID_ROW_REDUCE(1, true)
#undef INSTANTIATE_CONV_VALID_ROW_REDUCE
}
/**
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论