提交 7b68154f authored 作者: Frederic's avatar Frederic

change tab to space

上级 7bec4ff8
...@@ -9,8 +9,8 @@ PyObject * CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray * ...@@ -9,8 +9,8 @@ PyObject * CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray *
*/ */
int int
CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
CudaNdarray * out, int subsample_rows, int subsample_cols, CudaNdarray * out, int subsample_rows, int subsample_cols,
int version = -1, int verbose=0) int version = -1, int verbose=0)
{ {
int work_complete = 0; int work_complete = 0;
const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file. const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file.
...@@ -33,28 +33,28 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -33,28 +33,28 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (verbose>1) if (verbose>1)
{ {
fprintf(stderr, fprintf(stderr,
"INFO: Running conv_valid version=%d," "INFO: Running conv_valid version=%d,"
" MACRO kern_width=%d with inputs:\n", " MACRO kern_width=%d with inputs:\n",
version, THEANO_KERN_WID); version, THEANO_KERN_WID);
fprintf(stderr, fprintf(stderr,
"INFO: img dim: %i %i %i %i img stride: %i %i %i %i\n", "INFO: img dim: %i %i %i %i img stride: %i %i %i %i\n",
CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1], CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],
CudaNdarray_HOST_DIMS(img)[2],CudaNdarray_HOST_DIMS(img)[3], CudaNdarray_HOST_DIMS(img)[2],CudaNdarray_HOST_DIMS(img)[3],
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[0],
CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[3]); CudaNdarray_HOST_STRIDES(img)[3]);
fprintf(stderr, fprintf(stderr,
"INFO: kern dim: %i %i %i %i kern stride: %i %i %i %i\n", "INFO: kern dim: %i %i %i %i kern stride: %i %i %i %i\n",
CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1], CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],
CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3], CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[0],
CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[3]); CudaNdarray_HOST_STRIDES(kern)[3]);
fprintf(stderr, fprintf(stderr,
"INFO: subsample_rows=%d, subsample_cols=%d\n", "INFO: subsample_rows=%d, subsample_cols=%d\n",
subsample_rows, subsample_cols); subsample_rows, subsample_cols);
} }
//Check the output size is valid //Check the output size is valid
...@@ -99,8 +99,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -99,8 +99,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
const int out_size_byte = out_size*sizeof(float); const int out_size_byte = out_size*sizeof(float);
if (!((THEANO_KERN_WID == CudaNdarray_HOST_DIMS(kern)[3]) || (THEANO_KERN_WID==0))){ if (!((THEANO_KERN_WID == CudaNdarray_HOST_DIMS(kern)[3]) || (THEANO_KERN_WID==0))){
PyErr_Format(PyExc_ValueError, "ERROR: This GpuConv code was compiled for" PyErr_Format(PyExc_ValueError, "ERROR: This GpuConv code was compiled for"
" %d kernel columns, but the kernel we received had %d columns!", " %d kernel columns, but the kernel we received had %d columns!",
THEANO_KERN_WID, CudaNdarray_HOST_DIMS(kern)[3]); THEANO_KERN_WID, CudaNdarray_HOST_DIMS(kern)[3]);
return -1; return -1;
} }
...@@ -136,222 +136,222 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -136,222 +136,222 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
//img_size_byte+kern_size_byte>8*1024, we can enter in condition where //img_size_byte+kern_size_byte>8*1024, we can enter in condition where
//we will lower the occupency due to shared memory and/or registers. //we will lower the occupency due to shared memory and/or registers.
if ((version == -1) && if ((version == -1) &&
(out_size<64 || img_size_byte+kern_size_byte>8*1024) && (out_size<64 || img_size_byte+kern_size_byte>8*1024) &&
out_size<=256){ out_size<=256){
//condition for exec //condition for exec
if(!subsample && if(!subsample &&
out_contiguous && out_contiguous &&
out_size<512 &&//Maximum of 512 theads by block out_size<512 &&//Maximum of 512 theads by block
std::max(int(img_size_byte+2*kern_wid*sizeof(float)), out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce! std::max(int(img_size_byte+2*kern_wid*sizeof(float)), out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
!work_complete) !work_complete)
version = 7; //conv_patch_stack_reduce, switch to version 8/13 automatically if needed. version = 7; //conv_patch_stack_reduce, switch to version 8/13 automatically if needed.
} }
if (!subsample && c_contiguous && if (!subsample && c_contiguous &&
(version==0||version==2||version==-1) && (version==0||version==2||version==-1) &&
out_wid<512 &&//Maximum of 512 theads by block out_wid<512 &&//Maximum of 512 theads by block
nstack == 1 &&// don't implement the stack in the kernel. nstack == 1 &&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_patch !work_complete) //conv_patch
{ {
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.) int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if(version==2 && out_len>1)nb_split++;//to force the use of split=true when testing. if(version==2 && out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration. //we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>512) nb_split++; while (ceil_intdiv(out_len,nb_split)*out_wid>512) nb_split++;
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split)); dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
dim3 grid(nbatch, nkern); dim3 grid(nbatch, nkern);
int shared_size=(img_size + kern_size)*sizeof(float); int shared_size=(img_size + kern_size)*sizeof(float);
void (*f)(float*, float*, float*, void (*f)(float*, float*, float*,
int, int, int, int, int, int, int, int,
int, int); int, int);
#define CONV_PATCH_SPECIAL(kern_wid) \ #define CONV_PATCH_SPECIAL(kern_wid) \
if(threads.y==out_len) f=conv_patch<true,kern_wid,false>;\ if(threads.y==out_len) f=conv_patch<true,kern_wid,false>;\
else f=conv_patch<true,kern_wid,true>; else f=conv_patch<true,kern_wid,true>;
CONV_PATCH_SPECIAL(THEANO_KERN_WID); CONV_PATCH_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size>>> f<<< grid, threads, shared_size>>>
(img->devdata, kern->devdata, out->devdata, (img->devdata, kern->devdata, out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack); img_len, img_wid, kern_len, kern_wid, nkern, nstack);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: used 'conv_patch' version %s nb_split=%d\n", "INFO: used 'conv_patch' version %s nb_split=%d\n",
threads.y==out_len ? "no split": "split", nb_split); threads.y==out_len ? "no split": "split", nb_split);
work_complete = true; work_complete = true;
} }
else else
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i, nb_split=%i\n", " shared_size=%i, nb_threads=%i, nb_split=%i\n",
threads.x, threads.y, grid.x, grid.y, threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y, nb_split); shared_size, threads.x * threads.y, nb_split);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_patch' failed (%s)," "INFO: impl 'conv_patch' failed (%s),"
" trying next implementation\n", " trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
if (out_contiguous && if (out_contiguous &&
(version==1||version==3||version==11||version==12||version==-1) && (version==1||version==3||version==11||version==12||version==-1) &&
(version!=1 || out_size<512) &&//Maximum of 512 theads by block (version!=1 || out_size<512) &&//Maximum of 512 theads by block
out_wid<512 &&//Maximum of 512 theads by block out_wid<512 &&//Maximum of 512 theads by block
img_size_byte+kern_wid*sizeof(float)<shared_avail && //their is only 16k of shared memory img_size_byte+kern_wid*sizeof(float)<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_patch_stack !work_complete) //conv_patch_stack
{ {
//version 1 is without split and preload the full kernel //version 1 is without split and preload the full kernel
//version 3 is with split and preload the full kernel //version 3 is with split and preload the full kernel
//version 11 is without split and load only 1 kernel row at a time. //version 11 is without split and load only 1 kernel row at a time.
//version 12 is with split and load only 1 kernel row at a time. //version 12 is with split and load only 1 kernel row at a time.
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.) int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if((version==3||version==12) && out_len>1)nb_split++;//to force the use of split=true when testing. if((version==3||version==12) && out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration. //we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>512) nb_split++; while (ceil_intdiv(out_len,nb_split)*out_wid>512) nb_split++;
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split)); dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
bool preload_full_kernel = (img_size_byte + kern_size_byte) <shared_avail; bool preload_full_kernel = (img_size_byte + kern_size_byte) <shared_avail;
if(version==11 || version==12) preload_full_kernel=false; if(version==11 || version==12) preload_full_kernel=false;
dim3 grid(nbatch,nkern); dim3 grid(nbatch,nkern);
int shared_size=(img_size + (preload_full_kernel?kern_size:kern_wid))*sizeof(float); int shared_size=(img_size + (preload_full_kernel?kern_size:kern_wid))*sizeof(float);
void (*f)(float*, float*, float*, void (*f)(float*, float*, float*,
int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int,
int, int); int, int);
#define CONV_PATCH_STACK_SPECIAL(kern_wid) \ #define CONV_PATCH_STACK_SPECIAL(kern_wid) \
if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true,true>;} \ if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true,true>;} \
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true,true>;} \ else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true,true>;} \
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true,true>;}\ else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true,true>;}\
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true,true>;}\ else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true,true>;}\ else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true,true>;}\ else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true,true>;}\ else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true,true>;}\ else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true,true>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false,true>;}\ else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false,true>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false,true>;}\ else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false,true>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false,true>;}\ else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false,true>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false,true>;}\ else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false,true>;}\
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false,true>;} \ else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false,true>;} \
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false,true>;} \ else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false,true>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false,true>;} \ else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false,true>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false,true>;} \ else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false,true>;} \
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true,false>;} \ else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true,false>;} \
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true,false>;} \ else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true,false>;} \
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true,false>;}\ else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true,false>;}\
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true,false>;}\ else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true,false>;}\ else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true,false>;}\ else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true,false>;}\ else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true,false>;}\ else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true,false>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false,false>;}\ else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false,false>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false,false>;}\ else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false,false>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false,false>;}\ else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false,false>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false,false>;}\ else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false,false>;}\
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false,false>;} \ else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false,false>;} \
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false,false>;} \ else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false,false>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false,false>;} \ else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false,false>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false,false>;} else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false,false>;}
CONV_PATCH_STACK_SPECIAL(THEANO_KERN_WID); CONV_PATCH_STACK_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size>>> f<<< grid, threads, shared_size>>>
(img->devdata, kern->devdata, out->devdata, (img->devdata, kern->devdata, out->devdata,
img_len, img_wid, kern_len, kern_wid, img_len, img_wid, kern_len, kern_wid,
out_len, out_wid, nkern, nstack, out_len, out_wid, nkern, nstack,
img_stride_col, img_stride_row, img_stride_stack, img_stride_col, img_stride_row, img_stride_stack,
img_stride_batch, kern_stride_col, kern_stride_row, img_stride_batch, kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern, subsample_rows, subsample_cols); kern_stride_stack, kern_stride_nkern, subsample_rows, subsample_cols);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
if (verbose>1) if (verbose>1)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i," " shared_size=%i, nb_threads=%i,"
" kern_flipped=true, accumulate=false, kern_width=%i," " kern_flipped=true, accumulate=false, kern_width=%i,"
" img_c_contiguous_2d=%i," " img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i," " kern_c_contiguous_2d=%i, nb_split=%i,"
" preload_full_kernel=%i,", " preload_full_kernel=%i,",
" subsample_rows=%i, subsample_cols=%i\n", " subsample_rows=%i, subsample_cols=%i\n",
threads.x, threads.y, grid.x, grid.y, threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y, shared_size, threads.x * threads.y,
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d, THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split, preload_full_kernel, nb_split, preload_full_kernel,
subsample_rows, subsample_cols); subsample_rows, subsample_cols);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: used 'conv_patch_stack' version with nb_split=%i" "INFO: used 'conv_patch_stack' version with nb_split=%i"
" and preload_full_kernel=%i," " and preload_full_kernel=%i,"
" subsample_rows=%i, subsample_cols=%i\n", " subsample_rows=%i, subsample_cols=%i\n",
nb_split, preload_full_kernel, nb_split, preload_full_kernel,
subsample_rows, subsample_cols); subsample_rows, subsample_cols);
work_complete = true; work_complete = true;
} }
else else
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i," " shared_size=%i, nb_threads=%i,"
" kern_flipped=true, accumulate=false," " kern_flipped=true, accumulate=false,"
" kern_width=%i, img_c_contiguous_2d=%i," " kern_width=%i, img_c_contiguous_2d=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i," " kern_c_contiguous_2d=%i, nb_split=%i,"
" preload_full_kernel=%i," " preload_full_kernel=%i,"
" subsample_rows=%i, subsample_cols=%i\n", " subsample_rows=%i, subsample_cols=%i\n",
threads.x, threads.y, grid.x, grid.y, threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y, shared_size, threads.x * threads.y,
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d, THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split, preload_full_kernel, nb_split, preload_full_kernel,
subsample_rows, subsample_cols); subsample_rows, subsample_cols);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_patch_stack' failed (%s)," "INFO: impl 'conv_patch_stack' failed (%s),"
" trying next implementation\n", " trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
if (!subsample && out_contiguous && if (!subsample && out_contiguous &&
(version==4||version==-1) && (version==4||version==-1) &&
out_wid<512 &&//Maximum of 512 threads by block out_wid<512 &&//Maximum of 512 threads by block
nstack == 1 &&// don't implement the stack in the kernel. nstack == 1 &&// don't implement the stack in the kernel.
kern_len*img_wid*sizeof(float)+kern_size_byte<shared_avail &&//their is only 16k of shared memory kern_len*img_wid*sizeof(float)+kern_size_byte<shared_avail &&//their is only 16k of shared memory
!work_complete) //conv_rows !work_complete) //conv_rows
{ {
dim3 threads(out_wid); dim3 threads(out_wid);
dim3 grid(out_len, nbatch*nkern); dim3 grid(out_len, nbatch*nkern);
int shared_size=(kern_len*img_wid + kern_size)*sizeof(float); int shared_size=(kern_len*img_wid + kern_size)*sizeof(float);
void (*f)(float*, float*, float*, void (*f)(float*, float*, float*,
int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int,
int, int); int, int);
#define CONV_ROWS_SPECIAL(kern_wid) \ #define CONV_ROWS_SPECIAL(kern_wid) \
if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows<kern_wid, false>;\ if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows<kern_wid, false>;\
else f = conv_rows<kern_wid, true>;\ else f = conv_rows<kern_wid, true>;\
CONV_ROWS_SPECIAL(THEANO_KERN_WID); CONV_ROWS_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size >>> f<<< grid, threads, shared_size >>>
(img->devdata, kern->devdata, out->devdata, (img->devdata, kern->devdata, out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack, img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row, img_stride_col, img_stride_row,
img_stride_stack,img_stride_batch, img_stride_stack,img_stride_batch,
kern_stride_col, kern_stride_row, kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern); kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
...@@ -359,58 +359,58 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -359,58 +359,58 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
{ {
work_complete = true; work_complete = true;
if (verbose) if (verbose)
fprintf(stderr, "INFO: used 'conv_rows' version\n"); fprintf(stderr, "INFO: used 'conv_rows' version\n");
} }
else else
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n", " shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y); shared_size, threads.x * threads.y);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_rows' failed (%s)," "INFO: impl 'conv_rows' failed (%s),"
" trying next implementation\n", " trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
if (!subsample && out_contiguous && if (!subsample && out_contiguous &&
(version==5||version==-1) && (version==5||version==-1) &&
out_wid<512 &&//Maximum of 512 theads by block out_wid<512 &&//Maximum of 512 theads by block
img_wid*kern_len*sizeof(float)+kern_size_byte<shared_avail && //their is only 16k of shared memory img_wid*kern_len*sizeof(float)+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_rows_stack !work_complete) //conv_rows_stack
{ {
int nb_row=1; int nb_row=1;
int max_threads=512; int max_threads=512;
//TODO:if not c_contiguous, lower max_thread as we use 22 //TODO:if not c_contiguous, lower max_thread as we use 22
//registers by thread and we won't execute 2 block in one MP. //registers by thread and we won't execute 2 block in one MP.
for(int i=2;i<=out_len;i++){ for(int i=2;i<=out_len;i++){
if((i)*out_wid<max_threads && ((kern_len+i)*img_wid + kern_size)*sizeof(float)<shared_avail) if((i)*out_wid<max_threads && ((kern_len+i)*img_wid + kern_size)*sizeof(float)<shared_avail)
nb_row=i; nb_row=i;
} }
dim3 threads(out_wid,nb_row); dim3 threads(out_wid,nb_row);
dim3 grid(ceil_intdiv(out_len,nb_row), nbatch*nkern); dim3 grid(ceil_intdiv(out_len,nb_row), nbatch*nkern);
int shared_size=((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float); int shared_size=((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
void (*f)(float*, float*, float*, void (*f)(float*, float*, float*,
int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int,
int, int); int, int);
if (0) if (0)
fprintf(stderr, fprintf(stderr,
"IMG CONTIG %i KERN_CONTIG %i (%i %i %i) (%i %i %i)\n", "IMG CONTIG %i KERN_CONTIG %i (%i %i %i) (%i %i %i)\n",
img_contiguous_2d, kern_contiguous_2d, img_contiguous_2d, kern_contiguous_2d,
threads.x, threads.y, threads.z, threads.x, threads.y, threads.z,
grid.x, grid.y, grid.z); grid.x, grid.y, grid.z);
if(!img_contiguous_2d || !kern_contiguous_2d) { if(!img_contiguous_2d || !kern_contiguous_2d) {
//fprintf(stderr, "using false version\n"); //fprintf(stderr, "using false version\n");
f = conv_rows_stack<THEANO_KERN_WID, false>; f = conv_rows_stack<THEANO_KERN_WID, false>;
} else { } else {
...@@ -418,136 +418,136 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -418,136 +418,136 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
f = conv_rows_stack<THEANO_KERN_WID, true>; f = conv_rows_stack<THEANO_KERN_WID, true>;
} }
f<<< grid, threads, shared_size >>> f<<< grid, threads, shared_size >>>
(img->devdata, (img->devdata,
kern->devdata, kern->devdata,
out->devdata, out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack, img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row, img_stride_col, img_stride_row,
img_stride_stack,img_stride_batch, img_stride_stack,img_stride_batch,
kern_stride_col, kern_stride_row, kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern); kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose>1) if (verbose>1)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n", " shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y); shared_size, threads.x * threads.y);
if (verbose) if (verbose)
fprintf(stderr, "INFO: used 'conv_rows_stack' version\n"); fprintf(stderr, "INFO: used 'conv_rows_stack' version\n");
} }
else else
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n", " shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y); shared_size, threads.x * threads.y);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_rows_stack' failed (%s)," "INFO: impl 'conv_rows_stack' failed (%s),"
" trying next implementation\n", " trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
if (!subsample && out_contiguous && if (!subsample && out_contiguous &&
(version==9||version==10||version==-1) && (version==9||version==10||version==-1) &&
out_wid<512 &&//Maximum of 512 threads by block out_wid<512 &&//Maximum of 512 threads by block
(img_wid+kern_wid)*sizeof(float)<shared_avail && //their is only 16k of shared memory (img_wid+kern_wid)*sizeof(float)<shared_avail && //their is only 16k of shared memory
(version != 9 || (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail) && //version 9 use more memory (version != 9 || (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail) && //version 9 use more memory
!work_complete) //conv_rows_stack2 !work_complete) //conv_rows_stack2
{ {
// version 9:we preload the full kernel // version 9:we preload the full kernel
// version 10: load only a few row at a time. // version 10: load only a few row at a time.
int nb_row=1; int nb_row=1;
int max_threads=512; int max_threads=512;
int version_back = version; int version_back = version;
//TODO:if not c_contiguous, lower max_thread as we use 22 registers by thread and we won't execute 2 block in one MP. //TODO:if not c_contiguous, lower max_thread as we use 22 registers by thread and we won't execute 2 block in one MP.
if(version==-1 && (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail) if(version==-1 && (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail)
version = 9; version = 9;
else if(version==-1)version = 10; else if(version==-1)version = 10;
int k_size = kern_size; int k_size = kern_size;
if(version==10) if(version==10)
k_size=kern_wid; k_size=kern_wid;
for(int i=2;i<=out_len;i++){ for(int i=2;i<=out_len;i++){
if(i*out_wid<max_threads && (i*img_wid + k_size)*sizeof(float)<shared_avail) if(i*out_wid<max_threads && (i*img_wid + k_size)*sizeof(float)<shared_avail)
nb_row=i; nb_row=i;
} }
//to test the case when we don't have a thread by output pixel. //to test the case when we don't have a thread by output pixel.
if((version_back!=-1)&& nb_row>1) nb_row--; if((version_back!=-1)&& nb_row>1) nb_row--;
dim3 threads(out_wid,nb_row); dim3 threads(out_wid,nb_row);
dim3 grid(ceil_intdiv(out_len,nb_row), nbatch*nkern); dim3 grid(ceil_intdiv(out_len,nb_row), nbatch*nkern);
int shared_size=(threads.y*img_wid + k_size)*sizeof(float); int shared_size=(threads.y*img_wid + k_size)*sizeof(float);
void (*f)(float*, float*, float*, void (*f)(float*, float*, float*,
int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int,
int, int); int, int);
#define CONV_ROWS_STACK2_SPECIAL(kern_wid) \ #define CONV_ROWS_STACK2_SPECIAL(kern_wid) \
if((!img_contiguous_2d || !kern_contiguous_2d)&&version==9) f = conv_rows_stack2<kern_wid, false,true>;\ if((!img_contiguous_2d || !kern_contiguous_2d)&&version==9) f = conv_rows_stack2<kern_wid, false,true>;\
else if(version==9) f = conv_rows_stack2<kern_wid, true,true>;\ else if(version==9) f = conv_rows_stack2<kern_wid, true,true>;\
else if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_stack2<kern_wid, false, false>;\ else if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_stack2<kern_wid, false, false>;\
else f = conv_rows_stack2<kern_wid, true, false>; else f = conv_rows_stack2<kern_wid, true, false>;
CONV_ROWS_STACK2_SPECIAL(THEANO_KERN_WID); CONV_ROWS_STACK2_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size >>> f<<< grid, threads, shared_size >>>
(img->devdata, (img->devdata,
kern->devdata, kern->devdata,
out->devdata, out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack, img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row, img_stride_col, img_stride_row,
img_stride_stack,img_stride_batch, img_stride_stack,img_stride_batch,
kern_stride_col, kern_stride_row, kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern); kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose>1) if (verbose>1)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n", " shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y); shared_size, threads.x * threads.y);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: used 'conv_rows_stack2' version %s with" "INFO: used 'conv_rows_stack2' version %s with"
" %d row(s).\n", " %d row(s).\n",
(version==9?"'load full kernel'": (version==9?"'load full kernel'":
"'load 1 kern row at a time'"),nb_row); "'load 1 kern row at a time'"),nb_row);
} }
else else
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i version=%d\n", " shared_size=%i, nb_threads=%i version=%d\n",
threads.x, threads.y, grid.x, grid.y, threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y,(version==9?2:3)); shared_size, threads.x * threads.y,(version==9?2:3));
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_rows_stack2' failed (%s)," "INFO: impl 'conv_rows_stack2' failed (%s),"
" trying next implementation\n", " trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
...@@ -559,24 +559,24 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -559,24 +559,24 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
//version 8 is needed to test more easily this kernel template parameter. //version 8 is needed to test more easily this kernel template parameter.
//version 13 load only 1 kernel row at a time. //version 13 load only 1 kernel row at a time.
if (!subsample && if (!subsample &&
out_contiguous && out_contiguous &&
out_size<512 &&//Maximum of 512 theads by block out_size<512 &&//Maximum of 512 theads by block
(version==7||version==8||version==13||version==-1) && (version==7||version==8||version==13||version==-1) &&
(version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split. (version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split.
//version 13 need a minimal kernel length as big as the split. //version 13 need a minimal kernel length as big as the split.
(version!=13||kern_len>1) && (version!=13||kern_len>1) &&
!work_complete) //conv_patch_stack_reduce !work_complete) //conv_patch_stack_reduce
{ {
int nb_split=1; int nb_split=1;
int full_kern=true; int full_kern=true;
if(version==8||version==13) nb_split++;//force the split. if(version==8||version==13) nb_split++;//force the split.
if(version==13)full_kern=false; if(version==13)full_kern=false;
//check if we can fit the full kernel in the shared memory //check if we can fit the full kernel in the shared memory
if(sizeof(float)*std::max(img_size + kern_size, out_size*2) > shared_avail){ if(sizeof(float)*std::max(img_size + kern_size, out_size*2) > shared_avail){
full_kern = false; full_kern = false;
} }
//thread_z is going to be ceil_intdiv(kern_len, nb_split) //thread_z is going to be ceil_intdiv(kern_len, nb_split)
// we need enough splits so that // we need enough splits so that
...@@ -586,24 +586,24 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -586,24 +586,24 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
// constraint (a) // constraint (a)
// device 1.3 have a max of 64 thread in z // device 1.3 have a max of 64 thread in z
while(ceil_intdiv(kern_len,nb_split)>64) nb_split++; while(ceil_intdiv(kern_len,nb_split)>64) nb_split++;
// constraint (b) // constraint (b)
// (TODO: read the number of threads per block from the device) // (TODO: read the number of threads per block from the device)
while(out_size*ceil_intdiv(kern_len,nb_split)>512) nb_split++; while(out_size*ceil_intdiv(kern_len,nb_split)>512) nb_split++;
// tentative estimates (prior to contraint c) // tentative estimates (prior to contraint c)
int thread_z=ceil_intdiv(kern_len,nb_split); int thread_z=ceil_intdiv(kern_len,nb_split);
int shared_size = sizeof(float)*(full_kern int shared_size = sizeof(float)*(full_kern
? std::max(img_size + kern_size, out_size*thread_z) ? std::max(img_size + kern_size, out_size*thread_z)
: std::max(img_size + thread_z*kern_wid, out_size*thread_z)); : std::max(img_size + thread_z*kern_wid, out_size*thread_z));
// constraint (c) // constraint (c)
while ((shared_size >= shared_avail) && (nb_split <= kern_len)){ while ((shared_size >= shared_avail) && (nb_split <= kern_len)){
//if we can't fit the kernel in shared memory, we must split it more. //if we can't fit the kernel in shared memory, we must split it more.
nb_split++; nb_split++;
thread_z=ceil_intdiv(kern_len,nb_split); thread_z=ceil_intdiv(kern_len,nb_split);
shared_size = sizeof(float)*(full_kern shared_size = sizeof(float)*(full_kern
? std::max(img_size + kern_size, out_size*thread_z) ? std::max(img_size + kern_size, out_size*thread_z)
: std::max(img_size + thread_z*kern_wid, out_size*thread_z)); : std::max(img_size + thread_z*kern_wid, out_size*thread_z));
} }
...@@ -647,17 +647,17 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -647,17 +647,17 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
CONV_PATCH_STACK_REDUCE_SPECIAL(THEANO_KERN_WID); CONV_PATCH_STACK_REDUCE_SPECIAL(THEANO_KERN_WID);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: using 'conv_patch_stack_reduce' version" "INFO: using 'conv_patch_stack_reduce' version"
" kern_flipped=%i ccontig=%i nb_split=%d," " kern_flipped=%i ccontig=%i nb_split=%d,"
" preload_full_kern=%d\n", " preload_full_kern=%d\n",
kern_flipped, ccontig, nb_split, full_kern); kern_flipped, ccontig, nb_split, full_kern);
if (verbose>1) if (verbose>1)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i," "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i,"
" grid.y=%i, shared_size=%i, nb_threads=%i\n", " grid.y=%i, shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, threads.z, grid.x, grid.y, threads.x, threads.y, threads.z, grid.x, grid.y,
shared_size, threads.x * threads.y * threads.z); shared_size, threads.x * threads.y * threads.z);
f<<< grid, threads, shared_size>>>(img->devdata, kern_data_unflipped, out->devdata, f<<< grid, threads, shared_size>>>(img->devdata, kern_data_unflipped, out->devdata,
img_len, img_wid, kern_len, kern_wid, img_len, img_wid, kern_len, kern_wid,
nkern, nstack, nkern, nstack,
...@@ -673,35 +673,35 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -673,35 +673,35 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
else else
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i," "threads.x=%i, threads.y=%i, threads.z=%i,"
" grid.x=%i, grid.y=%i,shared_size=%i," " grid.x=%i, grid.y=%i,shared_size=%i,"
" nb_threads=%i\n", " nb_threads=%i\n",
threads.x, threads.y, threads.z, threads.x, threads.y, threads.z,
grid.x, grid.y, shared_size, grid.x, grid.y, shared_size,
threads.x * threads.y * threads.z); threads.x * threads.y * threads.z);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_patch_stack_reduce' failed (%s)," "INFO: impl 'conv_patch_stack_reduce' failed (%s),"
" trying next implementation\n", " trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} // else no good nb_splits was found } // else no good nb_splits was found
} }
if (1 && (version==6||version==-1) && if (1 && (version==6||version==-1) &&
kern_len<=320 && kern_len<=320 &&
!work_complete) //conv_valid_row_reduce !work_complete) //conv_valid_row_reduce
{ {
int outsize = CudaNdarray_SIZE(out); int outsize = CudaNdarray_SIZE(out);
int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS); int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS);
int block_nstack=nstack; int block_nstack=nstack;
//Max of 512 threads per blocks. //Max of 512 threads per blocks.
//On old hardware, we have a max of 356 threads as we have only //On old hardware, we have a max of 356 threads as we have only
//8k registers and the kernel use 23 register //8k registers and the kernel use 23 register
//TODO: check if we have 8k or 16k of register... //TODO: check if we have 8k or 16k of register...
while(block_nstack*kern_len>320)block_nstack--; while(block_nstack*kern_len>320)block_nstack--;
dim3 n_threads(block_nstack, kern_len, 1); dim3 n_threads(block_nstack, kern_len, 1);
int n_reduce_buf = block_nstack * kern_len * sizeof(float); int n_reduce_buf = block_nstack * kern_len * sizeof(float);
...@@ -722,21 +722,21 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -722,21 +722,21 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
} }
void (*f)(int, int, int, int, void (*f)(int, int, int, int,
int, int, int, int, int, int, int, int, int, int,
float*, int, int, int, int, float*, int, int, int, int,
float*, int, int, int, int, float*, int, int, int, int,
float*, int, int, int, int, float*, int, int, int, int,
int, int, int); int, int, int);
//std::cerr << "initial_reduce_boundary " << initial_reduce_boundary << "\n"; //std::cerr << "initial_reduce_boundary " << initial_reduce_boundary << "\n";
//std::cerr << "kerns " << nstack << " " << kern_len << "\n"; //std::cerr << "kerns " << nstack << " " << kern_len << "\n";
//std::cerr << "n_reduce_buf/sizeof(float) " << n_reduce_buf / sizeof(float) << "\n"; //std::cerr << "n_reduce_buf/sizeof(float) " << n_reduce_buf / sizeof(float) << "\n";
if(block_nstack==nstack) if(block_nstack==nstack)
f=conv_valid_row_reduce<false>; f=conv_valid_row_reduce<false>;
else else
f=conv_valid_row_reduce<true>; f=conv_valid_row_reduce<true>;
f<<<n_blocks, n_threads, n_reduce_buf>>>( f<<<n_blocks, n_threads, n_reduce_buf>>>(
nbatch, nkern, CudaNdarray_HOST_DIMS(img)[1], nbatch, nkern, CudaNdarray_HOST_DIMS(img)[1],
img_len, img_wid, img_len, img_wid,
kern_len, kern_wid, kern_len, kern_wid,
...@@ -759,21 +759,21 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -759,21 +759,21 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
{ {
work_complete = true; work_complete = true;
if (verbose) if (verbose)
fprintf(stderr, "INFO: used 'conv_valid_row_reduce' version\n"); fprintf(stderr, "INFO: used 'conv_valid_row_reduce' version\n");
} }
else else
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i," "threads.x=%i, threads.y=%i, grid.x=%i,"
" shared_size=%i, nb_threads=%i\n", " shared_size=%i, nb_threads=%i\n",
n_threads.x, n_threads.y, n_blocks, n_threads.x, n_threads.y, n_blocks,
n_reduce_buf, n_threads.x * n_threads.y); n_reduce_buf, n_threads.x * n_threads.y);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_valid_row_reduce' failed (%s)," "INFO: impl 'conv_valid_row_reduce' failed (%s),"
" trying next implementation\n", " trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
...@@ -782,47 +782,47 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -782,47 +782,47 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
int outsize = CudaNdarray_SIZE(out); int outsize = CudaNdarray_SIZE(out);
int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS); int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS);
int n_threads = std::min(ceil_intdiv(outsize, n_blocks), int n_threads = std::min(ceil_intdiv(outsize, n_blocks),
NUM_VECTOR_OP_THREADS_PER_BLOCK); NUM_VECTOR_OP_THREADS_PER_BLOCK);
if (1) if (1)
{ {
if (verbose) if (verbose)
fprintf(stderr, "INFO: launching conv_reference_valid\n"); fprintf(stderr, "INFO: launching conv_reference_valid\n");
if (verbose>1) if (verbose>1)
fprintf(stderr, " img : %i %i %i %i %p %i %i %i %i\n", fprintf(stderr, " img : %i %i %i %i %p %i %i %i %i\n",
nbatch, CudaNdarray_HOST_DIMS(img)[1], img_len, img_wid, nbatch, CudaNdarray_HOST_DIMS(img)[1], img_len, img_wid,
img->devdata, img->devdata,
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[0],
CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[3]); CudaNdarray_HOST_STRIDES(img)[3]);
if (verbose>1) if (verbose>1)
fprintf(stderr, " kern: %i %i %i %i %p %i %i %i %i\n", fprintf(stderr, " kern: %i %i %i %i %p %i %i %i %i\n",
nkern, nstack, kern_len, kern_wid, nkern, nstack, kern_len, kern_wid,
kern->devdata, kern->devdata,
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[0],
CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[3]); CudaNdarray_HOST_STRIDES(kern)[3]);
if (verbose>1) if (verbose>1)
fprintf(stderr, " out : %i %i %i %i %p %i %i %i %i\n", fprintf(stderr, " out : %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_DIMS(out)[0], CudaNdarray_HOST_DIMS(out)[0],
CudaNdarray_HOST_DIMS(out)[1], out_len, out_wid, CudaNdarray_HOST_DIMS(out)[1], out_len, out_wid,
out->devdata, out->devdata,
CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[0],
CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[1],
CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[2],
CudaNdarray_HOST_STRIDES(out)[3]); CudaNdarray_HOST_STRIDES(out)[3]);
if (verbose>1) if (verbose>1)
fprintf(stderr, " launch params: %i %i %i\n", fprintf(stderr, " launch params: %i %i %i\n",
outsize, n_blocks, n_threads); outsize, n_blocks, n_threads);
} }
conv_reference_valid<<<n_blocks, n_threads>>>(nbatch, nkern, conv_reference_valid<<<n_blocks, n_threads>>>(nbatch, nkern,
CudaNdarray_HOST_DIMS(img)[1], CudaNdarray_HOST_DIMS(img)[1],
img_len, img_wid, img_len, img_wid,
kern_len, kern_wid, kern_len, kern_wid,
out_len, out_wid, out_len, out_wid,
img->devdata, img->devdata,
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[0],
CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[3], CudaNdarray_HOST_STRIDES(img)[3],
...@@ -834,7 +834,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -834,7 +834,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
out->devdata, out->devdata,
CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[0],
CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[1],
CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[2],
CudaNdarray_HOST_STRIDES(out)[3], CudaNdarray_HOST_STRIDES(out)[3],
subsample_rows, subsample_cols); subsample_rows, subsample_cols);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
...@@ -844,33 +844,33 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -844,33 +844,33 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
{ {
work_complete = true; work_complete = true;
if (verbose) if (verbose)
fprintf(stderr, "INFO: used 'conv_reference_valid' version\n"); fprintf(stderr, "INFO: used 'conv_reference_valid' version\n");
} }
else else
{ {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"ERROR: all implementations failed for" "ERROR: all implementations failed for"
" CudaNdarray_conv_valid! (%s)", " CudaNdarray_conv_valid! (%s)",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
return -1; return -1;
} }
} }
if (!work_complete) if (!work_complete)
{ {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"ERROR: no implementation(s) worked for" "ERROR: no implementation(s) worked for"
" CudaNdarray_conv_valid!" " CudaNdarray_conv_valid!"
" Version asked(%d) (-1 mean use an heuristic)", " Version asked(%d) (-1 mean use an heuristic)",
version); version);
return -1; return -1;
} }
return 0; return 0;
} }
int int
CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
CudaNdarray * out, int subsample_rows, CudaNdarray * out, int subsample_rows,
int subsample_cols, int version = -1, int verbose=0) int subsample_cols, int version = -1, int verbose=0)
{ {
//144 is the biggest static shared size used with compiling this file. //144 is the biggest static shared size used with compiling this file.
const int shared_avail = SHARED_SIZE - 150; const int shared_avail = SHARED_SIZE - 150;
...@@ -932,12 +932,12 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -932,12 +932,12 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
//const int out_size_byte = out_size*sizeof(float); // unused //const int out_size_byte = out_size*sizeof(float); // unused
if (!((THEANO_KERN_WID == CudaNdarray_HOST_DIMS(kern)[3]) || if (!((THEANO_KERN_WID == CudaNdarray_HOST_DIMS(kern)[3]) ||
(THEANO_KERN_WID == 0))){ (THEANO_KERN_WID == 0))){
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"ERROR: This GpuConv code was compiled for" "ERROR: This GpuConv code was compiled for"
" %d kernel columns, but the kernel we received" " %d kernel columns, but the kernel we received"
" had %d columns!", " had %d columns!",
THEANO_KERN_WID, CudaNdarray_HOST_DIMS(kern)[3]); THEANO_KERN_WID, CudaNdarray_HOST_DIMS(kern)[3]);
return -1; return -1;
} }
bool subsample = subsample_rows!=1 || subsample_cols!=1; bool subsample = subsample_rows!=1 || subsample_cols!=1;
...@@ -974,163 +974,163 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -974,163 +974,163 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
if (verbose>1) if (verbose>1)
{ {
printf("INFO: Running conv_full version=%d," printf("INFO: Running conv_full version=%d,"
" MACRO kern_width=%d with inputs:\n", version, THEANO_KERN_WID); " MACRO kern_width=%d with inputs:\n", version, THEANO_KERN_WID);
printf("INFO: img dim: %i %i %i %i img stride: %i %i %i %i\n", printf("INFO: img dim: %i %i %i %i img stride: %i %i %i %i\n",
CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1], CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],
CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3], CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3],
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[0],
CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[3]); CudaNdarray_HOST_STRIDES(img)[3]);
printf("INFO: kern dim: %i %i %i %i kern stride: %i %i %i %i\n", printf("INFO: kern dim: %i %i %i %i kern stride: %i %i %i %i\n",
CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1], CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],
CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3], CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[0],
CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[3]); CudaNdarray_HOST_STRIDES(kern)[3]);
} }
if (!subsample && if (!subsample &&
out_contiguous && out_contiguous &&
(version==3||version==4||version==5||version==-1) && (version==3||version==4||version==5||version==-1) &&
out_wid<512 &&//Maximum of 512 threads by block out_wid<512 &&//Maximum of 512 threads by block
(kern_len+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte<shared_avail && //their is only 16k of shared memory (kern_len+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch_stack_padded !work_complete) //conv_full_patch_stack_padded
{ {
//version 3 without split //version 3 without split
//version 4 with split (more registers) //version 4 with split (more registers)
//version 5 with split (more registers) low mem version(some restriction and still more register) //version 5 with split (more registers) low mem version(some restriction and still more register)
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.) int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if((version==4 || version==5) && out_len>1) nb_split++;//to force the use of split=true when testing. if((version==4 || version==5) && out_len>1) nb_split++;//to force the use of split=true when testing.
if(kern_len==1 && version==5){ if(kern_len==1 && version==5){
//version 5 don't support kern_len==1 as 1%0 return -1. //version 5 don't support kern_len==1 as 1%0 return -1.
version=-1; version=-1;
if(verbose)fprintf(stderr, "WARNING:conv full: Asking version 5 with kern_len==1. Combination not supported!\n"); if(verbose)fprintf(stderr, "WARNING:conv full: Asking version 5 with kern_len==1. Combination not supported!\n");
} }
if(img_size_padded_byte+kern_size_byte>shared_avail) version=5; if(img_size_padded_byte+kern_size_byte>shared_avail) version=5;
//we pass by ceil_intdiv in case the out_len is not a multiple //we pass by ceil_intdiv in case the out_len is not a multiple
//of nb_split, we want nb_split the number of iteration. //of nb_split, we want nb_split the number of iteration.
//Max of 16k of shared memory //Max of 16k of shared memory
if(version==5) if(version==5)
while ((((kern_len+ceil_intdiv(out_len,nb_split)-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte)>shared_avail) nb_split++; while ((((kern_len+ceil_intdiv(out_len,nb_split)-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte)>shared_avail) nb_split++;
//327 as we use 25 register //327 as we use 25 register
//version 5 will have only 1 block running at a time, so we //version 5 will have only 1 block running at a time, so we
//can use 32 registers per threads, but their is some other stuff that //can use 32 registers per threads, but their is some other stuff that
//for the limit to bu lower then 512. //for the limit to bu lower then 512.
int max_thread = (version!=5?327:450); int max_thread = (version!=5?327:450);
while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++; while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++;
if(version==-1 && out_size>512)version=4; if(version==-1 && out_size>512)version=4;
if(version==-1)version=3; if(version==-1)version=3;
if(version==-1 && nb_split>1) version=4; if(version==-1 && nb_split>1) version=4;
else if(version==-1) version=3; else if(version==-1) version=3;
//force version 4 when more than 1 split are needed to always execute. //force version 4 when more than 1 split are needed to always execute.
else if(version==3 && nb_split!=1) version=4; else if(version==3 && nb_split!=1) version=4;
assert(version!=3 || nb_split==1); assert(version!=3 || nb_split==1);
assert(version!=5 || kern_len>1); assert(version!=5 || kern_len>1);
assert(version!=-1); assert(version!=-1);
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split)); dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
dim3 grid(nbatch,nkern); dim3 grid(nbatch,nkern);
int shared_size=img_size_padded_byte + kern_size_byte; int shared_size=img_size_padded_byte + kern_size_byte;
if(version==5) if(version==5)
shared_size=((kern_len+threads.y-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte; shared_size=((kern_len+threads.y-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte;
void (*f)(float*, float*, float*, void (*f)(float*, float*, float*,
int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int,
int, int); int, int);
#define CONV_FULL_PATCH_STACK_PADDED_SPECIAL(kern_wid) \ #define CONV_FULL_PATCH_STACK_PADDED_SPECIAL(kern_wid) \
if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,false,false>;\ if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,false,false>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,true,false>;\ else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,true,false>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,false,true>;\ else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,false,true>;\
else if(version==3 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,false,false,false>;\ else if(version==3 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,false,false,false>;\
else if(version==4 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,true,false>;\ else if(version==4 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,true,false>;\
else if(version==5 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,false,true>;\ else if(version==5 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,false,true>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3) f=conv_full_patch_stack_padded<false,kern_wid,true,false,false>;\ else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3) f=conv_full_patch_stack_padded<false,kern_wid,true,false,false>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4) f=conv_full_patch_stack_padded<false,kern_wid,true,true,false>;\ else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4) f=conv_full_patch_stack_padded<false,kern_wid,true,true,false>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5) f=conv_full_patch_stack_padded<false,kern_wid,true,false,true>;\ else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5) f=conv_full_patch_stack_padded<false,kern_wid,true,false,true>;\
else if(version==3) f=conv_full_patch_stack_padded<false,kern_wid,false,false,false>;\ else if(version==3) f=conv_full_patch_stack_padded<false,kern_wid,false,false,false>;\
else if(version==4) f=conv_full_patch_stack_padded<false,kern_wid,false,true,false>;\ else if(version==4) f=conv_full_patch_stack_padded<false,kern_wid,false,true,false>;\
else if(version==5) f=conv_full_patch_stack_padded<false,kern_wid,false,false,true>;\ else if(version==5) f=conv_full_patch_stack_padded<false,kern_wid,false,false,true>;\
else assert(false); else assert(false);
CONV_FULL_PATCH_STACK_PADDED_SPECIAL(THEANO_KERN_WID); CONV_FULL_PATCH_STACK_PADDED_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size>>> f<<< grid, threads, shared_size>>>
(img->devdata, kern_data_unflipped, out->devdata, (img->devdata, kern_data_unflipped, out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack, img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row, img_stride_stack, img_stride_col, img_stride_row, img_stride_stack,
img_stride_batch, kern_stride_col_unflipped, kern_stride_row_unflipped, img_stride_batch, kern_stride_col_unflipped, kern_stride_row_unflipped,
kern_stride_stack, kern_stride_nkern); kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
if (verbose>1) if (verbose>1)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i," "threads.x=%i, threads.y=%i, threads.z=%i,"
" grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i," " grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i,"
" out_len=%i, nb_split=%i, version=%i\n", " out_len=%i, nb_split=%i, version=%i\n",
threads.x, threads.y, threads.z, threads.x, threads.y, threads.z,
grid.x, grid.y, shared_size, grid.x, grid.y, shared_size,
threads.x * threads.y * threads.z, threads.x * threads.y * threads.z,
out_len, nb_split, version); out_len, nb_split, version);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: used 'conv_full_patch_stack_padded'" "INFO: used 'conv_full_patch_stack_padded'"
" nb_split=%d low_mem=%s\n", " nb_split=%d low_mem=%s\n",
nb_split, (version==5?"true":"false")); nb_split, (version==5?"true":"false"));
work_complete = true; work_complete = true;
} }
else else
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i," "threads.x=%i, threads.y=%i, threads.z=%i,"
" grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i," " grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i,"
" out_len=%i, nb_split=%i, version=%i\n", " out_len=%i, nb_split=%i, version=%i\n",
threads.x, threads.y, threads.z, threads.x, threads.y, threads.z,
grid.x, grid.y, shared_size, grid.x, grid.y, shared_size,
threads.x * threads.y * threads.z, threads.x * threads.y * threads.z,
out_len, nb_split, version); out_len, nb_split, version);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_full_patch_stack_padded' %s %s" "INFO: impl 'conv_full_patch_stack_padded' %s %s"
" failed (%s), trying next implementation\n", " failed (%s), trying next implementation\n",
version==3?"no split": "split", version==3?"no split": "split",
(version==5?"low_mem":"not_low_mem"), (version==5?"low_mem":"not_low_mem"),
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
if (!subsample && c_contiguous && if (!subsample && c_contiguous &&
(version==0||version==-1) && (version==0||version==-1) &&
out_size<512 &&//Maximum of 512 theads by block out_size<512 &&//Maximum of 512 theads by block
nstack == 1 &&// don't implement the stack in the kernel. nstack == 1 &&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch !work_complete) //conv_full_patch
{ {
dim3 threads(out_wid, out_len); dim3 threads(out_wid, out_len);
dim3 grid(nbatch,nkern); dim3 grid(nbatch,nkern);
int shared_size=(img_size + kern_size)*sizeof(float); int shared_size=(img_size + kern_size)*sizeof(float);
//TODO assert c_continious for img, kern and out in the 2 inner dimensions. //TODO assert c_continious for img, kern and out in the 2 inner dimensions.
conv_full_patch<<< grid, threads, shared_size>>> conv_full_patch<<< grid, threads, shared_size>>>
(img->devdata, (img->devdata,
kern->devdata, kern->devdata,
out->devdata, out->devdata,
img_len, img_wid, img_len, img_wid,
kern_len, kern_wid, kern_len, kern_wid,
nkern, nstack); nkern, nstack);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
...@@ -1142,44 +1142,44 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -1142,44 +1142,44 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
else else
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n", " shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x, threads.y, grid.x, grid.y, shared_size,
threads.x * threads.y); threads.x * threads.y);
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"INFO: impl 'conv_full_patch' failed (%s)," "INFO: impl 'conv_full_patch' failed (%s),"
" trying next implementation\n", " trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
if (false && !subsample && //disabled as test fail for this kernel if (false && !subsample && //disabled as test fail for this kernel
(version==1||version==-1) && (version==1||version==-1) &&
out_size<512 &&//Maximum of 512 theads by block out_size<512 &&//Maximum of 512 theads by block
(nbatch > 20 || version==1) && // we only launch nbatch blocks, so make sure there is enough to be worth it, but if we specify the version, this check should not be done to allow testing. (nbatch > 20 || version==1) && // we only launch nbatch blocks, so make sure there is enough to be worth it, but if we specify the version, this check should not be done to allow testing.
nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_full_load_everything !work_complete) //conv_full_load_everything
{ {
dim3 threads(out_wid, out_len); dim3 threads(out_wid, out_len);
dim3 grid(nbatch); dim3 grid(nbatch);
int shared_size=(img_size + kern_size)*nstack*sizeof(float); int shared_size=(img_size + kern_size)*nstack*sizeof(float);
//TODO assert c_continious for img, kern and out in the 2 inner dimensions. //TODO assert c_continious for img, kern and out in the 2 inner dimensions.
//typeof(conv_full_load_everything<0>) f = ; //typeof(conv_full_load_everything<0>) f = ;
void (*f)(float*, float*, float*, void (*f)(float*, float*, float*,
int, int, int, int, int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int) = conv_full_load_everything<0>; int, int, int, int, int, int, int, int) = conv_full_load_everything<0>;
f = conv_full_load_everything<THEANO_KERN_WID>; f = conv_full_load_everything<THEANO_KERN_WID>;
f<<< grid, threads, shared_size>>> f<<< grid, threads, shared_size>>>
(img->devdata, (img->devdata,
kern->devdata, kern->devdata,
out->devdata, out->devdata,
img_len, img_wid, img_len, img_wid,
kern_len, kern_wid, kern_len, kern_wid,
nkern, nstack, nkern, nstack,
CudaNdarray_HOST_STRIDES(img)[3], CudaNdarray_HOST_STRIDES(img)[3],
CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[1],
...@@ -1200,34 +1200,34 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -1200,34 +1200,34 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
else else
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n", " shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x, threads.y, grid.x, grid.y, shared_size,
threads.x * threads.y); threads.x * threads.y);
if (verbose) if (verbose)
fprintf(stderr, "INFO: impl 'conv_full_load_everything'" fprintf(stderr, "INFO: impl 'conv_full_load_everything'"
" failed (%s), trying next implementation\n", " failed (%s), trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
if (!subsample && if (!subsample &&
img_batch_stack_contiguous && img_batch_stack_contiguous &&
out_contiguous && out_contiguous &&
(version==2||version==-1) && (version==2||version==-1) &&
out_size<512 &&//Maximum of 512 theads by block out_size<512 &&//Maximum of 512 theads by block
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch_stack !work_complete) //conv_full_patch_stack
{ {
dim3 threads(out_wid, out_len); dim3 threads(out_wid, out_len);
dim3 grid(nbatch,nkern); dim3 grid(nbatch,nkern);
int shared_size=(img_size + kern_size)*sizeof(float); int shared_size=(img_size + kern_size)*sizeof(float);
void (*f)(float*, float*, float*, void (*f)(float*, float*, float*,
int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int,
int, int, int, int); int, int, int, int);
if(img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack<true,true>;\ if(img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack<true,true>;\
else if(img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack<true,false>;\ else if(img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack<true,false>;\
...@@ -1248,20 +1248,20 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -1248,20 +1248,20 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
if (verbose) if (verbose)
fprintf(stderr, "INFO: used 'conv_full_patch_stack' version\n"); fprintf(stderr, "INFO: used 'conv_full_patch_stack' version\n");
work_complete = true; work_complete = true;
} }
else else
{ {
if (verbose) if (verbose)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n", " shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y); shared_size, threads.x * threads.y);
if (verbose) if (verbose)
fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n", fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
} }
} }
if (1 && !work_complete) //conv_reference_full if (1 && !work_complete) //conv_reference_full
...@@ -1271,60 +1271,60 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -1271,60 +1271,60 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
int outsize = CudaNdarray_SIZE(out); int outsize = CudaNdarray_SIZE(out);
int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS); int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS);
int n_threads = std::min(ceil_intdiv(outsize, n_blocks), int n_threads = std::min(ceil_intdiv(outsize, n_blocks),
NUM_VECTOR_OP_THREADS_PER_BLOCK); NUM_VECTOR_OP_THREADS_PER_BLOCK);
if (0) if (0)
{ {
if (verbose) if (verbose)
fprintf(stderr, "INFO: launching conv_reference_valid\n"); fprintf(stderr, "INFO: launching conv_reference_valid\n");
if (verbose) if (verbose)
fprintf(stderr, " img : %i %i %i %i %p %i %i %i %i\n", fprintf(stderr, " img : %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[0],
CudaNdarray_HOST_DIMS(img)[1], CudaNdarray_HOST_DIMS(img)[1],
CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[2],
CudaNdarray_HOST_DIMS(img)[3], CudaNdarray_HOST_DIMS(img)[3],
img->devdata, img->devdata,
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[0],
CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[3]); CudaNdarray_HOST_STRIDES(img)[3]);
if (verbose) if (verbose)
fprintf(stderr, " kern: %i %i %i %i %p %i %i %i %i\n", fprintf(stderr, " kern: %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[0],
CudaNdarray_HOST_DIMS(kern)[1], CudaNdarray_HOST_DIMS(kern)[1],
CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[2],
CudaNdarray_HOST_DIMS(kern)[3], CudaNdarray_HOST_DIMS(kern)[3],
kern->devdata, kern->devdata,
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[0],
CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[3] CudaNdarray_HOST_STRIDES(kern)[3]
); );
if (verbose) if (verbose)
fprintf(stderr, " out : %i %i %i %i %p %i %i %i %i\n", fprintf(stderr, " out : %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_DIMS(out)[0], CudaNdarray_HOST_DIMS(out)[0],
CudaNdarray_HOST_DIMS(out)[1], CudaNdarray_HOST_DIMS(out)[1],
CudaNdarray_HOST_DIMS(out)[2], CudaNdarray_HOST_DIMS(out)[2],
CudaNdarray_HOST_DIMS(out)[3], CudaNdarray_HOST_DIMS(out)[3],
out->devdata, out->devdata,
CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[0],
CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[1],
CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[2],
CudaNdarray_HOST_STRIDES(out)[3]); CudaNdarray_HOST_STRIDES(out)[3]);
if (verbose) if (verbose)
fprintf(stderr, " launch params: %i %i %i\n", fprintf(stderr, " launch params: %i %i %i\n",
outsize, n_blocks, n_threads); outsize, n_blocks, n_threads);
if (verbose) if (verbose)
fprintf(stderr, " subsample params: %i %i\n", fprintf(stderr, " subsample params: %i %i\n",
subsample_rows, subsample_cols); subsample_rows, subsample_cols);
} }
conv_reference_full<<<n_blocks, n_threads>>>( conv_reference_full<<<n_blocks, n_threads>>>(
CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(kern)[0],
CudaNdarray_HOST_DIMS(img)[1], CudaNdarray_HOST_DIMS(img)[1],
CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3], CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3],
CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3], CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
CudaNdarray_HOST_DIMS(out)[2], CudaNdarray_HOST_DIMS(out)[3], CudaNdarray_HOST_DIMS(out)[2], CudaNdarray_HOST_DIMS(out)[3],
img->devdata, CudaNdarray_HOST_STRIDES(img)[0], img->devdata, CudaNdarray_HOST_STRIDES(img)[0],
CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[3], CudaNdarray_HOST_STRIDES(img)[3],
kern->devdata, CudaNdarray_HOST_STRIDES(kern)[0], kern->devdata, CudaNdarray_HOST_STRIDES(kern)[0],
...@@ -1342,28 +1342,28 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -1342,28 +1342,28 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
if (verbose) if (verbose)
fprintf(stderr, "INFO: used 'conv_reference_full' version" fprintf(stderr, "INFO: used 'conv_reference_full' version"
" ishp(%d, %d) kshp(%d, %d) oshp(%d, %d) nbatch=%d" " ishp(%d, %d) kshp(%d, %d) oshp(%d, %d) nbatch=%d"
" nkern=%d nstack=%d subsample=%d\n", " nkern=%d nstack=%d subsample=%d\n",
img_len,img_wid, kern_len, kern_wid, img_len,img_wid, kern_len, kern_wid,
out_len, out_wid, nbatch, nkern, nstack, subsample); out_len, out_wid, nbatch, nkern, nstack, subsample);
work_complete = true; work_complete = true;
} }
else else
{ {
if (verbose) if (verbose)
fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i," fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n", " shared_size=%i, nb_threads=%i\n",
n_threads, 1, n_blocks, 1, 0, n_threads); n_threads, 1, n_blocks, 1, 0, n_threads);
if (verbose) if (verbose)
fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s)," fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s),"
" trying next implementation\n", " trying next implementation\n",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"ERROR: all implementations failed for" "ERROR: all implementations failed for"
" CudaNdarray_conv_full! (%s)", " CudaNdarray_conv_full! (%s)",
cudaGetErrorString(sts)); cudaGetErrorString(sts));
return -1; return -1;
} }
} }
return 0; return 0;
...@@ -1371,9 +1371,9 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -1371,9 +1371,9 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
PyObject * PyObject *
CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
CudaNdarray * out, const int mode, CudaNdarray * out, const int mode,
const int subsample_rows, const int subsample_cols, const int subsample_rows, const int subsample_cols,
const int version, const int verbose) const int version, const int verbose)
{ {
// Re-use the out object if possible. If the out object it not used, then its refcount is not modified. // Re-use the out object if possible. If the out object it not used, then its refcount is not modified.
// If the out object is re-used then it is returned, and its refcount is incremented by 1. // If the out object is re-used then it is returned, and its refcount is incremented by 1.
...@@ -1411,10 +1411,10 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, ...@@ -1411,10 +1411,10 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
if ( out if ( out
&& out->nd==4 && out->nd==4
&& CudaNdarray_is_c_contiguous(out) && CudaNdarray_is_c_contiguous(out)
&& CudaNdarray_HOST_DIMS(out)[0]==out_dim[0] && CudaNdarray_HOST_DIMS(out)[0]==out_dim[0]
&& CudaNdarray_HOST_DIMS(out)[1]==out_dim[1] && CudaNdarray_HOST_DIMS(out)[1]==out_dim[1]
&& CudaNdarray_HOST_DIMS(out)[2]==out_dim[2] && CudaNdarray_HOST_DIMS(out)[2]==out_dim[2]
&& CudaNdarray_HOST_DIMS(out)[3]==out_dim[3]) && CudaNdarray_HOST_DIMS(out)[3]==out_dim[3])
{ {
rval = out; rval = out;
Py_INCREF(rval); Py_INCREF(rval);
...@@ -1422,9 +1422,9 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, ...@@ -1422,9 +1422,9 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
else else
{ {
if (out && verbose) if (out && verbose)
fprintf(stderr, fprintf(stderr,
"INFO: Conv is ignoring 'out' argument with wrong" "INFO: Conv is ignoring 'out' argument with wrong"
" structure.\n"); " structure.\n");
rval = (CudaNdarray*)CudaNdarray_NewDims(4,out_dim); rval = (CudaNdarray*)CudaNdarray_NewDims(4,out_dim);
//rval might be null //rval might be null
} }
......
...@@ -37,20 +37,20 @@ conv_full_patch_split( float* img, float* kern, float* out, int img_len, int img ...@@ -37,20 +37,20 @@ conv_full_patch_split( float* img, float* kern, float* out, int img_len, int img
int img_row = out_row; int img_row = out_row;
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
int inverse_row = (img_row-row); int inverse_row = (img_row-row);
if(inverse_row<0 ||inverse_row>=(img_len))continue;//row outside the image if(inverse_row<0 ||inverse_row>=(img_len))continue;//row outside the image
const float* idx_in=&d_img[inverse_row*img_wid]; const float* idx_in=&d_img[inverse_row*img_wid];
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
int img_col = out_col; int img_col = out_col;
int col=0,last=0; int col=0,last=0;
for (col=0,last=img_col; col < kern_wid; col++,last--) {//loop over col for (col=0,last=img_col; col < kern_wid; col++,last--) {//loop over col
if(last<0 ||last>=(img_wid))continue;//col outside the image if(last<0 ||last>=(img_wid))continue;//col outside the image
sum+=idx_in[last]*idx_kern[col]; sum+=idx_in[last]*idx_kern[col];
} }
} }
out[batch_id*out_len*out_wid+//the output image out[batch_id*out_len*out_wid+//the output image
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
} }
} }
...@@ -61,8 +61,8 @@ conv_full_patch_split( float* img, float* kern, float* out, int img_len, int img ...@@ -61,8 +61,8 @@ conv_full_patch_split( float* img, float* kern, float* out, int img_len, int img
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid //dynamic shared memory: img_len*img_wid+kern_len*kern_wid
__global__ void __global__ void
conv_full_patch( float* img, float* kern, float* out, conv_full_patch( float* img, float* kern, float* out,
int img_len, int img_wid, int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack) int kern_len, int kern_wid, int nkern, int nstack)
{ {
int __shared__ out_len, out_wid, nb_thread_id; int __shared__ out_len, out_wid, nb_thread_id;
out_len = img_len + kern_len - 1; out_len = img_len + kern_len - 1;
...@@ -105,12 +105,12 @@ conv_full_patch( float* img, float* kern, float* out, ...@@ -105,12 +105,12 @@ conv_full_patch( float* img, float* kern, float* out,
if(img_col<0){col=-img_col;img_col+=col;} if(img_col<0){col=-img_col;img_col+=col;}
for (; col < max_col; col++, img_col++) {//loop over col for (; col < max_col; col++, img_col++) {//loop over col
sum+=idx_in[col]*idx_kern[col]; sum+=idx_in[col]*idx_kern[col];
} }
} }
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*blockIdx.y+//the output image out_wid*out_len*blockIdx.y+//the output image
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
} }
//we store the full image and the full kernel in the shared memory //we store the full image and the full kernel in the shared memory
...@@ -123,11 +123,11 @@ conv_full_patch( float* img, float* kern, float* out, ...@@ -123,11 +123,11 @@ conv_full_patch( float* img, float* kern, float* out,
template<bool img_c_contiguous_2d, bool kern_c_contiguous_2d> template<bool img_c_contiguous_2d, bool kern_c_contiguous_2d>
__global__ void __global__ void
conv_full_patch_stack( float* img, float* kern, float* out, conv_full_patch_stack( float* img, float* kern, float* out,
int img_len, int img_wid, int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack, int kern_len, int kern_wid, int nkern, int nstack,
int img_stride_col, int img_stride_row, int img_stride_col, int img_stride_row,
int kern_stride_col, int kern_stride_row, int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_nkern) int kern_stride_stack, int kern_stride_nkern)
{ {
int __shared__ out_len, out_wid, nb_thread_id; int __shared__ out_len, out_wid, nb_thread_id;
out_len = img_len + kern_len - 1; out_len = img_len + kern_len - 1;
...@@ -159,26 +159,26 @@ conv_full_patch_stack( float* img, float* kern, float* out, ...@@ -159,26 +159,26 @@ conv_full_patch_stack( float* img, float* kern, float* out,
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
if(row+out_row-kern_len+1<0 || row+out_row-kern_len+1>=img_len)continue; if(row+out_row-kern_len+1<0 || row+out_row-kern_len+1>=img_len)continue;
const float* idx_in=&d_img[(row+out_row-kern_len+1)*img_wid+out_col-kern_wid+1]; const float* idx_in=&d_img[(row+out_row-kern_len+1)*img_wid+out_col-kern_wid+1];
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
int col=0; int col=0;
int max_col=kern_wid; int max_col=kern_wid;
int img_col=out_col-kern_wid+1; int img_col=out_col-kern_wid+1;
max_col=min(max_col,img_wid-img_col); max_col=min(max_col,img_wid-img_col);
if(img_col<0){col=-img_col;img_col+=col;} if(img_col<0){col=-img_col;img_col+=col;}
for (; col < max_col; col++, img_col++) {//loop over col for (; col < max_col; col++, img_col++) {//loop over col
sum+=idx_in[col]*idx_kern[col]; sum+=idx_in[col]*idx_kern[col];
} }
} }
//Needed as not all thread finish at the same time the loop //Needed as not all thread finish at the same time the loop
//And we don't want to overwrite the shared memory. //And we don't want to overwrite the shared memory.
__syncthreads(); __syncthreads();
} }
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*blockIdx.y+//the output image out_wid*out_len*blockIdx.y+//the output image
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
} }
/** /**
...@@ -202,13 +202,13 @@ conv_full_patch_stack( float* img, float* kern, float* out, ...@@ -202,13 +202,13 @@ conv_full_patch_stack( float* img, float* kern, float* out,
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool low_mem > template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool low_mem >
__global__ void __global__ void
conv_full_patch_stack_padded( float* img, float* kern, float* out, conv_full_patch_stack_padded( float* img, float* kern, float* out,
const int img_len, const int img_wid, const int img_len, const int img_wid,
const int kern_len, const int kern_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack, const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row, const int img_stride_col, const int img_stride_row,
const int img_stride_stack, const int img_stride_batch, const int img_stride_stack, const int img_stride_batch,
const int kern_stride_col, const int kern_stride_row, const int kern_stride_col, const int kern_stride_row,
const int kern_stride_stack, const int kern_stride_nkern) const int kern_stride_stack, const int kern_stride_nkern)
{ {
int __shared__ out_len, out_wid, nb_thread_id; int __shared__ out_len, out_wid, nb_thread_id;
out_len = img_len + kern_len - 1; out_len = img_len + kern_len - 1;
...@@ -242,26 +242,26 @@ conv_full_patch_stack_padded( float* img, float* kern, float* out, ...@@ -242,26 +242,26 @@ conv_full_patch_stack_padded( float* img, float* kern, float* out,
const int out_row = ty;//output row const int out_row = ty;//output row
float sum = 0.0f; float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++,kern+=kern_stride_stack, for (int stack = 0;stack<nstack;stack++,kern+=kern_stride_stack,
img+=img_stride_stack){ img+=img_stride_stack){
__syncthreads(); __syncthreads();
load_padded_col_to_shared(d_img+img_wid_valid*(kern_len-1),img, load_padded_col_to_shared(d_img+img_wid_valid*(kern_len-1),img,
thread_id,nb_thread_id,img_wid,img_len, thread_id,nb_thread_id,img_wid,img_len,
img_stride_col, img_stride_row, kern_wid-1, img_stride_col, img_stride_row, kern_wid-1,
c_contiguous); c_contiguous);
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid,kern_len, load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous); kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads(); __syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col]; const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
} }
} }
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image kern_id*out_wid*out_len+//the output image
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
}else if(split && !low_mem){ }else if(split && !low_mem){
fill(d_img,img_wid_valid*(img_len+2*kern_len-2), 0, thread_id, nb_thread_id); fill(d_img,img_wid_valid*(img_len+2*kern_len-2), 0, thread_id, nb_thread_id);
//out_len_max must by higher then out_len as we need all thread when we load the image as the nb_rows is not always a multiple of out_len. //out_len_max must by higher then out_len as we need all thread when we load the image as the nb_rows is not always a multiple of out_len.
...@@ -269,35 +269,35 @@ conv_full_patch_stack_padded( float* img, float* kern, float* out, ...@@ -269,35 +269,35 @@ conv_full_patch_stack_padded( float* img, float* kern, float* out,
//TODO pass a parameter nb_split //TODO pass a parameter nb_split
out_len_max = (out_len/blockDim.y+(out_len%blockDim.y==0?0:1))*blockDim.y; out_len_max = (out_len/blockDim.y+(out_len%blockDim.y==0?0:1))*blockDim.y;
for(int out_row = ty;out_row<out_len_max;out_row+=nb_rows){ for(int out_row = ty;out_row<out_len_max;out_row+=nb_rows){
float sum = 0.0f; float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++){ for (int stack = 0;stack<nstack;stack++){
__syncthreads(); __syncthreads();
//TODO: load only the part of the image needed or put the partial result in shared memory //TODO: load only the part of the image needed or put the partial result in shared memory
load_padded_col_to_shared(d_img+img_wid_valid*(kern_len-1), load_padded_col_to_shared(d_img+img_wid_valid*(kern_len-1),
img+img_stride_stack*stack, img+img_stride_stack*stack,
thread_id,nb_thread_id,img_wid,img_len, thread_id,nb_thread_id,img_wid,img_len,
img_stride_col, img_stride_row, kern_wid-1, img_stride_col, img_stride_row, kern_wid-1,
c_contiguous); c_contiguous);
load_to_shared(d_kern, kern+kern_stride_stack*stack, load_to_shared(d_kern, kern+kern_stride_stack*stack,
thread_id, nb_thread_id, kern_wid,kern_len, thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous); kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads(); __syncthreads();
//The if is needed as on Fermi as reading out of bound index from shared memory generate an error. //The if is needed as on Fermi as reading out of bound index from shared memory generate an error.
//Not needed on generation before as they worked anyway. Removing the if generate the good code //Not needed on generation before as they worked anyway. Removing the if generate the good code
//as we store the result of only the good thread. //as we store the result of only the good thread.
//This was with nvcc 3.0 on an GTX470 card. //This was with nvcc 3.0 on an GTX470 card.
if(out_row<out_len) if(out_row<out_len)
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col]; const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
} }
if(out_row<out_len) if(out_row<out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*kern_id+//the output image out_wid*out_len*kern_id+//the output image
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
} }
} }
}else{//low_mem version }else{//low_mem version
//don't need to fill the last rows padding as this is done later. //don't need to fill the last rows padding as this is done later.
...@@ -306,46 +306,46 @@ conv_full_patch_stack_padded( float* img, float* kern, float* out, ...@@ -306,46 +306,46 @@ conv_full_patch_stack_padded( float* img, float* kern, float* out,
__shared__ int out_len_max; __shared__ int out_len_max;
//TODO pass a parameter nb_split //TODO pass a parameter nb_split
if(thread_id==0) if(thread_id==0)
out_len_max = (out_len/nb_rows+(out_len%nb_rows==0?0:1))*nb_rows; out_len_max = (out_len/nb_rows+(out_len%nb_rows==0?0:1))*nb_rows;
__syncthreads(); __syncthreads();
for(int out_row = ty, out_row_iter=0;out_row<out_len_max; for(int out_row = ty, out_row_iter=0;out_row<out_len_max;
out_row+=nb_rows, out_row_iter++){ out_row+=nb_rows, out_row_iter++){
float sum = 0.0f; float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++){ for (int stack = 0;stack<nstack;stack++){
__syncthreads(); __syncthreads();
const int len_to_load=min(kern_len+nb_rows,img_len-out_row_iter*nb_rows);//nb rows to load, min(nb_rows for this iter, nb rows left in the image) const int len_to_load=min(kern_len+nb_rows,img_len-out_row_iter*nb_rows);//nb rows to load, min(nb_rows for this iter, nb rows left in the image)
const int empty_row = max(kern_len-1-out_row_iter*nb_rows,0);//number of empty row at the start const int empty_row = max(kern_len-1-out_row_iter*nb_rows,0);//number of empty row at the start
//we need to reload some row as when we change of out_row we lost the last load du to the stack. //we need to reload some row as when we change of out_row we lost the last load du to the stack.
const int previous_row = min(out_row_iter*nb_rows,kern_len-1);//number of row from last out_row iteration to reload const int previous_row = min(out_row_iter*nb_rows,kern_len-1);//number of row from last out_row iteration to reload
load_padded_col_to_shared(d_img+(kern_len-1-previous_row)*img_wid_valid, load_padded_col_to_shared(d_img+(kern_len-1-previous_row)*img_wid_valid,
img+img_stride_stack*stack//the good stack image img+img_stride_stack*stack//the good stack image
+(out_row_iter*nb_rows-previous_row)*img_stride_row,//the good split top row. +(out_row_iter*nb_rows-previous_row)*img_stride_row,//the good split top row.
thread_id,nb_thread_id,img_wid, thread_id,nb_thread_id,img_wid,
len_to_load+previous_row, len_to_load+previous_row,
img_stride_col, img_stride_row, kern_wid-1, img_stride_col, img_stride_row, kern_wid-1,
c_contiguous); c_contiguous);
//TODO: fill the last row padding only when needed. //TODO: fill the last row padding only when needed.
//We always fill the last rows padding event when not needed. //We always fill the last rows padding event when not needed.
int row_to_fill = 2*kern_len-2+nb_rows- empty_row - previous_row - len_to_load; int row_to_fill = 2*kern_len-2+nb_rows- empty_row - previous_row - len_to_load;
row_to_fill = min(row_to_fill,kern_len-1); row_to_fill = min(row_to_fill,kern_len-1);
fill(d_img+(kern_len-1+len_to_load)*img_wid_valid, fill(d_img+(kern_len-1+len_to_load)*img_wid_valid,
img_wid_valid*row_to_fill, 0, thread_id, nb_thread_id); img_wid_valid*row_to_fill, 0, thread_id, nb_thread_id);
load_to_shared(d_kern, kern+kern_stride_stack*stack, load_to_shared(d_kern, kern+kern_stride_stack*stack,
thread_id, nb_thread_id, kern_wid,kern_len, thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous); kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads(); __syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row-out_row_iter*nb_rows)*img_wid_valid+out_col]; const float* idx_in=&d_img[(row+out_row-out_row_iter*nb_rows)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
} }
} }
if(out_row<out_len) if(out_row<out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*kern_id+//the output image out_wid*out_len*kern_id+//the output image
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
} }
} }
} }
...@@ -366,8 +366,8 @@ template <> __device__ float everything_dot<1>(const float * x, const int sx, co ...@@ -366,8 +366,8 @@ template <> __device__ float everything_dot<1>(const float * x, const int sx, co
template<int NSTACK> template<int NSTACK>
__global__ void __global__ void
conv_full_load_everything( float* img, float* kern, float* out, conv_full_load_everything( float* img, float* kern, float* out,
int img_len, int img_wid, int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack, int kern_len, int kern_wid, int nkern, int nstack,
int img_stride_col, int img_stride_row, int img_stride_col, int img_stride_row,
int img_stride_stack, int img_stride_batch, int img_stride_stack, int img_stride_batch,
int kern_stride_col, int kern_stride_row, int kern_stride_col, int kern_stride_row,
......
...@@ -20,7 +20,7 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) { ...@@ -20,7 +20,7 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) {
const %(type)s* idx_kern=&hvals[j*dim_ker[1]]; const %(type)s* idx_kern=&hvals[j*dim_ker[1]];
int new_n = (pos_n+dim_ker[1]-1); int new_n = (pos_n+dim_ker[1]-1);
for (int k=0,last=new_n; k < dim_ker[1]; k++,last--) { for (int k=0,last=new_n; k < dim_ker[1]; k++,last--) {
sum+=idx_kern[k]*idx_in[last]; sum+=idx_kern[k]*idx_in[last];
} }
}//for j }//for j
out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum; out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum;
...@@ -99,9 +99,9 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_ ...@@ -99,9 +99,9 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_
* We load from global memory to shared memory. The outer if is optimized away at compilation. * We load from global memory to shared memory. The outer if is optimized away at compilation.
*/ */
__device__ void load_to_shared(float * dst, const float * src, const int thread_id, __device__ void load_to_shared(float * dst, const float * src, const int thread_id,
int nb_thread, const int nb_col, const int nb_row, int nb_thread, const int nb_col, const int nb_row,
const int stride_col, const int stride_row, const int stride_col, const int stride_row,
const bool flipped=false, const bool c_contiguous=true){ const bool flipped=false, const bool c_contiguous=true){
if (c_contiguous) if (c_contiguous)
{ {
load_to_shared(dst, src, thread_id, nb_thread, nb_col*nb_row, flipped); load_to_shared(dst, src, thread_id, nb_thread, nb_col*nb_row, flipped);
...@@ -143,10 +143,10 @@ __device__ void fill(float * dst, int N, float value, int thread_id, int nb_thre ...@@ -143,10 +143,10 @@ __device__ void fill(float * dst, int N, float value, int thread_id, int nb_thre
* We put the image at the center of another one. Usefull to padd an image with 0. * We put the image at the center of another one. Usefull to padd an image with 0.
*/ */
__device__ void load_padded_col_to_shared(float * dst, const float * src, __device__ void load_padded_col_to_shared(float * dst, const float * src,
const int thread_id, const int nb_thread, const int thread_id, const int nb_thread,
const int nb_col, const int nb_row, const int nb_col, const int nb_row,
const int stride_col, const int stride_row, const int stride_col, const int stride_row,
const int wid_pad, const bool c_contiguous=true){ const int wid_pad, const bool c_contiguous=true){
if(c_contiguous){//flipped==false if(c_contiguous){//flipped==false
for(int i=thread_id;i<nb_col*nb_row;i+=nb_thread){ for(int i=thread_id;i<nb_col*nb_row;i+=nb_thread){
int col=i%nb_col; int col=i%nb_col;
...@@ -165,24 +165,24 @@ __device__ void load_padded_col_to_shared(float * dst, const float * src, ...@@ -165,24 +165,24 @@ __device__ void load_padded_col_to_shared(float * dst, const float * src,
} }
template<int i> __device__ float convolutionRowNoFlip(const float *data, template<int i> __device__ float convolutionRowNoFlip(const float *data,
const float *kern){ const float *kern){
return convolutionRowNoFlip<i/2>(data, kern)+ convolutionRowNoFlip<(i+1)/2>(data+i/2, kern+i/2) ; return convolutionRowNoFlip<i/2>(data, kern)+ convolutionRowNoFlip<(i+1)/2>(data+i/2, kern+i/2) ;
//return data[i-1] * kern[i-1] + convolutionRowNoFlip<i - 1>(data,kern); //return data[i-1] * kern[i-1] + convolutionRowNoFlip<i - 1>(data,kern);
} }
template<> __device__ float convolutionRowNoFlip<1>(const float *data, template<> __device__ float convolutionRowNoFlip<1>(const float *data,
const float *kern){ const float *kern){
return data[0]*kern[0]; return data[0]*kern[0];
} }
template<> __device__ float convolutionRowNoFlip<0>(const float *data, template<> __device__ float convolutionRowNoFlip<0>(const float *data,
const float *kern){ const float *kern){
return 0; return 0;
} }
template<int KERN_WIDTH> template<int KERN_WIDTH>
__device__ void convolutionRowNoFlip(float& sum, __device__ void convolutionRowNoFlip(float& sum,
const float *data, const float *data,
const float *kern, const int kern_wid){ const float *kern, const int kern_wid){
if(KERN_WIDTH>0) if(KERN_WIDTH>0)
sum+=convolutionRowNoFlip<KERN_WIDTH>(data,kern); sum+=convolutionRowNoFlip<KERN_WIDTH>(data,kern);
else else
...@@ -219,8 +219,8 @@ __device__ void store_or_accumulate(float& dst,const float value ){ ...@@ -219,8 +219,8 @@ __device__ void store_or_accumulate(float& dst,const float value ){
template<bool flipped_kern, int KERN_WIDTH, bool split> template<bool flipped_kern, int KERN_WIDTH, bool split>
__global__ void __global__ void
conv_patch( float* img, float* kern, float* out, conv_patch( float* img, float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid, int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack) int nkern, int nstack)
{ {
int __shared__ out_len, out_wid, nb_thread_id; int __shared__ out_len, out_wid, nb_thread_id;
out_len = img_len - kern_len + 1; out_len = img_len - kern_len + 1;
...@@ -255,24 +255,24 @@ conv_patch( float* img, float* kern, float* out, ...@@ -255,24 +255,24 @@ conv_patch( float* img, float* kern, float* out,
int out_row = ty;//output row int out_row = ty;//output row
float sum = 0.0f; float sum = 0.0f;
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col]; const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
} }
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
blockIdx.y*out_wid*out_len+//the output image blockIdx.y*out_wid*out_len+//the output image
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
}else{ }else{
for(int out_row=ty;out_row<out_len;out_row+=blockDim.y){ for(int out_row=ty;out_row<out_len;out_row+=blockDim.y){
float sum = 0.0f; float sum = 0.0f;
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col]; const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
} }
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image kern_id*out_wid*out_len+//the output image
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
} }
} }
} }
...@@ -302,12 +302,12 @@ conv_patch( float* img, float* kern, float* out, ...@@ -302,12 +302,12 @@ conv_patch( float* img, float* kern, float* out,
template<bool flipped_kern, bool accumulate, int KERN_WIDTH, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample> template<bool flipped_kern, bool accumulate, int KERN_WIDTH, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample>
__global__ void __global__ void
conv_patch_stack( float* img, float* kern, float* out, conv_patch_stack( float* img, float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid, int img_len, int img_wid, int kern_len, int kern_wid,
int out_len, int out_wid, int out_len, int out_wid,
int nkern, int nstack, int img_stride_col,int img_stride_row, int nkern, int nstack, int img_stride_col,int img_stride_row,
int img_stride_stack, int img_stride_batch, int img_stride_stack, int img_stride_batch,
int kern_stride_col, int kern_stride_row, int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_nkern, int dx, int dy) int kern_stride_stack, int kern_stride_nkern, int dx, int dy)
{ {
int __shared__ nb_thread_id; int __shared__ nb_thread_id;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x; nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
...@@ -333,43 +333,43 @@ conv_patch_stack( float* img, float* kern, float* out, ...@@ -333,43 +333,43 @@ conv_patch_stack( float* img, float* kern, float* out,
float sum = 0.0f; float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++,kern+=kern_stride_stack, for (int stack = 0;stack<nstack;stack++,kern+=kern_stride_stack,
img+=img_stride_stack){ img+=img_stride_stack){
load_to_shared(d_img,img,thread_id,nb_thread_id,img_wid,img_len, load_to_shared(d_img,img,thread_id,nb_thread_id,img_wid,img_len,
img_stride_col, img_stride_row, false, img_c_contiguous_2d); img_stride_col, img_stride_row, false, img_c_contiguous_2d);
if(preload_full_kern) if(preload_full_kern)
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid,kern_len, load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d); kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
__syncthreads(); __syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
if(!preload_full_kern){ if(!preload_full_kern){
__syncthreads(); __syncthreads();
int idx2; int idx2;
if(flipped_kern) idx2=(kern_len-row-1)*kern_stride_row; if(flipped_kern) idx2=(kern_len-row-1)*kern_stride_row;
else idx2=(row)*kern_stride_row; else idx2=(row)*kern_stride_row;
load_to_shared(d_kern, kern+idx2, thread_id, nb_thread_id, kern_wid,1, load_to_shared(d_kern, kern+idx2, thread_id, nb_thread_id, kern_wid,1,
kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d); kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
__syncthreads(); __syncthreads();
} }
const float* idx_kern; const float* idx_kern;
if(preload_full_kern) idx_kern=&d_kern[row*kern_wid]; if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
else idx_kern=d_kern; else idx_kern=d_kern;
const float* idx_in; const float* idx_in;
if(subsample) if(subsample)
idx_in=&d_img[(row+out_row*dx)*img_wid+out_col*dy]; idx_in=&d_img[(row+out_row*dx)*img_wid+out_col*dy];
else else
idx_in=&d_img[(row+out_row)*img_wid+out_col]; idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
} }
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
} }
store_or_accumulate<accumulate>( store_or_accumulate<accumulate>(
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*kern_id+//the output image out_wid*out_len*kern_id+//the output image
out_row*out_wid+out_col],sum); out_row*out_wid+out_col],sum);
}else{ }else{
float __shared__ *kern_, *img_; float __shared__ *kern_, *img_;
...@@ -383,54 +383,54 @@ conv_patch_stack( float* img, float* kern, float* out, ...@@ -383,54 +383,54 @@ conv_patch_stack( float* img, float* kern, float* out,
//TODO: inverse the out_row and stack loop to don't load the date as frequently! //TODO: inverse the out_row and stack loop to don't load the date as frequently!
//TODO: do this happen elsewhere? //TODO: do this happen elsewhere?
for(;out_row<out_len_max;out_row+=blockDim.y){ for(;out_row<out_len_max;out_row+=blockDim.y){
float sum = 0.0f; float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++){ for (int stack = 0;stack<nstack;stack++){
//TODO: load only the part of the image needed or put the partial result in shared memory //TODO: load only the part of the image needed or put the partial result in shared memory
int idx1=img_stride_stack*stack; int idx1=img_stride_stack*stack;
load_to_shared(d_img,img_+idx1,thread_id,nb_thread_id,img_wid,img_len, load_to_shared(d_img,img_+idx1,thread_id,nb_thread_id,img_wid,img_len,
img_stride_col, img_stride_row, false, img_c_contiguous_2d); img_stride_col, img_stride_row, false, img_c_contiguous_2d);
if(preload_full_kern){ if(preload_full_kern){
int idx2=kern_stride_stack*stack; int idx2=kern_stride_stack*stack;
load_to_shared(d_kern, kern_+idx2, thread_id, nb_thread_id, kern_wid,kern_len, load_to_shared(d_kern, kern_+idx2, thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d); kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
} }
__syncthreads(); __syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
if(!preload_full_kern){ if(!preload_full_kern){
__syncthreads(); __syncthreads();
int idx2=kern_stride_stack*stack; int idx2=kern_stride_stack*stack;
if(flipped_kern) if(flipped_kern)
idx2+=(kern_len-row-1)*kern_stride_row; idx2+=(kern_len-row-1)*kern_stride_row;
else else
idx2+=(row)*kern_stride_row; idx2+=(row)*kern_stride_row;
load_to_shared(d_kern, kern_+idx2, thread_id, nb_thread_id, kern_wid,1, load_to_shared(d_kern, kern_+idx2, thread_id, nb_thread_id, kern_wid,1,
kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d); kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
__syncthreads(); __syncthreads();
} }
const float* idx_kern; const float* idx_kern;
if(preload_full_kern) idx_kern=&d_kern[row*kern_wid]; if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
else idx_kern=d_kern; else idx_kern=d_kern;
const float* idx_in; const float* idx_in;
if(subsample) if(subsample)
idx_in=&d_img[(row+out_row*dx)*img_wid+out_col*dy]; idx_in=&d_img[(row+out_row*dx)*img_wid+out_col*dy];
else else
idx_in=&d_img[(row+out_row)*img_wid+out_col]; idx_in=&d_img[(row+out_row)*img_wid+out_col];
//if needed as on Fermi as reading out of bound index from shared memory generate an error. //if needed as on Fermi as reading out of bound index from shared memory generate an error.
//Not needed on generation before as they worked anyway. Removing the if generate the good code //Not needed on generation before as they worked anyway. Removing the if generate the good code
//as we store the result of only the good thread. //as we store the result of only the good thread.
//This was with nvcc 3.0 on an GTX470 card. //This was with nvcc 3.0 on an GTX470 card.
if(out_row<out_len) if(out_row<out_len)
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
} }
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
} }
if(out_row<out_len) if(out_row<out_len)
store_or_accumulate<accumulate>( store_or_accumulate<accumulate>(
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*kern_id+//the output image out_wid*out_len*kern_id+//the output image
out_row*out_wid+out_col],sum); out_row*out_wid+out_col],sum);
} }
} }
...@@ -454,11 +454,11 @@ conv_patch_stack( float* img, float* kern, float* out, ...@@ -454,11 +454,11 @@ conv_patch_stack( float* img, float* kern, float* out,
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool preload_full_kern> template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool preload_full_kern>
__global__ void __global__ void
conv_patch_stack_reduce( float* img, float* kern, float* out, conv_patch_stack_reduce( float* img, float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid, int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack, int img_stride_col,int img_stride_row, int nkern, int nstack, int img_stride_col,int img_stride_row,
int img_stride_stack, int img_stride_batch, int img_stride_stack, int img_stride_batch,
int kern_stride_col, int kern_stride_row, int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_nkern) int kern_stride_stack, int kern_stride_nkern)
{ {
//int __shared__ out_len, out_wid, nb_thread_id; //int __shared__ out_len, out_wid, nb_thread_id;
//out_len = img_len - kern_len + 1; //out_len = img_len - kern_len + 1;
...@@ -496,12 +496,12 @@ conv_patch_stack_reduce( float* img, float* kern, float* out, ...@@ -496,12 +496,12 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
img+=img_stride_batch*batch_id;//the good batch img+=img_stride_batch*batch_id;//the good batch
for (int stack = 0;stack<nstack;stack++,kern+=kern_stride_stack, for (int stack = 0;stack<nstack;stack++,kern+=kern_stride_stack,
img+=img_stride_stack){ img+=img_stride_stack){
__syncthreads(); __syncthreads();
load_to_shared(d_img, img, thread_id, nb_thread_id, img_wid, img_len, load_to_shared(d_img, img, thread_id, nb_thread_id, img_wid, img_len,
img_stride_col, img_stride_row, false, c_contiguous); img_stride_col, img_stride_row, false, c_contiguous);
if(split && ! preload_full_kern){ if(split && ! preload_full_kern){
for(int first_row=0;first_row<kern_len;first_row+=blockDim.z){ for(int first_row=0;first_row<kern_len;first_row+=blockDim.z){
//N.B. - Jan 30, 2011 with CUDA 3.2 I found that without the explicit cast to //N.B. - Jan 30, 2011 with CUDA 3.2 I found that without the explicit cast to
// (int)blockDim.z, idx3 would sometimes be negative. I'm rusty on my signed vs. unsigned // (int)blockDim.z, idx3 would sometimes be negative. I'm rusty on my signed vs. unsigned
// details, but that seemed really weird. tricky bug to find too. // details, but that seemed really weird. tricky bug to find too.
...@@ -510,36 +510,36 @@ conv_patch_stack_reduce( float* img, float* kern, float* out, ...@@ -510,36 +510,36 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
: first_row; : first_row;
int len3 = min(blockDim.z, kern_len - first_row); int len3 = min(blockDim.z, kern_len - first_row);
__syncthreads(); __syncthreads();
load_to_shared(d_kern, kern+idx3*kern_stride_row, thread_id, nb_thread_id, kern_wid, len3, load_to_shared(d_kern, kern+idx3*kern_stride_row, thread_id, nb_thread_id, kern_wid, len3,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous); kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads(); __syncthreads();
const float* idx_kern=&d_kern[tz*kern_wid]; const float* idx_kern=&d_kern[tz*kern_wid];
const float* idx_in=&d_img[(first_row+tz+out_row)*img_wid+out_col]; const float* idx_in=&d_img[(first_row+tz+out_row)*img_wid+out_col];
float sum2 = 0; float sum2 = 0;
if(tz<len3) if(tz<len3)
convolutionRowNoFlip<KERN_WIDTH>(sum2,idx_in,idx_kern,kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum2,idx_in,idx_kern,kern_wid);
sum+=sum2; sum+=sum2;
} }
}else if(split){ }else if(split){
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len, load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous); kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads(); __syncthreads();
for(int row=tz;row<kern_len;row+=blockDim.z){ for(int row=tz;row<kern_len;row+=blockDim.z){
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col]; const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
} }
}else{ }else{
int row = tz;//The row of the kernel. int row = tz;//The row of the kernel.
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col]; const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len, load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous); kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads(); __syncthreads();
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
} }
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
} }
//reduce no sync because previous loop ends with sync //reduce no sync because previous loop ends with sync
...@@ -548,11 +548,11 @@ conv_patch_stack_reduce( float* img, float* kern, float* out, ...@@ -548,11 +548,11 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
if(thread_id<out_len*out_wid){ // blockDim.x==out_wid, blockDim.y==out_len if(thread_id<out_len*out_wid){ // blockDim.x==out_wid, blockDim.y==out_len
//sum=0; //sum=0;
for(int i=1;i<blockDim.z;i++){ for(int i=1;i<blockDim.z;i++){
sum+=d_reduce[thread_id+i*out_wid*out_len]; sum+=d_reduce[thread_id+i*out_wid*out_len];
} }
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*blockIdx.y+//the output image out_wid*out_len*blockIdx.y+//the output image
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
} }
} }
...@@ -570,12 +570,12 @@ conv_patch_stack_reduce( float* img, float* kern, float* out, ...@@ -570,12 +570,12 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
template<int KERN_WIDTH, bool c_contiguous> template<int KERN_WIDTH, bool c_contiguous>
__global__ void __global__ void
conv_rows( float* img, float* kern, float* out, conv_rows( float* img, float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid, int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack, int nkern, int nstack,
int img_stride_col, int img_stride_row, int img_stride_col, int img_stride_row,
int img_stride_stack, int img_stride_batch, int img_stride_stack, int img_stride_batch,
int kern_stride_col, int kern_stride_row, int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_nkern) int kern_stride_stack, int kern_stride_nkern)
{ {
int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id; int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id;
float __shared__ *d_img, *d_kern; float __shared__ *d_img, *d_kern;
...@@ -599,9 +599,9 @@ conv_rows( float* img, float* kern, float* out, ...@@ -599,9 +599,9 @@ conv_rows( float* img, float* kern, float* out,
kern+=kern_stride_nkern*kern_id;//the good nkern kern+=kern_stride_nkern*kern_id;//the good nkern
load_to_shared(d_img,img,thread_id,nb_thread_id,img_wid,kern_len, load_to_shared(d_img,img,thread_id,nb_thread_id,img_wid,kern_len,
img_stride_col, img_stride_row, false, c_contiguous); img_stride_col, img_stride_row, false, c_contiguous);
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid,kern_len, load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, true, c_contiguous); kern_stride_col, kern_stride_row, true, c_contiguous);
__syncthreads(); __syncthreads();
float sum = 0.0f; float sum = 0.0f;
...@@ -613,8 +613,8 @@ conv_rows( float* img, float* kern, float* out, ...@@ -613,8 +613,8 @@ conv_rows( float* img, float* kern, float* out,
} }
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image kern_id*out_wid*out_len+//the output image
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
} }
/** /**
...@@ -631,12 +631,12 @@ conv_rows( float* img, float* kern, float* out, ...@@ -631,12 +631,12 @@ conv_rows( float* img, float* kern, float* out,
template<int KERN_WIDTH, bool c_contiguous> template<int KERN_WIDTH, bool c_contiguous>
__global__ void __global__ void
conv_rows_stack( float* img, float* kern, float* out, conv_rows_stack( float* img, float* kern, float* out,
const int img_len, const int img_wid, const int kern_len, const int kern_wid, const int img_len, const int img_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack, const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row, const int img_stride_col, const int img_stride_row,
const int img_stride_stack, const int img_stride_batch, const int img_stride_stack, const int img_stride_batch,
const int kern_stride_col, const int kern_stride_row, const int kern_stride_col, const int kern_stride_row,
const int kern_stride_stack, const int kern_stride_nkern) const int kern_stride_stack, const int kern_stride_nkern)
{ {
int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows; int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows;
float __shared__ *d_img, *d_kern; float __shared__ *d_img, *d_kern;
...@@ -698,21 +698,21 @@ conv_rows_stack( float* img, float* kern, float* out, ...@@ -698,21 +698,21 @@ conv_rows_stack( float* img, float* kern, float* out,
offset = kern_stride_nkern * kern_id + kern_stride_stack * stack; offset = kern_stride_nkern * kern_id + kern_stride_stack * stack;
load_to_shared(d_kern, kern+offset, thread_id, nb_thread_id, kern_wid,kern_len, load_to_shared(d_kern, kern+offset, thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, true, c_contiguous); kern_stride_col, kern_stride_row, true, c_contiguous);
__syncthreads(); __syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+shared_row)*img_wid+out_col]; const float* idx_in=&d_img[(row+shared_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
} }
__syncthreads();//to be sure all thread have finished before we modif the shared memory. __syncthreads();//to be sure all thread have finished before we modif the shared memory.
} }
if (out_row < out_len) if (out_row < out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image kern_id*out_wid*out_len+//the output image
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
} }
/** /**
...@@ -729,12 +729,12 @@ conv_rows_stack( float* img, float* kern, float* out, ...@@ -729,12 +729,12 @@ conv_rows_stack( float* img, float* kern, float* out,
template<int KERN_WIDTH, bool c_contiguous, bool preload_full_kern> template<int KERN_WIDTH, bool c_contiguous, bool preload_full_kern>
__global__ void __global__ void
conv_rows_stack2( float* img, float* kern, float* out, conv_rows_stack2( float* img, float* kern, float* out,
const int img_len, const int img_wid, const int kern_len, const int kern_wid, const int img_len, const int img_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack, const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row, const int img_stride_col, const int img_stride_row,
const int img_stride_stack, const int img_stride_batch, const int img_stride_stack, const int img_stride_batch,
const int kern_stride_col, const int kern_stride_row, const int kern_stride_col, const int kern_stride_row,
const int kern_stride_stack, const int kern_stride_nkern) const int kern_stride_stack, const int kern_stride_nkern)
{ {
int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows; int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows;
float __shared__ *d_img, *d_kern; float __shared__ *d_img, *d_kern;
...@@ -763,54 +763,54 @@ conv_rows_stack2( float* img, float* kern, float* out, ...@@ -763,54 +763,54 @@ conv_rows_stack2( float* img, float* kern, float* out,
__syncthreads(); __syncthreads();
load_to_shared(d_img,img+_idx2,thread_id,nb_thread_id,img_wid,nb_rows-1, load_to_shared(d_img,img+_idx2,thread_id,nb_thread_id,img_wid,nb_rows-1,
img_stride_col, img_stride_row, false, c_contiguous); img_stride_col, img_stride_row, false, c_contiguous);
if(preload_full_kern) if(preload_full_kern)
load_to_shared(d_kern, kern+kern_stride_nkern*kern_id+kern_stride_stack*stack, load_to_shared(d_kern, kern+kern_stride_nkern*kern_id+kern_stride_stack*stack,
thread_id, nb_thread_id, kern_wid,kern_len, thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, true, c_contiguous); kern_stride_col, kern_stride_row, true, c_contiguous);
__syncthreads(); __syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
__syncthreads(); __syncthreads();
if((blockIdx.x*nb_rows+row+nb_rows-1)<img_len){ if((blockIdx.x*nb_rows+row+nb_rows-1)<img_len){
int _idx1=img_stride_batch*batch_id+img_stride_stack*stack;//selection the good image from the batch and stack int _idx1=img_stride_batch*batch_id+img_stride_stack*stack;//selection the good image from the batch and stack
_idx1+=(blockIdx.x*nb_rows)*img_stride_row;//select the good top row for the block of threads _idx1+=(blockIdx.x*nb_rows)*img_stride_row;//select the good top row for the block of threads
_idx1+=(row+nb_rows-1)*img_stride_row;//the current last row _idx1+=(row+nb_rows-1)*img_stride_row;//the current last row
load_to_shared(d_img+((row+nb_rows-1)%nb_rows)*img_wid, load_to_shared(d_img+((row+nb_rows-1)%nb_rows)*img_wid,
img+_idx1, thread_id, nb_thread_id, img_wid, 1, img+_idx1, thread_id, nb_thread_id, img_wid, 1,
img_stride_col, img_stride_row, false, c_contiguous);//we use d_img as a circular buffer. img_stride_col, img_stride_row, false, c_contiguous);//we use d_img as a circular buffer.
} }
if(!preload_full_kern){ if(!preload_full_kern){
int _idx3=kern_stride_nkern*kern_id+kern_stride_stack*stack;//selection the good kern from the batch and stack int _idx3=kern_stride_nkern*kern_id+kern_stride_stack*stack;//selection the good kern from the batch and stack
_idx3+=(kern_len-row-1)*kern_stride_row;//the current last row flipped _idx3+=(kern_len-row-1)*kern_stride_row;//the current last row flipped
load_to_shared(d_kern, kern+_idx3, load_to_shared(d_kern, kern+_idx3,
thread_id, nb_thread_id, kern_wid,1, thread_id, nb_thread_id, kern_wid,1,
kern_stride_col, kern_stride_row, true, c_contiguous); kern_stride_col, kern_stride_row, true, c_contiguous);
} }
__syncthreads(); __syncthreads();
//if needed as on Fermi as reading out of bound index from shared memory generate an error. //if needed as on Fermi as reading out of bound index from shared memory generate an error.
//Not needed on generation before as they worked anyway. Removing the if generate the good code //Not needed on generation before as they worked anyway. Removing the if generate the good code
//as we store the result of only the good thread. //as we store the result of only the good thread.
//This was with nvcc 3.0 on an GTX470 card. //This was with nvcc 3.0 on an GTX470 card.
if(out_row<out_len){ if(out_row<out_len){
const float* idx_kern; const float* idx_kern;
if(preload_full_kern) idx_kern=&d_kern[row*kern_wid]; if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
else idx_kern=d_kern; else idx_kern=d_kern;
const float* idx_in=&d_img[((shared_row+row)%nb_rows)*img_wid+out_col]; const float* idx_in=&d_img[((shared_row+row)%nb_rows)*img_wid+out_col];
float sum_ =0.0f; float sum_ =0.0f;
convolutionRowNoFlip<KERN_WIDTH>(sum_,idx_in,idx_kern,kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum_,idx_in,idx_kern,kern_wid);
sum+=sum_;//We pass by an intermediate variable to have more precission. sum+=sum_;//We pass by an intermediate variable to have more precission.
} }
} }
} }
__syncthreads(); __syncthreads();
if(out_row<out_len) if(out_row<out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image kern_id*out_wid*out_len+//the output image
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
} }
/** /**
...@@ -854,8 +854,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen, ...@@ -854,8 +854,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
int img_rr = iR_logical + kern_len - 1 - rr; int img_rr = iR_logical + kern_len - 1 - rr;
int reduceIdx = threadIdx.x * blockDim.y + threadIdx.y; int reduceIdx = threadIdx.x * blockDim.y + threadIdx.y;
float sum = 0.0f; float sum = 0.0f;
if(stack_loop){ if(stack_loop){
for (; ss < stacklen; ss+=blockDim.x){ for (; ss < stacklen; ss+=blockDim.x){
float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R; float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C; float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
for (int cc = 0; cc < kern_wid; ++cc) for (int cc = 0; cc < kern_wid; ++cc)
...@@ -864,17 +864,17 @@ conv_valid_row_reduce(int nB, int nK, int stacklen, ...@@ -864,17 +864,17 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
kk_0 += kern_str_C; kk_0 += kern_str_C;
ii_0 -= img_str_C; ii_0 -= img_str_C;
} }
}
}else{
float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
for (int cc = 0; cc < kern_wid; ++cc)
{
sum += kk_0[0] * ii_0[0];
kk_0 += kern_str_C;
ii_0 -= img_str_C;
} }
} }else{
float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
for (int cc = 0; cc < kern_wid; ++cc)
{
sum += kk_0[0] * ii_0[0];
kk_0 += kern_str_C;
ii_0 -= img_str_C;
}
}
if (blockDim.x * blockDim.y == 1) if (blockDim.x * blockDim.y == 1)
{ {
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论