提交 553b256e authored 作者: Olivier Delalleau's avatar Olivier Delalleau

Merge pull request #540 from nouiz/gpuconv

Gpuconv
...@@ -10,7 +10,7 @@ Documentation ...@@ -10,7 +10,7 @@ Documentation
Interface changes Interface changes
* In 0.5, we removed the deprecated sharedvar.value property. * In 0.5, we removed the deprecated sharedvar.value property.
Now we raise an error if you access it. Now we raise an error if you access it. (Frederic B.)
* theano.function does not accept duplicate inputs, so function([x, x], ...) * theano.function does not accept duplicate inputs, so function([x, x], ...)
does not work anymore. (Pascal L.) does not work anymore. (Pascal L.)
* theano.function now raises an error if some of the provided inputs are * theano.function now raises an error if some of the provided inputs are
...@@ -23,15 +23,16 @@ New Features ...@@ -23,15 +23,16 @@ New Features
* debugprint new param ids=["CHAR", "id", "int", ""] * debugprint new param ids=["CHAR", "id", "int", ""]
This makes the identifier printed to be the python id, a unique char, a This makes the identifier printed to be the python id, a unique char, a
unique int, or not have it printed. We changed the default to be "CHAR" unique int, or not have it printed. We changed the default to be "CHAR"
as this is more readable. as this is more readable. (Frederic B.)
* debugprint new param stop_on_name=[False, True]. If True, we don't print * debugprint new param stop_on_name=[False, True]. If True, we don't print
anything below an intermediate variable that has a name. Defaults to False. anything below an intermediate variable that has a name. Defaults to False.
* debugprint does not print anymore the "|" symbol in a column after the last input. (Frederic B.)
* debugprint does not print anymore the "|" symbol in a column after the last input. (Frederic B.)
* If you use Enthought Python Distribution (EPD) now we use its blas * If you use Enthought Python Distribution (EPD) now we use its blas
implementation by default. implementation by default. (Frederic B.)
Sparse Sandbox graduate Sparse Sandbox graduate
* Remove0 op: it remove store element with value 0. * Remove0 op: it remove store element with value 0. (Frederic B.)
Sparse Sandbox Addition (Not reviewed/documented/tested, but used by some people) Sparse Sandbox Addition (Not reviewed/documented/tested, but used by some people)
* They are all in the theano.sparse.sandbox.sp2 module * They are all in the theano.sparse.sandbox.sp2 module
...@@ -50,7 +51,9 @@ Crash Fix ...@@ -50,7 +51,9 @@ Crash Fix
empty string (Frederic B.) empty string (Frederic B.)
* When importing theano on a computer without GPU with the Theano * When importing theano on a computer without GPU with the Theano
flags 'device' or 'init_gpu_device' set to gpu* (Frederic B., reported by Luo Heng) flags 'device' or 'init_gpu_device' set to gpu* (Frederic B., reported by Luo Heng)
* Optimization print useless error when scipy is not available. (Frederic B.)
* Gpu conv crash/slowdown on newer hardware? (James B.)
* Better error handling in gpu conv (Frederic B.)
============= =============
Release Notes Release Notes
......
...@@ -704,7 +704,7 @@ class GpuConv(GpuOp): ...@@ -704,7 +704,7 @@ class GpuConv(GpuOp):
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (0, 17) return (0, 18)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of # REMEMBER TO RAISE c_code_cache_version when changing any of
......
...@@ -9,8 +9,8 @@ PyObject * CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray * ...@@ -9,8 +9,8 @@ PyObject * CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray *
*/ */
int int
CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
CudaNdarray * out, int subsample_rows, int subsample_cols, CudaNdarray * out, int subsample_rows, int subsample_cols,
int version = -1, int verbose=0) int version = -1, int verbose=0)
{ {
int work_complete = 0; int work_complete = 0;
const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file. const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file.
...@@ -32,14 +32,29 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -32,14 +32,29 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (verbose>1) if (verbose>1)
{ {
fprintf(stderr, "INFO: Running conv_valid version=%d, MACRO kern_width=%d with inputs:\n",version,THEANO_KERN_WID); fprintf(stderr,
fprintf(stderr, "INFO: img dim: %i %i %i %i img stride: %i %i %i %i\n", "INFO: Running conv_valid version=%d,"
CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],CudaNdarray_HOST_DIMS(img)[2],CudaNdarray_HOST_DIMS(img)[3], " MACRO kern_width=%d with inputs:\n",
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1],CudaNdarray_HOST_STRIDES(img)[2],CudaNdarray_HOST_STRIDES(img)[3]); version, THEANO_KERN_WID);
fprintf(stderr, "INFO: kern dim: %i %i %i %i kern stride: %i %i %i %i\n", fprintf(stderr,
CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],CudaNdarray_HOST_DIMS(kern)[2],CudaNdarray_HOST_DIMS(kern)[3], "INFO: img dim: %i %i %i %i img stride: %i %i %i %i\n",
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1],CudaNdarray_HOST_STRIDES(kern)[2],CudaNdarray_HOST_STRIDES(kern)[3]); CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],
fprintf(stderr, "INFO: subsample_rows=%d, subsample_cols=%d\n", subsample_rows, subsample_cols); CudaNdarray_HOST_DIMS(img)[2],CudaNdarray_HOST_DIMS(img)[3],
CudaNdarray_HOST_STRIDES(img)[0],
CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[3]);
fprintf(stderr,
"INFO: kern dim: %i %i %i %i kern stride: %i %i %i %i\n",
CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],
CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
CudaNdarray_HOST_STRIDES(kern)[0],
CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[3]);
fprintf(stderr,
"INFO: subsample_rows=%d, subsample_cols=%d\n",
subsample_rows, subsample_cols);
} }
//Check the output size is valid //Check the output size is valid
...@@ -84,8 +99,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -84,8 +99,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
const int out_size_byte = out_size*sizeof(float); const int out_size_byte = out_size*sizeof(float);
if (!((THEANO_KERN_WID == CudaNdarray_HOST_DIMS(kern)[3]) || (THEANO_KERN_WID==0))){ if (!((THEANO_KERN_WID == CudaNdarray_HOST_DIMS(kern)[3]) || (THEANO_KERN_WID==0))){
PyErr_Format(PyExc_ValueError, "ERROR: This GpuConv code was compiled for" PyErr_Format(PyExc_ValueError, "ERROR: This GpuConv code was compiled for"
" %d kernel columns, but the kernel we received had %d columns!", " %d kernel columns, but the kernel we received had %d columns!",
THEANO_KERN_WID, CudaNdarray_HOST_DIMS(kern)[3]); THEANO_KERN_WID, CudaNdarray_HOST_DIMS(kern)[3]);
return -1; return -1;
} }
...@@ -98,9 +113,11 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -98,9 +113,11 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
bool img_contiguous_2d = (img_stride_col == 1) && (img_stride_row==img_wid); bool img_contiguous_2d = (img_stride_col == 1) && (img_stride_row==img_wid);
bool kern_contiguous_2d = (kern_stride_col == 1) && (kern_stride_row==kern_wid); bool kern_contiguous_2d = (kern_stride_col == 1) && (kern_stride_row==kern_wid);
//if the lower 2 dims are c_contiguous but flipped, unflipping the stride and not flipping the kernel in shared memroy //if the lower 2 dims are c_contiguous but flipped, unflipping the
// stride and not flipping the kernel in shared memroy
//allow to use a version that use less registers(so is faster) //allow to use a version that use less registers(so is faster)
//the unflipped version of variable haev the original value when we don't need to unflip it, but have the new value when we unflip it. //the unflipped version of variable have the original value when
//we don't need to unflip it, but have the new value when we unflip it.
bool kern_flipped=true; bool kern_flipped=true;
bool kern_contiguous_2d_unflipped = kern_contiguous_2d; bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
float * kern_data_unflipped = kern->devdata; float * kern_data_unflipped = kern->devdata;
...@@ -115,242 +132,285 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -115,242 +132,285 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
kern_data_unflipped=&(kern->devdata[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]); kern_data_unflipped=&(kern->devdata[(kern_wid-1)*kern_stride_col + (kern_len-1)*kern_stride_row]);
} }
//if we remove the restriction img_size_byte+kern_size_byte>8*1024, we can enter in condition where we will lower the occupency due to shared memory and/or registers. //if we remove the restriction
if ((version == -1) && (out_size<64 || img_size_byte+kern_size_byte>8*1024) && out_size<=256){ //img_size_byte+kern_size_byte>8*1024, we can enter in condition where
//we will lower the occupency due to shared memory and/or registers.
if ((version == -1) &&
(out_size<64 || img_size_byte+kern_size_byte>8*1024) &&
out_size<=256){
//condition for exec //condition for exec
if(!subsample && if(!subsample &&
out_contiguous && out_contiguous &&
out_size<512 &&//Maximum of 512 theads by block out_size<512 &&//Maximum of 512 theads by block
std::max(int(img_size_byte+2*kern_wid*sizeof(float)), out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce! std::max(int(img_size_byte+2*kern_wid*sizeof(float)), out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
!work_complete) !work_complete)
version = 7; //conv_patch_stack_reduce, switch to version 8/13 automatically if needed. version = 7; //conv_patch_stack_reduce, switch to version 8/13 automatically if needed.
} }
if (!subsample && c_contiguous && if (!subsample && c_contiguous &&
(version==0||version==2||version==-1) && (version==0||version==2||version==-1) &&
out_wid<512 &&//Maximum of 512 theads by block out_wid<512 &&//Maximum of 512 theads by block
nstack == 1 &&// don't implement the stack in the kernel. nstack == 1 &&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_patch !work_complete) //conv_patch
{ {
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.) int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if(version==2 && out_len>1)nb_split++;//to force the use of split=true when testing. if(version==2 && out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration. //we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>512) nb_split++; while (ceil_intdiv(out_len,nb_split)*out_wid>512) nb_split++;
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split)); dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
dim3 grid(nbatch, nkern); dim3 grid(nbatch, nkern);
int shared_size=(img_size + kern_size)*sizeof(float); int shared_size=(img_size + kern_size)*sizeof(float);
void (*f)(float*, float*, float*, void (*f)(float*, float*, float*,
int, int, int, int, int, int, int, int,
int, int); int, int);
#define CONV_PATCH_SPECIAL(kern_wid) \ #define CONV_PATCH_SPECIAL(kern_wid) \
if(threads.y==out_len) f=conv_patch<true,kern_wid,false>;\ if(threads.y==out_len) f=conv_patch<true,kern_wid,false>;\
else f=conv_patch<true,kern_wid,true>; else f=conv_patch<true,kern_wid,true>;
CONV_PATCH_SPECIAL(THEANO_KERN_WID); CONV_PATCH_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size>>> f<<< grid, threads, shared_size>>>
(img->devdata, kern->devdata, out->devdata, (img->devdata, kern->devdata, out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack); img_len, img_wid, kern_len, kern_wid, nkern, nstack);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
if (verbose) fprintf(stderr, "INFO: used 'conv_patch' version %s nb_split=%d\n",threads.y==out_len?"no split": "split",nb_split); if (verbose)
fprintf(stderr,
"INFO: used 'conv_patch' version %s nb_split=%d\n",
threads.y==out_len ? "no split": "split", nb_split);
work_complete = true; work_complete = true;
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i, nb_split=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, nb_split); if (verbose)
if (verbose) fprintf(stderr, "INFO: impl 'conv_patch' failed (%s), trying next implementation\n", fprintf(stderr,
cudaGetErrorString(sts)); "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i, nb_split=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y, nb_split);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_patch' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
} }
} }
if (out_contiguous && if (out_contiguous &&
(version==1||version==3||version==11||version==12||version==-1) && (version==1||version==3||version==11||version==12||version==-1) &&
(version!=1 || out_size<512) &&//Maximum of 512 theads by block (version!=1 || out_size<512) &&//Maximum of 512 theads by block
out_wid<512 &&//Maximum of 512 theads by block out_wid<512 &&//Maximum of 512 theads by block
img_size_byte+kern_wid*sizeof(float)<shared_avail && //their is only 16k of shared memory img_size_byte+kern_wid*sizeof(float)<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_patch_stack !work_complete) //conv_patch_stack
{ {
//version 1 is without split and preload the full kernel //version 1 is without split and preload the full kernel
//version 3 is with split and preload the full kernel //version 3 is with split and preload the full kernel
//version 11 is without split and load only 1 kernel row at a time. //version 11 is without split and load only 1 kernel row at a time.
//version 12 is with split and load only 1 kernel row at a time. //version 12 is with split and load only 1 kernel row at a time.
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.) int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if((version==3||version==12) && out_len>1)nb_split++;//to force the use of split=true when testing. if((version==3||version==12) && out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration. //we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>512) nb_split++; while (ceil_intdiv(out_len,nb_split)*out_wid>512) nb_split++;
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split)); dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
bool preload_full_kernel = (img_size_byte + kern_size_byte) <shared_avail; bool preload_full_kernel = (img_size_byte + kern_size_byte) <shared_avail;
if(version==11 || version==12) preload_full_kernel=false; if(version==11 || version==12) preload_full_kernel=false;
dim3 grid(nbatch,nkern); dim3 grid(nbatch,nkern);
int shared_size=(img_size + (preload_full_kernel?kern_size:kern_wid))*sizeof(float); int shared_size=(img_size + (preload_full_kernel?kern_size:kern_wid))*sizeof(float);
void (*f)(float*, float*, float*, void (*f)(float*, float*, float*,
int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int,
int, int); int, int);
#define CONV_PATCH_STACK_SPECIAL(kern_wid) \ #define CONV_PATCH_STACK_SPECIAL(kern_wid) \
if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true,true>;} \ if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true,true>;} \
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true,true>;} \ else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true,true>;} \
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true,true>;}\ else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true,true>;}\
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true,true>;}\ else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true,true>;}\ else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true,true>;}\ else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true,true>;}\ else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true,true>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true,true>;}\ else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true,true>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false,true>;}\ else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false,true>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false,true>;}\ else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false,true>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false,true>;}\ else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false,true>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false,true>;}\ else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false,true>;}\
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false,true>;} \ else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false,true>;} \
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false,true>;} \ else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false,true>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false,true>;} \ else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false,true>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false,true>;} \ else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false,true>;} \
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true,false>;} \ else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true,false>;} \
else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true,false>;} \ else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true,false>;} \
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true,false>;}\ else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true,false>;}\
else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true,false>;}\ else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true,false>;}\ else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true,false>;}\ else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true,false>;}\ else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true,false>;}\
else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true,false>;}\ else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true,false>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false,false>;}\ else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false,false>;}\
else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false,false>;}\ else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false,false>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false,false>;}\ else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false,false>;}\
else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false,false>;}\ else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false,false>;}\
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false,false>;} \ else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false,false>;} \
else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false,false>;} \ else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false,false>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false,false>;} \ else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false,false>;} \
else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false,false>;} else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d && !subsample){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false,false>;}
CONV_PATCH_STACK_SPECIAL(THEANO_KERN_WID); CONV_PATCH_STACK_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size>>> f<<< grid, threads, shared_size>>>
(img->devdata, kern->devdata, out->devdata, (img->devdata, kern->devdata, out->devdata,
img_len, img_wid, kern_len, kern_wid, img_len, img_wid, kern_len, kern_wid,
out_len, out_wid, nkern, nstack, out_len, out_wid, nkern, nstack,
img_stride_col, img_stride_row, img_stride_stack, img_stride_col, img_stride_row, img_stride_stack,
img_stride_batch, kern_stride_col, kern_stride_row, img_stride_batch, kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern, subsample_rows, subsample_cols); kern_stride_stack, kern_stride_nkern, subsample_rows, subsample_cols);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
if (verbose>1) if (verbose>1)
fprintf(stderr, fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i," "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" kern_flipped=true, accumulate=false, kern_width=%i, img_c_contiguous_2d=%i," " shared_size=%i, nb_threads=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i, preload_full_kernel=%i,", " kern_flipped=true, accumulate=false, kern_width=%i,"
" subsample_rows=%i, subsample_cols=%i\n", " img_c_contiguous_2d=%i,"
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, " kern_c_contiguous_2d=%i, nb_split=%i,"
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d, " preload_full_kernel=%i,",
nb_split, preload_full_kernel, subsample_rows, subsample_cols); " subsample_rows=%i, subsample_cols=%i\n",
if (verbose) fprintf(stderr, threads.x, threads.y, grid.x, grid.y,
"INFO: used 'conv_patch_stack' version with nb_split=%i and preload_full_kernel=%i," shared_size, threads.x * threads.y,
" subsample_rows=%i, subsample_cols=%i\n", THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split,preload_full_kernel, subsample_rows, subsample_cols); nb_split, preload_full_kernel,
subsample_rows, subsample_cols);
if (verbose)
fprintf(stderr,
"INFO: used 'conv_patch_stack' version with nb_split=%i"
" and preload_full_kernel=%i,"
" subsample_rows=%i, subsample_cols=%i\n",
nb_split, preload_full_kernel,
subsample_rows, subsample_cols);
work_complete = true; work_complete = true;
} }
else else
{ {
if (verbose) if (verbose)
fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i," fprintf(stderr,
" kern_flipped=true, accumulate=false, kern_width=%i, img_c_contiguous_2d=%i," "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" kern_c_contiguous_2d=%i, nb_split=%i, preload_full_kernel=%i,", " shared_size=%i, nb_threads=%i,"
" subsample_rows=%i, subsample_cols=%i\n", " kern_flipped=true, accumulate=false,"
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y, " kern_width=%i, img_c_contiguous_2d=%i,"
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d, " kern_c_contiguous_2d=%i, nb_split=%i,"
nb_split, preload_full_kernel, subsample_rows, subsample_cols); " preload_full_kernel=%i,"
if (verbose) fprintf(stderr, "INFO: impl 'conv_patch_stack' failed (%s), trying next implementation\n", " subsample_rows=%i, subsample_cols=%i\n",
cudaGetErrorString(sts)); threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y,
THEANO_KERN_WID, img_contiguous_2d, kern_contiguous_2d,
nb_split, preload_full_kernel,
subsample_rows, subsample_cols);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_patch_stack' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
} }
} }
if (!subsample && out_contiguous && if (!subsample && out_contiguous &&
(version==4||version==-1) && (version==4||version==-1) &&
out_wid<512 &&//Maximum of 512 threads by block out_wid<512 &&//Maximum of 512 threads by block
nstack == 1 &&// don't implement the stack in the kernel. nstack == 1 &&// don't implement the stack in the kernel.
kern_len*img_wid*sizeof(float)+kern_size_byte<shared_avail &&//their is only 16k of shared memory kern_len*img_wid*sizeof(float)+kern_size_byte<shared_avail &&//their is only 16k of shared memory
!work_complete) //conv_rows !work_complete) //conv_rows
{ {
dim3 threads(out_wid); dim3 threads(out_wid);
dim3 grid(out_len, nbatch*nkern); dim3 grid(out_len, nbatch*nkern);
int shared_size=(kern_len*img_wid + kern_size)*sizeof(float); int shared_size=(kern_len*img_wid + kern_size)*sizeof(float);
void (*f)(float*, float*, float*, void (*f)(float*, float*, float*,
int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int,
int, int); int, int);
#define CONV_ROWS_SPECIAL(kern_wid) \ #define CONV_ROWS_SPECIAL(kern_wid) \
if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows<kern_wid, false>;\ if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows<kern_wid, false>;\
else f = conv_rows<kern_wid, true>;\ else f = conv_rows<kern_wid, true>;\
CONV_ROWS_SPECIAL(THEANO_KERN_WID); CONV_ROWS_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size >>> f<<< grid, threads, shared_size >>>
(img->devdata, kern->devdata, out->devdata, (img->devdata, kern->devdata, out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack, img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row, img_stride_col, img_stride_row,
img_stride_stack,img_stride_batch, img_stride_stack,img_stride_batch,
kern_stride_col, kern_stride_row, kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern); kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose) fprintf(stderr, "INFO: used 'conv_rows' version\n"); if (verbose)
fprintf(stderr, "INFO: used 'conv_rows' version\n");
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose)
if (verbose) fprintf(stderr, "INFO: impl 'conv_rows' failed (%s), trying next implementation\n", fprintf(stderr,
cudaGetErrorString(sts)); "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_rows' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
} }
} }
if (!subsample && out_contiguous && if (!subsample && out_contiguous &&
(version==5||version==-1) && (version==5||version==-1) &&
out_wid<512 &&//Maximum of 512 theads by block out_wid<512 &&//Maximum of 512 theads by block
img_wid*kern_len*sizeof(float)+kern_size_byte<shared_avail && //their is only 16k of shared memory img_wid*kern_len*sizeof(float)+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_rows_stack !work_complete) //conv_rows_stack
{ {
int nb_row=1; int nb_row=1;
int max_threads=512; int max_threads=512;
//TODO:if not c_contiguous, lower max_thread as we use 22 registers by thread and we won't execute 2 block in one MP. //TODO:if not c_contiguous, lower max_thread as we use 22
for(int i=2;i<=out_len;i++){ //registers by thread and we won't execute 2 block in one MP.
if((i)*out_wid<max_threads && ((kern_len+i)*img_wid + kern_size)*sizeof(float)<shared_avail) for(int i=2;i<=out_len;i++){
nb_row=i; if((i)*out_wid<max_threads && ((kern_len+i)*img_wid + kern_size)*sizeof(float)<shared_avail)
} nb_row=i;
}
dim3 threads(out_wid,nb_row); dim3 threads(out_wid,nb_row);
dim3 grid(ceil_intdiv(out_len,nb_row), nbatch*nkern); dim3 grid(ceil_intdiv(out_len,nb_row), nbatch*nkern);
int shared_size=((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float); int shared_size=((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
void (*f)(float*, float*, float*, void (*f)(float*, float*, float*,
int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int,
int, int); int, int);
if (0) if (0)
fprintf(stderr, "IMG CONTIG %i KERN_CONTIG %i (%i %i %i) (%i %i %i)\n", fprintf(stderr,
img_contiguous_2d, kern_contiguous_2d, "IMG CONTIG %i KERN_CONTIG %i (%i %i %i) (%i %i %i)\n",
threads.x, threads.y, threads.z, img_contiguous_2d, kern_contiguous_2d,
grid.x, grid.y, grid.z); threads.x, threads.y, threads.z,
grid.x, grid.y, grid.z);
if(!img_contiguous_2d || !kern_contiguous_2d) { if(!img_contiguous_2d || !kern_contiguous_2d) {
//fprintf(stderr, "using false version\n"); //fprintf(stderr, "using false version\n");
f = conv_rows_stack<THEANO_KERN_WID, false>; f = conv_rows_stack<THEANO_KERN_WID, false>;
} else { } else {
...@@ -358,106 +418,136 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -358,106 +418,136 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
f = conv_rows_stack<THEANO_KERN_WID, true>; f = conv_rows_stack<THEANO_KERN_WID, true>;
} }
f<<< grid, threads, shared_size >>> f<<< grid, threads, shared_size >>>
(img->devdata, (img->devdata,
kern->devdata, kern->devdata,
out->devdata, out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack, img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row, img_stride_col, img_stride_row,
img_stride_stack,img_stride_batch, img_stride_stack,img_stride_batch,
kern_stride_col, kern_stride_row, kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern); kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose>1) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose>1)
if (verbose) fprintf(stderr, "INFO: used 'conv_rows_stack' version\n"); fprintf(stderr,
"threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
if (verbose)
fprintf(stderr, "INFO: used 'conv_rows_stack' version\n");
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose)
if (verbose) fprintf(stderr, "INFO: impl 'conv_rows_stack' failed (%s), trying next implementation\n", fprintf(stderr,
cudaGetErrorString(sts)); "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_rows_stack' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
} }
} }
if (!subsample && out_contiguous && if (!subsample && out_contiguous &&
(version==9||version==10||version==-1) && (version==9||version==10||version==-1) &&
out_wid<512 &&//Maximum of 512 threads by block out_wid<512 &&//Maximum of 512 threads by block
(img_wid+kern_wid)*sizeof(float)<shared_avail && //their is only 16k of shared memory (img_wid+kern_wid)*sizeof(float)<shared_avail && //their is only 16k of shared memory
(version != 9 || (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail) && //version 9 use more memory (version != 9 || (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail) && //version 9 use more memory
!work_complete) //conv_rows_stack2 !work_complete) //conv_rows_stack2
{ {
// version 9:we preload the full kernel // version 9:we preload the full kernel
// version 10: load only a few row at a time. // version 10: load only a few row at a time.
int nb_row=1; int nb_row=1;
int max_threads=512; int max_threads=512;
int version_back = version; int version_back = version;
//TODO:if not c_contiguous, lower max_thread as we use 22 registers by thread and we won't execute 2 block in one MP. //TODO:if not c_contiguous, lower max_thread as we use 22 registers by thread and we won't execute 2 block in one MP.
if(version==-1 && (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail) if(version==-1 && (img_wid+kern_len*kern_wid)*sizeof(float)<shared_avail)
version = 9; version = 9;
else if(version==-1)version = 10; else if(version==-1)version = 10;
int k_size = kern_size; int k_size = kern_size;
if(version==10) if(version==10)
k_size=kern_wid; k_size=kern_wid;
for(int i=2;i<=out_len;i++){ for(int i=2;i<=out_len;i++){
if(i*out_wid<max_threads && (i*img_wid + k_size)*sizeof(float)<shared_avail) if(i*out_wid<max_threads && (i*img_wid + k_size)*sizeof(float)<shared_avail)
nb_row=i; nb_row=i;
} }
//to test the case when we don't have a thread by output pixel. //to test the case when we don't have a thread by output pixel.
if((version_back!=-1)&& nb_row>1) nb_row--; if((version_back!=-1)&& nb_row>1) nb_row--;
dim3 threads(out_wid,nb_row); dim3 threads(out_wid,nb_row);
dim3 grid(ceil_intdiv(out_len,nb_row), nbatch*nkern); dim3 grid(ceil_intdiv(out_len,nb_row), nbatch*nkern);
int shared_size=(threads.y*img_wid + k_size)*sizeof(float); int shared_size=(threads.y*img_wid + k_size)*sizeof(float);
void (*f)(float*, float*, float*, void (*f)(float*, float*, float*,
int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int,
int, int); int, int);
#define CONV_ROWS_STACK2_SPECIAL(kern_wid) \ #define CONV_ROWS_STACK2_SPECIAL(kern_wid) \
if((!img_contiguous_2d || !kern_contiguous_2d)&&version==9) f = conv_rows_stack2<kern_wid, false,true>;\ if((!img_contiguous_2d || !kern_contiguous_2d)&&version==9) f = conv_rows_stack2<kern_wid, false,true>;\
else if(version==9) f = conv_rows_stack2<kern_wid, true,true>;\ else if(version==9) f = conv_rows_stack2<kern_wid, true,true>;\
else if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_stack2<kern_wid, false, false>;\ else if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_stack2<kern_wid, false, false>;\
else f = conv_rows_stack2<kern_wid, true, false>; else f = conv_rows_stack2<kern_wid, true, false>;
CONV_ROWS_STACK2_SPECIAL(THEANO_KERN_WID); CONV_ROWS_STACK2_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size >>> f<<< grid, threads, shared_size >>>
(img->devdata, (img->devdata,
kern->devdata, kern->devdata,
out->devdata, out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack, img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row, img_stride_col, img_stride_row,
img_stride_stack,img_stride_batch, img_stride_stack,img_stride_batch,
kern_stride_col, kern_stride_row, kern_stride_col, kern_stride_row,
kern_stride_stack, kern_stride_nkern); kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose>1) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", if (verbose>1)
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); fprintf(stderr,
if (verbose) fprintf(stderr, "INFO: used 'conv_rows_stack2' version %s with %d row(s).\n",(version==9?"'load full kernel'":"'load 1 kern row at a time'"),nb_row); "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
if (verbose)
fprintf(stderr,
"INFO: used 'conv_rows_stack2' version %s with"
" %d row(s).\n",
(version==9?"'load full kernel'":
"'load 1 kern row at a time'"),nb_row);
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i version=%d\n", if (verbose)
threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y,(version==9?2:3)); fprintf(stderr,
if (verbose) fprintf(stderr, "INFO: impl 'conv_rows_stack2' failed (%s), trying next implementation\n", "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
cudaGetErrorString(sts)); " shared_size=%i, nb_threads=%i version=%d\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y,(version==9?2:3));
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_rows_stack2' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
} }
} }
...@@ -469,24 +559,24 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -469,24 +559,24 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
//version 8 is needed to test more easily this kernel template parameter. //version 8 is needed to test more easily this kernel template parameter.
//version 13 load only 1 kernel row at a time. //version 13 load only 1 kernel row at a time.
if (!subsample && if (!subsample &&
out_contiguous && out_contiguous &&
out_size<512 &&//Maximum of 512 theads by block out_size<512 &&//Maximum of 512 theads by block
(version==7||version==8||version==13||version==-1) && (version==7||version==8||version==13||version==-1) &&
(version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split. (version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split.
//version 13 need a minimal kernel length as big as the split. //version 13 need a minimal kernel length as big as the split.
(version!=13||kern_len>1) && (version!=13||kern_len>1) &&
!work_complete) //conv_patch_stack_reduce !work_complete) //conv_patch_stack_reduce
{ {
int nb_split=1; int nb_split=1;
int full_kern=true; int full_kern=true;
if(version==8||version==13) nb_split++;//force the split. if(version==8||version==13) nb_split++;//force the split.
if(version==13)full_kern=false; if(version==13)full_kern=false;
//check if we can fit the full kernel in the shared memory //check if we can fit the full kernel in the shared memory
if(sizeof(float)*std::max(img_size + kern_size, out_size*2) > shared_avail){ if(sizeof(float)*std::max(img_size + kern_size, out_size*2) > shared_avail){
full_kern = false; full_kern = false;
} }
//thread_z is going to be ceil_intdiv(kern_len, nb_split) //thread_z is going to be ceil_intdiv(kern_len, nb_split)
// we need enough splits so that // we need enough splits so that
...@@ -496,24 +586,24 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -496,24 +586,24 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
// constraint (a) // constraint (a)
// device 1.3 have a max of 64 thread in z // device 1.3 have a max of 64 thread in z
while(ceil_intdiv(kern_len,nb_split)>64) nb_split++; while(ceil_intdiv(kern_len,nb_split)>64) nb_split++;
// constraint (b) // constraint (b)
// (TODO: read the number of threads per block from the device) // (TODO: read the number of threads per block from the device)
while(out_size*ceil_intdiv(kern_len,nb_split)>512) nb_split++; while(out_size*ceil_intdiv(kern_len,nb_split)>512) nb_split++;
// tentative estimates (prior to contraint c) // tentative estimates (prior to contraint c)
int thread_z=ceil_intdiv(kern_len,nb_split); int thread_z=ceil_intdiv(kern_len,nb_split);
int shared_size = sizeof(float)*(full_kern int shared_size = sizeof(float)*(full_kern
? std::max(img_size + kern_size, out_size*thread_z) ? std::max(img_size + kern_size, out_size*thread_z)
: std::max(img_size + thread_z*kern_wid, out_size*thread_z)); : std::max(img_size + thread_z*kern_wid, out_size*thread_z));
// constraint (c) // constraint (c)
while ((shared_size >= shared_avail) && (nb_split <= kern_len)){ while ((shared_size >= shared_avail) && (nb_split <= kern_len)){
//if we can't fit the kernel in shared memory, we must split it more. //if we can't fit the kernel in shared memory, we must split it more.
nb_split++; nb_split++;
thread_z=ceil_intdiv(kern_len,nb_split); thread_z=ceil_intdiv(kern_len,nb_split);
shared_size = sizeof(float)*(full_kern shared_size = sizeof(float)*(full_kern
? std::max(img_size + kern_size, out_size*thread_z) ? std::max(img_size + kern_size, out_size*thread_z)
: std::max(img_size + thread_z*kern_wid, out_size*thread_z)); : std::max(img_size + thread_z*kern_wid, out_size*thread_z));
} }
...@@ -556,11 +646,18 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -556,11 +646,18 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
else if(!kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, false>; else if(!kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, false>;
CONV_PATCH_STACK_REDUCE_SPECIAL(THEANO_KERN_WID); CONV_PATCH_STACK_REDUCE_SPECIAL(THEANO_KERN_WID);
if (verbose) fprintf(stderr, "INFO: using 'conv_patch_stack_reduce' version kern_flipped=%i ccontig=%i nb_split=%d, preload_full_kern=%d\n", if (verbose)
kern_flipped,ccontig,nb_split,full_kern); fprintf(stderr,
if (verbose>1) fprintf(stderr, "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i\n", "INFO: using 'conv_patch_stack_reduce' version"
threads.x, threads.y, threads.z, grid.x, grid.y, " kern_flipped=%i ccontig=%i nb_split=%d,"
shared_size, threads.x * threads.y * threads.z); " preload_full_kern=%d\n",
kern_flipped, ccontig, nb_split, full_kern);
if (verbose>1)
fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i,"
" grid.y=%i, shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, threads.z, grid.x, grid.y,
shared_size, threads.x * threads.y * threads.z);
f<<< grid, threads, shared_size>>>(img->devdata, kern_data_unflipped, out->devdata, f<<< grid, threads, shared_size>>>(img->devdata, kern_data_unflipped, out->devdata,
img_len, img_wid, kern_len, kern_wid, img_len, img_wid, kern_len, kern_wid,
nkern, nstack, nkern, nstack,
...@@ -575,26 +672,36 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -575,26 +672,36 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z); if (verbose)
if (verbose) fprintf(stderr, "INFO: impl 'conv_patch_stack_reduce' failed (%s), trying next implementation\n", fprintf(stderr,
cudaGetErrorString(sts)); "threads.x=%i, threads.y=%i, threads.z=%i,"
" grid.x=%i, grid.y=%i,shared_size=%i,"
" nb_threads=%i\n",
threads.x, threads.y, threads.z,
grid.x, grid.y, shared_size,
threads.x * threads.y * threads.z);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_patch_stack_reduce' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
} }
} // else no good nb_splits was found } // else no good nb_splits was found
} }
if (1 && (version==6||version==-1) && if (1 && (version==6||version==-1) &&
kern_len<=320 && kern_len<=320 &&
!work_complete) //conv_valid_row_reduce !work_complete) //conv_valid_row_reduce
{ {
int outsize = CudaNdarray_SIZE(out); int outsize = CudaNdarray_SIZE(out);
int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS); int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS);
int block_nstack=nstack; int block_nstack=nstack;
//Max of 512 threads per blocks. //Max of 512 threads per blocks.
//On old hardware, we have a max of 356 threads as we have only //On old hardware, we have a max of 356 threads as we have only
//8k registers and the kernel use 23 register //8k registers and the kernel use 23 register
//TODO: check if we have 8k or 16k of register... //TODO: check if we have 8k or 16k of register...
while(block_nstack*kern_len>320)block_nstack--; while(block_nstack*kern_len>320)block_nstack--;
dim3 n_threads(block_nstack, kern_len, 1); dim3 n_threads(block_nstack, kern_len, 1);
int n_reduce_buf = block_nstack * kern_len * sizeof(float); int n_reduce_buf = block_nstack * kern_len * sizeof(float);
...@@ -615,21 +722,21 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -615,21 +722,21 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
} }
void (*f)(int, int, int, int, void (*f)(int, int, int, int,
int, int, int, int, int, int, int, int, int, int,
float*, int, int, int, int, float*, int, int, int, int,
float*, int, int, int, int, float*, int, int, int, int,
float*, int, int, int, int, float*, int, int, int, int,
int, int, int); int, int, int);
//std::cerr << "initial_reduce_boundary " << initial_reduce_boundary << "\n"; //std::cerr << "initial_reduce_boundary " << initial_reduce_boundary << "\n";
//std::cerr << "kerns " << nstack << " " << kern_len << "\n"; //std::cerr << "kerns " << nstack << " " << kern_len << "\n";
//std::cerr << "n_reduce_buf/sizeof(float) " << n_reduce_buf / sizeof(float) << "\n"; //std::cerr << "n_reduce_buf/sizeof(float) " << n_reduce_buf / sizeof(float) << "\n";
if(block_nstack==nstack) if(block_nstack==nstack)
f=conv_valid_row_reduce<false>; f=conv_valid_row_reduce<false>;
else else
f=conv_valid_row_reduce<true>; f=conv_valid_row_reduce<true>;
f<<<n_blocks, n_threads, n_reduce_buf>>>( f<<<n_blocks, n_threads, n_reduce_buf>>>(
nbatch, nkern, CudaNdarray_HOST_DIMS(img)[1], nbatch, nkern, CudaNdarray_HOST_DIMS(img)[1],
img_len, img_wid, img_len, img_wid,
kern_len, kern_wid, kern_len, kern_wid,
...@@ -651,13 +758,22 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -651,13 +758,22 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose) fprintf(stderr, "INFO: used 'conv_valid_row_reduce' version\n"); if (verbose)
fprintf(stderr, "INFO: used 'conv_valid_row_reduce' version\n");
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, shared_size=%i, nb_threads=%i\n", n_threads.x, n_threads.y, n_blocks, n_reduce_buf, n_threads.x * n_threads.y); if (verbose)
if (verbose) fprintf(stderr, "INFO: impl 'conv_valid_row_reduce' failed (%s), trying next implementation\n", fprintf(stderr,
cudaGetErrorString(sts)); "threads.x=%i, threads.y=%i, grid.x=%i,"
" shared_size=%i, nb_threads=%i\n",
n_threads.x, n_threads.y, n_blocks,
n_reduce_buf, n_threads.x * n_threads.y);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_valid_row_reduce' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
} }
} }
...@@ -665,32 +781,61 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -665,32 +781,61 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
{ {
int outsize = CudaNdarray_SIZE(out); int outsize = CudaNdarray_SIZE(out);
int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS); int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS);
int n_threads = std::min(ceil_intdiv(outsize, n_blocks), NUM_VECTOR_OP_THREADS_PER_BLOCK); int n_threads = std::min(ceil_intdiv(outsize, n_blocks),
NUM_VECTOR_OP_THREADS_PER_BLOCK);
if (1) if (1)
{ {
if (verbose) fprintf(stderr, "INFO: launching conv_reference_valid\n"); if (verbose)
if (verbose>1) fprintf(stderr, " img : %i %i %i %i %p %i %i %i %i\n", fprintf(stderr, "INFO: launching conv_reference_valid\n");
nbatch, CudaNdarray_HOST_DIMS(img)[1], img_len, img_wid, if (verbose>1)
img->devdata, fprintf(stderr, " img : %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[3]); nbatch, CudaNdarray_HOST_DIMS(img)[1], img_len, img_wid,
if (verbose>1) fprintf(stderr, " kern: %i %i %i %i %p %i %i %i %i\n", img->devdata,
nkern, nstack, kern_len, kern_wid, CudaNdarray_HOST_STRIDES(img)[0],
kern->devdata, CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3] CudaNdarray_HOST_STRIDES(img)[2],
); CudaNdarray_HOST_STRIDES(img)[3]);
if (verbose>1) fprintf(stderr, " out : %i %i %i %i %p %i %i %i %i\n", if (verbose>1)
CudaNdarray_HOST_DIMS(out)[0], CudaNdarray_HOST_DIMS(out)[1], out_len, out_wid, fprintf(stderr, " kern: %i %i %i %i %p %i %i %i %i\n",
out->devdata, nkern, nstack, kern_len, kern_wid,
CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3]); kern->devdata,
if (verbose>1) fprintf(stderr, " launch params: %i %i %i\n", outsize, n_blocks, n_threads); CudaNdarray_HOST_STRIDES(kern)[0],
CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[3]);
if (verbose>1)
fprintf(stderr, " out : %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_DIMS(out)[0],
CudaNdarray_HOST_DIMS(out)[1], out_len, out_wid,
out->devdata,
CudaNdarray_HOST_STRIDES(out)[0],
CudaNdarray_HOST_STRIDES(out)[1],
CudaNdarray_HOST_STRIDES(out)[2],
CudaNdarray_HOST_STRIDES(out)[3]);
if (verbose>1)
fprintf(stderr, " launch params: %i %i %i\n",
outsize, n_blocks, n_threads);
} }
conv_reference_valid<<<n_blocks, n_threads>>>( nbatch, nkern, CudaNdarray_HOST_DIMS(img)[1], conv_reference_valid<<<n_blocks, n_threads>>>(nbatch, nkern,
CudaNdarray_HOST_DIMS(img)[1],
img_len, img_wid, img_len, img_wid,
kern_len, kern_wid, kern_len, kern_wid,
out_len, out_wid, out_len, out_wid,
img->devdata, CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[3], img->devdata,
kern->devdata, CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3], CudaNdarray_HOST_STRIDES(img)[0],
out->devdata, CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3], CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[3],
kern->devdata,
CudaNdarray_HOST_STRIDES(kern)[0],
CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[3],
out->devdata,
CudaNdarray_HOST_STRIDES(out)[0],
CudaNdarray_HOST_STRIDES(out)[1],
CudaNdarray_HOST_STRIDES(out)[2],
CudaNdarray_HOST_STRIDES(out)[3],
subsample_rows, subsample_cols); subsample_rows, subsample_cols);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
...@@ -698,26 +843,37 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -698,26 +843,37 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
work_complete = true; work_complete = true;
if (verbose) fprintf(stderr, "INFO: used 'conv_reference_valid' version\n"); if (verbose)
fprintf(stderr, "INFO: used 'conv_reference_valid' version\n");
} }
else else
{ {
PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for CudaNdarray_conv_valid! (%s)", PyErr_Format(PyExc_RuntimeError,
cudaGetErrorString(sts)); "ERROR: all implementations failed for"
" CudaNdarray_conv_valid! (%s)",
cudaGetErrorString(sts));
return -1; return -1;
} }
} }
assert (work_complete); if (!work_complete)
{
PyErr_Format(PyExc_RuntimeError,
"ERROR: no implementation(s) worked for"
" CudaNdarray_conv_valid!"
" Version asked(%d) (-1 mean use an heuristic)",
version);
return -1;
}
return 0; return 0;
//PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kExp", cudaGetErrorString(err));
//return -1;
} }
int int
CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray * out, int subsample_rows, int subsample_cols, int version = -1, int verbose=0) CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern,
CudaNdarray * out, int subsample_rows,
int subsample_cols, int version = -1, int verbose=0)
{ {
const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file. //144 is the biggest static shared size used with compiling this file.
const int shared_avail = SHARED_SIZE - 150;
int work_complete = 0; int work_complete = 0;
if (img->nd != 4) if (img->nd != 4)
...@@ -775,10 +931,13 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -775,10 +931,13 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
//const int out_size_byte = out_size*sizeof(float); // unused //const int out_size_byte = out_size*sizeof(float); // unused
if (!((THEANO_KERN_WID == CudaNdarray_HOST_DIMS(kern)[3]) || (THEANO_KERN_WID==0))){ if (!((THEANO_KERN_WID == CudaNdarray_HOST_DIMS(kern)[3]) ||
PyErr_Format(PyExc_ValueError, "ERROR: This GpuConv code was compiled for" (THEANO_KERN_WID == 0))){
" %d kernel columns, but the kernel we received had %d columns!", PyErr_Format(PyExc_ValueError,
THEANO_KERN_WID, CudaNdarray_HOST_DIMS(kern)[3]); "ERROR: This GpuConv code was compiled for"
" %d kernel columns, but the kernel we received"
" had %d columns!",
THEANO_KERN_WID, CudaNdarray_HOST_DIMS(kern)[3]);
return -1; return -1;
} }
bool subsample = subsample_rows!=1 || subsample_cols!=1; bool subsample = subsample_rows!=1 || subsample_cols!=1;
...@@ -793,9 +952,11 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -793,9 +952,11 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
bool img_batch_stack_contiguous = (img_stride_stack==img_stride_row*img_len) && (img_stride_batch==img_stride_stack*nstack);//don't support stride for nbatch and nstack bool img_batch_stack_contiguous = (img_stride_stack==img_stride_row*img_len) && (img_stride_batch==img_stride_stack*nstack);//don't support stride for nbatch and nstack
//if the lower 2 dims are c_contiguous but flipped, unflipping the stride and not flipping the kernel in shared memroy //if the lower 2 dims are c_contiguous but flipped, unflipping the
//stride and not flipping the kernel in shared memroy
//allow to use a version that use less registers(so is faster) //allow to use a version that use less registers(so is faster)
//the unflipped version of variable have the original value when we don't need to unflip it, but have the new value when we unflip it. //the unflipped version of variable have the original value when
//we don't need to unflip it, but have the new value when we unflip it.
bool kern_flipped=true; bool kern_flipped=true;
bool kern_contiguous_2d_unflipped = kern_contiguous_2d; bool kern_contiguous_2d_unflipped = kern_contiguous_2d;
float * kern_data_unflipped = kern->devdata; float * kern_data_unflipped = kern->devdata;
...@@ -812,127 +973,164 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -812,127 +973,164 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
if (verbose>1) if (verbose>1)
{ {
printf("INFO: Running conv_full version=%d, MACRO kern_width=%d with inputs:\n",version,THEANO_KERN_WID); printf("INFO: Running conv_full version=%d,"
" MACRO kern_width=%d with inputs:\n", version, THEANO_KERN_WID);
printf("INFO: img dim: %i %i %i %i img stride: %i %i %i %i\n", printf("INFO: img dim: %i %i %i %i img stride: %i %i %i %i\n",
CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],CudaNdarray_HOST_DIMS(img)[2],CudaNdarray_HOST_DIMS(img)[3], CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1],
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1],CudaNdarray_HOST_STRIDES(img)[2],CudaNdarray_HOST_STRIDES(img)[3]); CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3],
CudaNdarray_HOST_STRIDES(img)[0],
CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[3]);
printf("INFO: kern dim: %i %i %i %i kern stride: %i %i %i %i\n", printf("INFO: kern dim: %i %i %i %i kern stride: %i %i %i %i\n",
CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],CudaNdarray_HOST_DIMS(kern)[2],CudaNdarray_HOST_DIMS(kern)[3], CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1],CudaNdarray_HOST_STRIDES(kern)[2],CudaNdarray_HOST_STRIDES(kern)[3]); CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
CudaNdarray_HOST_STRIDES(kern)[0],
CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[3]);
} }
if (!subsample && if (!subsample &&
out_contiguous && out_contiguous &&
(version==3||version==4||version==5||version==-1) && (version==3||version==4||version==5||version==-1) &&
out_wid<512 &&//Maximum of 512 threads by block out_wid<512 &&//Maximum of 512 threads by block
(kern_len+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte<shared_avail && //their is only 16k of shared memory (kern_len+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch_stack_padded !work_complete) //conv_full_patch_stack_padded
{ {
//version 3 without split //version 3 without split
//version 4 with split (more registers) //version 4 with split (more registers)
//version 5 with split (more registers) low mem version(some restriction and still more register) //version 5 with split (more registers) low mem version(some restriction and still more register)
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.) int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if((version==4 || version==5) && out_len>1) nb_split++;//to force the use of split=true when testing. if((version==4 || version==5) && out_len>1) nb_split++;//to force the use of split=true when testing.
if(kern_len==1 && version==5){ if(kern_len==1 && version==5){
//version 5 don't support kern_len==1 as 1%0 return -1. //version 5 don't support kern_len==1 as 1%0 return -1.
version=-1; version=-1;
if(verbose)fprintf(stderr, "WARNING:conv full: Asking version 5 with kern_len==1. Combination not supported!\n"); if(verbose)fprintf(stderr, "WARNING:conv full: Asking version 5 with kern_len==1. Combination not supported!\n");
} }
if(img_size_padded_byte+kern_size_byte>shared_avail) version=5; if(img_size_padded_byte+kern_size_byte>shared_avail) version=5;
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration. //we pass by ceil_intdiv in case the out_len is not a multiple
//Max of 16k of shared memory //of nb_split, we want nb_split the number of iteration.
if(version==5) //Max of 16k of shared memory
while ((((kern_len+ceil_intdiv(out_len,nb_split)-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte)>shared_avail) nb_split++; if(version==5)
while ((((kern_len+ceil_intdiv(out_len,nb_split)-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte)>shared_avail) nb_split++;
//327 as we use 25 register
//version 5 will have only 1 block running at a time, so we can use 32 registers per threads, but their is some other stuff that for the limit to bu lower then 512. //327 as we use 25 register
int max_thread = (version!=5?327:450); //version 5 will have only 1 block running at a time, so we
while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++; //can use 32 registers per threads, but their is some other stuff that
if(version==-1 && out_size>512)version=4; //for the limit to bu lower then 512.
if(version==-1)version=3; int max_thread = (version!=5?327:450);
while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++;
if(version==-1 && out_size>512)version=4;
if(version==-1 && nb_split>1) version=4; if(version==-1)version=3;
else if(version==-1) version=3;
else if(version==3 && nb_split!=1) version=4;//we force version 4 when we need more than 1 split as to be always execute.
if(version==-1 && nb_split>1) version=4;
assert(version!=3 || nb_split==1); else if(version==-1) version=3;
assert(version!=5 || kern_len>1); //force version 4 when more than 1 split are needed to always execute.
assert(version!=-1); else if(version==3 && nb_split!=1) version=4;
assert(version!=3 || nb_split==1);
assert(version!=5 || kern_len>1);
assert(version!=-1);
dim3 threads(out_wid, ceil_intdiv(out_len,nb_split)); dim3 threads(out_wid, ceil_intdiv(out_len,nb_split));
dim3 grid(nbatch,nkern); dim3 grid(nbatch,nkern);
int shared_size=img_size_padded_byte + kern_size_byte; int shared_size=img_size_padded_byte + kern_size_byte;
if(version==5) if(version==5)
shared_size=((kern_len+threads.y-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte; shared_size=((kern_len+threads.y-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte;
void (*f)(float*, float*, float*, void (*f)(float*, float*, float*,
int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int,
int, int); int, int);
#define CONV_FULL_PATCH_STACK_PADDED_SPECIAL(kern_wid) \ #define CONV_FULL_PATCH_STACK_PADDED_SPECIAL(kern_wid) \
if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,false,false>;\ if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,false,false>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,true,false>;\ else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,true,false>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,false,true>;\ else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,true,false,true>;\
else if(version==3 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,false,false,false>;\ else if(version==3 && kern_flipped) f=conv_full_patch_stack_padded<true,kern_wid,false,false,false>;\
else if(version==4 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,true,false>;\ else if(version==4 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,true,false>;\
else if(version==5 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,false,true>;\ else if(version==5 && kern_flipped)f=conv_full_patch_stack_padded<true,kern_wid,false,false,true>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3) f=conv_full_patch_stack_padded<false,kern_wid,true,false,false>;\ else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==3) f=conv_full_patch_stack_padded<false,kern_wid,true,false,false>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4) f=conv_full_patch_stack_padded<false,kern_wid,true,true,false>;\ else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==4) f=conv_full_patch_stack_padded<false,kern_wid,true,true,false>;\
else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5) f=conv_full_patch_stack_padded<false,kern_wid,true,false,true>;\ else if(img_contiguous_2d && kern_contiguous_2d_unflipped && version==5) f=conv_full_patch_stack_padded<false,kern_wid,true,false,true>;\
else if(version==3) f=conv_full_patch_stack_padded<false,kern_wid,false,false,false>;\ else if(version==3) f=conv_full_patch_stack_padded<false,kern_wid,false,false,false>;\
else if(version==4) f=conv_full_patch_stack_padded<false,kern_wid,false,true,false>;\ else if(version==4) f=conv_full_patch_stack_padded<false,kern_wid,false,true,false>;\
else if(version==5) f=conv_full_patch_stack_padded<false,kern_wid,false,false,true>;\ else if(version==5) f=conv_full_patch_stack_padded<false,kern_wid,false,false,true>;\
else assert(false); else assert(false);
CONV_FULL_PATCH_STACK_PADDED_SPECIAL(THEANO_KERN_WID); CONV_FULL_PATCH_STACK_PADDED_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size>>> f<<< grid, threads, shared_size>>>
(img->devdata, kern_data_unflipped, out->devdata, (img->devdata, kern_data_unflipped, out->devdata,
img_len, img_wid, kern_len, kern_wid, nkern, nstack, img_len, img_wid, kern_len, kern_wid, nkern, nstack,
img_stride_col, img_stride_row, img_stride_stack, img_stride_col, img_stride_row, img_stride_stack,
img_stride_batch, kern_stride_col_unflipped, kern_stride_row_unflipped, img_stride_batch, kern_stride_col_unflipped, kern_stride_row_unflipped,
kern_stride_stack, kern_stride_nkern); kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
if (verbose>1) fprintf(stderr, "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i, out_len=%i, nb_split=%i, version=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z, out_len, nb_split, version); if (verbose>1)
if (verbose) fprintf(stderr, "INFO: used 'conv_full_patch_stack_padded' nb_split=%d low_mem=%s\n",nb_split,(version==5?"true":"false")); fprintf(stderr,
"threads.x=%i, threads.y=%i, threads.z=%i,"
" grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i,"
" out_len=%i, nb_split=%i, version=%i\n",
threads.x, threads.y, threads.z,
grid.x, grid.y, shared_size,
threads.x * threads.y * threads.z,
out_len, nb_split, version);
if (verbose)
fprintf(stderr,
"INFO: used 'conv_full_patch_stack_padded'"
" nb_split=%d low_mem=%s\n",
nb_split, (version==5?"true":"false"));
work_complete = true; work_complete = true;
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i, out_len=%i, nb_split=%i, version=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z, out_len, nb_split, version); if (verbose)
if (verbose) fprintf(stderr, "INFO: impl 'conv_full_patch_stack_padded' %s %s failed (%s), trying next implementation\n", fprintf(stderr,
version==3?"no split": "split",(version==5?"low_mem":"not_low_mem"), "threads.x=%i, threads.y=%i, threads.z=%i,"
cudaGetErrorString(sts)); " grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i,"
" out_len=%i, nb_split=%i, version=%i\n",
threads.x, threads.y, threads.z,
grid.x, grid.y, shared_size,
threads.x * threads.y * threads.z,
out_len, nb_split, version);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_full_patch_stack_padded' %s %s"
" failed (%s), trying next implementation\n",
version==3?"no split": "split",
(version==5?"low_mem":"not_low_mem"),
cudaGetErrorString(sts));
} }
} }
if (!subsample && c_contiguous && if (!subsample && c_contiguous &&
(version==0||version==-1) && (version==0||version==-1) &&
out_size<512 &&//Maximum of 512 theads by block out_size<512 &&//Maximum of 512 theads by block
nstack == 1 &&// don't implement the stack in the kernel. nstack == 1 &&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch !work_complete) //conv_full_patch
{ {
dim3 threads(out_wid, out_len); dim3 threads(out_wid, out_len);
dim3 grid(nbatch,nkern); dim3 grid(nbatch,nkern);
int shared_size=(img_size + kern_size)*sizeof(float); int shared_size=(img_size + kern_size)*sizeof(float);
//TODO assert c_continious for img, kern and out in the 2 inner dimensions. //TODO assert c_continious for img, kern and out in the 2 inner dimensions.
conv_full_patch<<< grid, threads, shared_size>>> conv_full_patch<<< grid, threads, shared_size>>>
(img->devdata, (img->devdata,
kern->devdata, kern->devdata,
out->devdata, out->devdata,
img_len, img_wid, img_len, img_wid,
kern_len, kern_wid, kern_len, kern_wid,
nkern, nstack); nkern, nstack);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
...@@ -943,37 +1141,45 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -943,37 +1141,45 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose)
if (verbose) fprintf(stderr, "INFO: impl 'conv_full_patch' failed (%s), trying next implementation\n", fprintf(stderr,
cudaGetErrorString(sts)); "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size,
threads.x * threads.y);
if (verbose)
fprintf(stderr,
"INFO: impl 'conv_full_patch' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
} }
} }
if (false && !subsample && //disabled as test fail for this kernel if (false && !subsample && //disabled as test fail for this kernel
(version==1||version==-1) && (version==1||version==-1) &&
out_size<512 &&//Maximum of 512 theads by block out_size<512 &&//Maximum of 512 theads by block
(nbatch > 20 || version==1) && // we only launch nbatch blocks, so make sure there is enough to be worth it, but if we specify the version, this check should not be done to allow testing. (nbatch > 20 || version==1) && // we only launch nbatch blocks, so make sure there is enough to be worth it, but if we specify the version, this check should not be done to allow testing.
nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_full_load_everything !work_complete) //conv_full_load_everything
{ {
dim3 threads(out_wid, out_len); dim3 threads(out_wid, out_len);
dim3 grid(nbatch); dim3 grid(nbatch);
int shared_size=(img_size + kern_size)*nstack*sizeof(float); int shared_size=(img_size + kern_size)*nstack*sizeof(float);
//TODO assert c_continious for img, kern and out in the 2 inner dimensions. //TODO assert c_continious for img, kern and out in the 2 inner dimensions.
//typeof(conv_full_load_everything<0>) f = ; //typeof(conv_full_load_everything<0>) f = ;
void (*f)(float*, float*, float*, void (*f)(float*, float*, float*,
int, int, int, int, int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int) = conv_full_load_everything<0>; int, int, int, int, int, int, int, int) = conv_full_load_everything<0>;
f = conv_full_load_everything<THEANO_KERN_WID>; f = conv_full_load_everything<THEANO_KERN_WID>;
f<<< grid, threads, shared_size>>> f<<< grid, threads, shared_size>>>
(img->devdata, (img->devdata,
kern->devdata, kern->devdata,
out->devdata, out->devdata,
img_len, img_wid, img_len, img_wid,
kern_len, kern_wid, kern_len, kern_wid,
nkern, nstack, nkern, nstack,
CudaNdarray_HOST_STRIDES(img)[3], CudaNdarray_HOST_STRIDES(img)[3],
CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[1],
...@@ -993,28 +1199,35 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -993,28 +1199,35 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose)
if (verbose) fprintf(stderr, "INFO: impl 'conv_full_load_everything' failed (%s), trying next implementation\n", fprintf(stderr,
cudaGetErrorString(sts)); "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
} " shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y, shared_size,
threads.x * threads.y);
if (verbose)
fprintf(stderr, "INFO: impl 'conv_full_load_everything'"
" failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
} }
if (!subsample && if (!subsample &&
img_batch_stack_contiguous && img_batch_stack_contiguous &&
out_contiguous && out_contiguous &&
(version==2||version==-1) && (version==2||version==-1) &&
out_size<512 &&//Maximum of 512 theads by block out_size<512 &&//Maximum of 512 theads by block
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_full_patch_stack !work_complete) //conv_full_patch_stack
{ {
dim3 threads(out_wid, out_len); dim3 threads(out_wid, out_len);
dim3 grid(nbatch,nkern); dim3 grid(nbatch,nkern);
int shared_size=(img_size + kern_size)*sizeof(float); int shared_size=(img_size + kern_size)*sizeof(float);
void (*f)(float*, float*, float*, void (*f)(float*, float*, float*,
int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int,
int, int, int, int); int, int, int, int);
if(img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack<true,true>;\ if(img_contiguous_2d && kern_contiguous_2d) f=conv_full_patch_stack<true,true>;\
else if(img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack<true,false>;\ else if(img_contiguous_2d && !kern_contiguous_2d) f=conv_full_patch_stack<true,false>;\
...@@ -1034,14 +1247,21 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -1034,14 +1247,21 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
if (verbose) fprintf(stderr, "INFO: used 'conv_full_patch_stack' version\n"); if (verbose)
fprintf(stderr, "INFO: used 'conv_full_patch_stack' version\n");
work_complete = true; work_complete = true;
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", threads.x, threads.y, grid.x, grid.y, shared_size, threads.x * threads.y); if (verbose)
if (verbose) fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n", fprintf(stderr,
cudaGetErrorString(sts)); "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
" shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, grid.x, grid.y,
shared_size, threads.x * threads.y);
if (verbose)
fprintf(stderr, "INFO: impl 'conv_full_patch_stack' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
} }
} }
if (1 && !work_complete) //conv_reference_full if (1 && !work_complete) //conv_reference_full
...@@ -1050,52 +1270,100 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -1050,52 +1270,100 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
int outsize = CudaNdarray_SIZE(out); int outsize = CudaNdarray_SIZE(out);
int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS); int n_blocks = std::min(outsize, NUM_VECTOR_OP_BLOCKS);
int n_threads = std::min(ceil_intdiv(outsize, n_blocks), NUM_VECTOR_OP_THREADS_PER_BLOCK); int n_threads = std::min(ceil_intdiv(outsize, n_blocks),
NUM_VECTOR_OP_THREADS_PER_BLOCK);
if (0) if (0)
{ {
if (verbose) fprintf(stderr, "INFO: launching conv_reference_valid\n"); if (verbose)
if (verbose) fprintf(stderr, " img : %i %i %i %i %p %i %i %i %i\n", fprintf(stderr, "INFO: launching conv_reference_valid\n");
CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(img)[1], CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3], if (verbose)
img->devdata, fprintf(stderr, " img : %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[3]); CudaNdarray_HOST_DIMS(img)[0],
if (verbose) fprintf(stderr, " kern: %i %i %i %i %p %i %i %i %i\n", CudaNdarray_HOST_DIMS(img)[1],
CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(kern)[1], CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3], CudaNdarray_HOST_DIMS(img)[2],
kern->devdata, CudaNdarray_HOST_DIMS(img)[3],
CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3] img->devdata,
CudaNdarray_HOST_STRIDES(img)[0],
CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[3]);
if (verbose)
fprintf(stderr, " kern: %i %i %i %i %p %i %i %i %i\n",
CudaNdarray_HOST_DIMS(kern)[0],
CudaNdarray_HOST_DIMS(kern)[1],
CudaNdarray_HOST_DIMS(kern)[2],
CudaNdarray_HOST_DIMS(kern)[3],
kern->devdata,
CudaNdarray_HOST_STRIDES(kern)[0],
CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[3]
); );
if (verbose) fprintf(stderr, " out : %i %i %i %i %p %i %i %i %i\n", if (verbose)
CudaNdarray_HOST_DIMS(out)[0], CudaNdarray_HOST_DIMS(out)[1], CudaNdarray_HOST_DIMS(out)[2], CudaNdarray_HOST_DIMS(out)[3], fprintf(stderr, " out : %i %i %i %i %p %i %i %i %i\n",
out->devdata, CudaNdarray_HOST_DIMS(out)[0],
CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3]); CudaNdarray_HOST_DIMS(out)[1],
if (verbose) fprintf(stderr, " launch params: %i %i %i\n", outsize, n_blocks, n_threads); CudaNdarray_HOST_DIMS(out)[2],
if (verbose) fprintf(stderr, " subsample params: %i %i\n", subsample_rows, subsample_cols); CudaNdarray_HOST_DIMS(out)[3],
out->devdata,
CudaNdarray_HOST_STRIDES(out)[0],
CudaNdarray_HOST_STRIDES(out)[1],
CudaNdarray_HOST_STRIDES(out)[2],
CudaNdarray_HOST_STRIDES(out)[3]);
if (verbose)
fprintf(stderr, " launch params: %i %i %i\n",
outsize, n_blocks, n_threads);
if (verbose)
fprintf(stderr, " subsample params: %i %i\n",
subsample_rows, subsample_cols);
} }
conv_reference_full<<<n_blocks, n_threads>>>(CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(kern)[0], CudaNdarray_HOST_DIMS(img)[1], conv_reference_full<<<n_blocks, n_threads>>>(
CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(kern)[0],
CudaNdarray_HOST_DIMS(img)[1],
CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3], CudaNdarray_HOST_DIMS(img)[2], CudaNdarray_HOST_DIMS(img)[3],
CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3], CudaNdarray_HOST_DIMS(kern)[2], CudaNdarray_HOST_DIMS(kern)[3],
CudaNdarray_HOST_DIMS(out)[2], CudaNdarray_HOST_DIMS(out)[3], CudaNdarray_HOST_DIMS(out)[2], CudaNdarray_HOST_DIMS(out)[3],
img->devdata, CudaNdarray_HOST_STRIDES(img)[0], CudaNdarray_HOST_STRIDES(img)[1], CudaNdarray_HOST_STRIDES(img)[2], CudaNdarray_HOST_STRIDES(img)[3], img->devdata, CudaNdarray_HOST_STRIDES(img)[0],
kern->devdata, CudaNdarray_HOST_STRIDES(kern)[0], CudaNdarray_HOST_STRIDES(kern)[1], CudaNdarray_HOST_STRIDES(kern)[2], CudaNdarray_HOST_STRIDES(kern)[3], CudaNdarray_HOST_STRIDES(img)[1],
out->devdata, CudaNdarray_HOST_STRIDES(out)[0], CudaNdarray_HOST_STRIDES(out)[1], CudaNdarray_HOST_STRIDES(out)[2], CudaNdarray_HOST_STRIDES(out)[3], CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[3],
kern->devdata, CudaNdarray_HOST_STRIDES(kern)[0],
CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[3],
out->devdata, CudaNdarray_HOST_STRIDES(out)[0],
CudaNdarray_HOST_STRIDES(out)[1],
CudaNdarray_HOST_STRIDES(out)[2],
CudaNdarray_HOST_STRIDES(out)[3],
subsample_rows, subsample_cols); subsample_rows, subsample_cols);
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError(); cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts) if (cudaSuccess == sts)
{ {
if (verbose) fprintf(stderr, "INFO: used 'conv_reference_full' version ishp(%d, %d) kshp(%d, %d) oshp(%d, %d) nbatch=%d nkern=%d nstack=%d subsample=%d\n", if (verbose)
img_len,img_wid, kern_len, kern_wid, fprintf(stderr, "INFO: used 'conv_reference_full' version"
out_len, out_wid, nbatch, nkern, nstack, subsample); " ishp(%d, %d) kshp(%d, %d) oshp(%d, %d) nbatch=%d"
" nkern=%d nstack=%d subsample=%d\n",
img_len,img_wid, kern_len, kern_wid,
out_len, out_wid, nbatch, nkern, nstack, subsample);
work_complete = true; work_complete = true;
} }
else else
{ {
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i\n", n_threads, 1, n_blocks, 1, 0, n_threads); if (verbose)
if (verbose) fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s), trying next implementation\n", fprintf(stderr, "threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i,"
cudaGetErrorString(sts)); " shared_size=%i, nb_threads=%i\n",
PyErr_Format(PyExc_RuntimeError, "ERROR: all implementations failed for CudaNdarray_conv_full! (%s)", n_threads, 1, n_blocks, 1, 0, n_threads);
cudaGetErrorString(sts)); if (verbose)
return -1; fprintf(stderr, "INFO: impl 'conv_reference_full' failed (%s),"
" trying next implementation\n",
cudaGetErrorString(sts));
PyErr_Format(PyExc_RuntimeError,
"ERROR: all implementations failed for"
" CudaNdarray_conv_full! (%s)",
cudaGetErrorString(sts));
return -1;
} }
} }
return 0; return 0;
...@@ -1103,15 +1371,23 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar ...@@ -1103,15 +1371,23 @@ CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdar
PyObject * PyObject *
CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
CudaNdarray * out, const int mode, CudaNdarray * out, const int mode,
const int subsample_rows, const int subsample_cols, const int subsample_rows, const int subsample_cols,
const int version, const int verbose) const int version, const int verbose)
{ {
// Re-use the out object if possible. If the out object it not used, then its refcount is not modified. // Re-use the out object if possible. If the out object it not used, then its refcount is not modified.
// If the out object is re-used then it is returned, and its refcount is incremented by 1. // If the out object is re-used then it is returned, and its refcount is incremented by 1.
// //
if (img->nd != 4) { PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required"); return NULL;} if (img->nd != 4)
if (kern->nd != 4) { PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required"); return NULL;} {
PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required");
return NULL;
}
if (kern->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required");
return NULL;
}
int out_dim[4]; int out_dim[4];
out_dim[0] = CudaNdarray_HOST_DIMS(img)[0]; out_dim[0] = CudaNdarray_HOST_DIMS(img)[0];
...@@ -1135,17 +1411,20 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, ...@@ -1135,17 +1411,20 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
if ( out if ( out
&& out->nd==4 && out->nd==4
&& CudaNdarray_is_c_contiguous(out) && CudaNdarray_is_c_contiguous(out)
&& CudaNdarray_HOST_DIMS(out)[0]==out_dim[0] && CudaNdarray_HOST_DIMS(out)[0]==out_dim[0]
&& CudaNdarray_HOST_DIMS(out)[1]==out_dim[1] && CudaNdarray_HOST_DIMS(out)[1]==out_dim[1]
&& CudaNdarray_HOST_DIMS(out)[2]==out_dim[2] && CudaNdarray_HOST_DIMS(out)[2]==out_dim[2]
&& CudaNdarray_HOST_DIMS(out)[3]==out_dim[3]) && CudaNdarray_HOST_DIMS(out)[3]==out_dim[3])
{ {
rval = out; rval = out;
Py_INCREF(rval); Py_INCREF(rval);
} }
else else
{ {
if (out && verbose) fprintf(stderr, "INFO: Conv is ignoring 'out' argument with wrong structure.\n"); if (out && verbose)
fprintf(stderr,
"INFO: Conv is ignoring 'out' argument with wrong"
" structure.\n");
rval = (CudaNdarray*)CudaNdarray_NewDims(4,out_dim); rval = (CudaNdarray*)CudaNdarray_NewDims(4,out_dim);
//rval might be null //rval might be null
} }
...@@ -1162,3 +1441,13 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, ...@@ -1162,3 +1441,13 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
return (PyObject*)rval; return (PyObject*)rval;
} }
/*
Local Variables:
mode:c++
c-basic-offset:4
c-file-style:"stroustrup"
indent-tabs-mode:nil
fill-column:79
End:
*/
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
...@@ -37,20 +37,20 @@ conv_full_patch_split( float* img, float* kern, float* out, int img_len, int img ...@@ -37,20 +37,20 @@ conv_full_patch_split( float* img, float* kern, float* out, int img_len, int img
int img_row = out_row; int img_row = out_row;
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
int inverse_row = (img_row-row); int inverse_row = (img_row-row);
if(inverse_row<0 ||inverse_row>=(img_len))continue;//row outside the image if(inverse_row<0 ||inverse_row>=(img_len))continue;//row outside the image
const float* idx_in=&d_img[inverse_row*img_wid]; const float* idx_in=&d_img[inverse_row*img_wid];
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
int img_col = out_col; int img_col = out_col;
int col=0,last=0; int col=0,last=0;
for (col=0,last=img_col; col < kern_wid; col++,last--) {//loop over col for (col=0,last=img_col; col < kern_wid; col++,last--) {//loop over col
if(last<0 ||last>=(img_wid))continue;//col outside the image if(last<0 ||last>=(img_wid))continue;//col outside the image
sum+=idx_in[last]*idx_kern[col]; sum+=idx_in[last]*idx_kern[col];
} }
} }
out[batch_id*out_len*out_wid+//the output image out[batch_id*out_len*out_wid+//the output image
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
} }
} }
...@@ -61,8 +61,8 @@ conv_full_patch_split( float* img, float* kern, float* out, int img_len, int img ...@@ -61,8 +61,8 @@ conv_full_patch_split( float* img, float* kern, float* out, int img_len, int img
//dynamic shared memory: img_len*img_wid+kern_len*kern_wid //dynamic shared memory: img_len*img_wid+kern_len*kern_wid
__global__ void __global__ void
conv_full_patch( float* img, float* kern, float* out, conv_full_patch( float* img, float* kern, float* out,
int img_len, int img_wid, int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack) int kern_len, int kern_wid, int nkern, int nstack)
{ {
int __shared__ out_len, out_wid, nb_thread_id; int __shared__ out_len, out_wid, nb_thread_id;
out_len = img_len + kern_len - 1; out_len = img_len + kern_len - 1;
...@@ -105,12 +105,12 @@ conv_full_patch( float* img, float* kern, float* out, ...@@ -105,12 +105,12 @@ conv_full_patch( float* img, float* kern, float* out,
if(img_col<0){col=-img_col;img_col+=col;} if(img_col<0){col=-img_col;img_col+=col;}
for (; col < max_col; col++, img_col++) {//loop over col for (; col < max_col; col++, img_col++) {//loop over col
sum+=idx_in[col]*idx_kern[col]; sum+=idx_in[col]*idx_kern[col];
} }
} }
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*blockIdx.y+//the output image out_wid*out_len*blockIdx.y+//the output image
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
} }
//we store the full image and the full kernel in the shared memory //we store the full image and the full kernel in the shared memory
...@@ -123,11 +123,11 @@ conv_full_patch( float* img, float* kern, float* out, ...@@ -123,11 +123,11 @@ conv_full_patch( float* img, float* kern, float* out,
template<bool img_c_contiguous_2d, bool kern_c_contiguous_2d> template<bool img_c_contiguous_2d, bool kern_c_contiguous_2d>
__global__ void __global__ void
conv_full_patch_stack( float* img, float* kern, float* out, conv_full_patch_stack( float* img, float* kern, float* out,
int img_len, int img_wid, int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack, int kern_len, int kern_wid, int nkern, int nstack,
int img_stride_col, int img_stride_row, int img_stride_col, int img_stride_row,
int kern_stride_col, int kern_stride_row, int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_nkern) int kern_stride_stack, int kern_stride_nkern)
{ {
int __shared__ out_len, out_wid, nb_thread_id; int __shared__ out_len, out_wid, nb_thread_id;
out_len = img_len + kern_len - 1; out_len = img_len + kern_len - 1;
...@@ -159,26 +159,26 @@ conv_full_patch_stack( float* img, float* kern, float* out, ...@@ -159,26 +159,26 @@ conv_full_patch_stack( float* img, float* kern, float* out,
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
if(row+out_row-kern_len+1<0 || row+out_row-kern_len+1>=img_len)continue; if(row+out_row-kern_len+1<0 || row+out_row-kern_len+1>=img_len)continue;
const float* idx_in=&d_img[(row+out_row-kern_len+1)*img_wid+out_col-kern_wid+1]; const float* idx_in=&d_img[(row+out_row-kern_len+1)*img_wid+out_col-kern_wid+1];
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
int col=0; int col=0;
int max_col=kern_wid; int max_col=kern_wid;
int img_col=out_col-kern_wid+1; int img_col=out_col-kern_wid+1;
max_col=min(max_col,img_wid-img_col); max_col=min(max_col,img_wid-img_col);
if(img_col<0){col=-img_col;img_col+=col;} if(img_col<0){col=-img_col;img_col+=col;}
for (; col < max_col; col++, img_col++) {//loop over col for (; col < max_col; col++, img_col++) {//loop over col
sum+=idx_in[col]*idx_kern[col]; sum+=idx_in[col]*idx_kern[col];
} }
} }
//Needed as not all thread finish at the same time the loop //Needed as not all thread finish at the same time the loop
//And we don't want to overwrite the shared memory. //And we don't want to overwrite the shared memory.
__syncthreads(); __syncthreads();
} }
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*blockIdx.y+//the output image out_wid*out_len*blockIdx.y+//the output image
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
} }
/** /**
...@@ -202,13 +202,13 @@ conv_full_patch_stack( float* img, float* kern, float* out, ...@@ -202,13 +202,13 @@ conv_full_patch_stack( float* img, float* kern, float* out,
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool low_mem > template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool low_mem >
__global__ void __global__ void
conv_full_patch_stack_padded( float* img, float* kern, float* out, conv_full_patch_stack_padded( float* img, float* kern, float* out,
const int img_len, const int img_wid, const int img_len, const int img_wid,
const int kern_len, const int kern_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack, const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row, const int img_stride_col, const int img_stride_row,
const int img_stride_stack, const int img_stride_batch, const int img_stride_stack, const int img_stride_batch,
const int kern_stride_col, const int kern_stride_row, const int kern_stride_col, const int kern_stride_row,
const int kern_stride_stack, const int kern_stride_nkern) const int kern_stride_stack, const int kern_stride_nkern)
{ {
int __shared__ out_len, out_wid, nb_thread_id; int __shared__ out_len, out_wid, nb_thread_id;
out_len = img_len + kern_len - 1; out_len = img_len + kern_len - 1;
...@@ -242,26 +242,26 @@ conv_full_patch_stack_padded( float* img, float* kern, float* out, ...@@ -242,26 +242,26 @@ conv_full_patch_stack_padded( float* img, float* kern, float* out,
const int out_row = ty;//output row const int out_row = ty;//output row
float sum = 0.0f; float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++,kern+=kern_stride_stack, for (int stack = 0;stack<nstack;stack++,kern+=kern_stride_stack,
img+=img_stride_stack){ img+=img_stride_stack){
__syncthreads(); __syncthreads();
load_padded_col_to_shared(d_img+img_wid_valid*(kern_len-1),img, load_padded_col_to_shared(d_img+img_wid_valid*(kern_len-1),img,
thread_id,nb_thread_id,img_wid,img_len, thread_id,nb_thread_id,img_wid,img_len,
img_stride_col, img_stride_row, kern_wid-1, img_stride_col, img_stride_row, kern_wid-1,
c_contiguous); c_contiguous);
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid,kern_len, load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous); kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads(); __syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col]; const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
} }
} }
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image kern_id*out_wid*out_len+//the output image
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
}else if(split && !low_mem){ }else if(split && !low_mem){
fill(d_img,img_wid_valid*(img_len+2*kern_len-2), 0, thread_id, nb_thread_id); fill(d_img,img_wid_valid*(img_len+2*kern_len-2), 0, thread_id, nb_thread_id);
//out_len_max must by higher then out_len as we need all thread when we load the image as the nb_rows is not always a multiple of out_len. //out_len_max must by higher then out_len as we need all thread when we load the image as the nb_rows is not always a multiple of out_len.
...@@ -269,35 +269,35 @@ conv_full_patch_stack_padded( float* img, float* kern, float* out, ...@@ -269,35 +269,35 @@ conv_full_patch_stack_padded( float* img, float* kern, float* out,
//TODO pass a parameter nb_split //TODO pass a parameter nb_split
out_len_max = (out_len/blockDim.y+(out_len%blockDim.y==0?0:1))*blockDim.y; out_len_max = (out_len/blockDim.y+(out_len%blockDim.y==0?0:1))*blockDim.y;
for(int out_row = ty;out_row<out_len_max;out_row+=nb_rows){ for(int out_row = ty;out_row<out_len_max;out_row+=nb_rows){
float sum = 0.0f; float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++){ for (int stack = 0;stack<nstack;stack++){
__syncthreads(); __syncthreads();
//TODO: load only the part of the image needed or put the partial result in shared memory //TODO: load only the part of the image needed or put the partial result in shared memory
load_padded_col_to_shared(d_img+img_wid_valid*(kern_len-1), load_padded_col_to_shared(d_img+img_wid_valid*(kern_len-1),
img+img_stride_stack*stack, img+img_stride_stack*stack,
thread_id,nb_thread_id,img_wid,img_len, thread_id,nb_thread_id,img_wid,img_len,
img_stride_col, img_stride_row, kern_wid-1, img_stride_col, img_stride_row, kern_wid-1,
c_contiguous); c_contiguous);
load_to_shared(d_kern, kern+kern_stride_stack*stack, load_to_shared(d_kern, kern+kern_stride_stack*stack,
thread_id, nb_thread_id, kern_wid,kern_len, thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous); kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads(); __syncthreads();
//The if is needed as on Fermi as reading out of bound index from shared memory generate an error. //The if is needed as on Fermi as reading out of bound index from shared memory generate an error.
//Not needed on generation before as they worked anyway. Removing the if generate the good code //Not needed on generation before as they worked anyway. Removing the if generate the good code
//as we store the result of only the good thread. //as we store the result of only the good thread.
//This was with nvcc 3.0 on an GTX470 card. //This was with nvcc 3.0 on an GTX470 card.
if(out_row<out_len) if(out_row<out_len)
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col]; const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
} }
if(out_row<out_len) if(out_row<out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*kern_id+//the output image out_wid*out_len*kern_id+//the output image
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
} }
} }
}else{//low_mem version }else{//low_mem version
//don't need to fill the last rows padding as this is done later. //don't need to fill the last rows padding as this is done later.
...@@ -306,46 +306,46 @@ conv_full_patch_stack_padded( float* img, float* kern, float* out, ...@@ -306,46 +306,46 @@ conv_full_patch_stack_padded( float* img, float* kern, float* out,
__shared__ int out_len_max; __shared__ int out_len_max;
//TODO pass a parameter nb_split //TODO pass a parameter nb_split
if(thread_id==0) if(thread_id==0)
out_len_max = (out_len/nb_rows+(out_len%nb_rows==0?0:1))*nb_rows; out_len_max = (out_len/nb_rows+(out_len%nb_rows==0?0:1))*nb_rows;
__syncthreads(); __syncthreads();
for(int out_row = ty, out_row_iter=0;out_row<out_len_max; for(int out_row = ty, out_row_iter=0;out_row<out_len_max;
out_row+=nb_rows, out_row_iter++){ out_row+=nb_rows, out_row_iter++){
float sum = 0.0f; float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++){ for (int stack = 0;stack<nstack;stack++){
__syncthreads(); __syncthreads();
const int len_to_load=min(kern_len+nb_rows,img_len-out_row_iter*nb_rows);//nb rows to load, min(nb_rows for this iter, nb rows left in the image) const int len_to_load=min(kern_len+nb_rows,img_len-out_row_iter*nb_rows);//nb rows to load, min(nb_rows for this iter, nb rows left in the image)
const int empty_row = max(kern_len-1-out_row_iter*nb_rows,0);//number of empty row at the start const int empty_row = max(kern_len-1-out_row_iter*nb_rows,0);//number of empty row at the start
//we need to reload some row as when we change of out_row we lost the last load du to the stack. //we need to reload some row as when we change of out_row we lost the last load du to the stack.
const int previous_row = min(out_row_iter*nb_rows,kern_len-1);//number of row from last out_row iteration to reload const int previous_row = min(out_row_iter*nb_rows,kern_len-1);//number of row from last out_row iteration to reload
load_padded_col_to_shared(d_img+(kern_len-1-previous_row)*img_wid_valid, load_padded_col_to_shared(d_img+(kern_len-1-previous_row)*img_wid_valid,
img+img_stride_stack*stack//the good stack image img+img_stride_stack*stack//the good stack image
+(out_row_iter*nb_rows-previous_row)*img_stride_row,//the good split top row. +(out_row_iter*nb_rows-previous_row)*img_stride_row,//the good split top row.
thread_id,nb_thread_id,img_wid, thread_id,nb_thread_id,img_wid,
len_to_load+previous_row, len_to_load+previous_row,
img_stride_col, img_stride_row, kern_wid-1, img_stride_col, img_stride_row, kern_wid-1,
c_contiguous); c_contiguous);
//TODO: fill the last row padding only when needed. //TODO: fill the last row padding only when needed.
//We always fill the last rows padding event when not needed. //We always fill the last rows padding event when not needed.
int row_to_fill = 2*kern_len-2+nb_rows- empty_row - previous_row - len_to_load; int row_to_fill = 2*kern_len-2+nb_rows- empty_row - previous_row - len_to_load;
row_to_fill = min(row_to_fill,kern_len-1); row_to_fill = min(row_to_fill,kern_len-1);
fill(d_img+(kern_len-1+len_to_load)*img_wid_valid, fill(d_img+(kern_len-1+len_to_load)*img_wid_valid,
img_wid_valid*row_to_fill, 0, thread_id, nb_thread_id); img_wid_valid*row_to_fill, 0, thread_id, nb_thread_id);
load_to_shared(d_kern, kern+kern_stride_stack*stack, load_to_shared(d_kern, kern+kern_stride_stack*stack,
thread_id, nb_thread_id, kern_wid,kern_len, thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous); kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads(); __syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row-out_row_iter*nb_rows)*img_wid_valid+out_col]; const float* idx_in=&d_img[(row+out_row-out_row_iter*nb_rows)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
} }
} }
if(out_row<out_len) if(out_row<out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*kern_id+//the output image out_wid*out_len*kern_id+//the output image
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
} }
} }
} }
...@@ -366,8 +366,8 @@ template <> __device__ float everything_dot<1>(const float * x, const int sx, co ...@@ -366,8 +366,8 @@ template <> __device__ float everything_dot<1>(const float * x, const int sx, co
template<int NSTACK> template<int NSTACK>
__global__ void __global__ void
conv_full_load_everything( float* img, float* kern, float* out, conv_full_load_everything( float* img, float* kern, float* out,
int img_len, int img_wid, int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack, int kern_len, int kern_wid, int nkern, int nstack,
int img_stride_col, int img_stride_row, int img_stride_col, int img_stride_row,
int img_stride_stack, int img_stride_batch, int img_stride_stack, int img_stride_batch,
int kern_stride_col, int kern_stride_row, int kern_stride_col, int kern_stride_row,
...@@ -442,3 +442,13 @@ conv_full_load_everything( float* img, float* kern, float* out, ...@@ -442,3 +442,13 @@ conv_full_load_everything( float* img, float* kern, float* out,
__syncthreads(); //don't start loading another kernel until we're done here __syncthreads(); //don't start loading another kernel until we're done here
} }
} }
/*
Local Variables:
mode:c++
c-basic-offset:4
c-file-style:"stroustrup"
indent-tabs-mode:nil
fill-column:79
End:
*/
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
...@@ -20,7 +20,7 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) { ...@@ -20,7 +20,7 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) {
const %(type)s* idx_kern=&hvals[j*dim_ker[1]]; const %(type)s* idx_kern=&hvals[j*dim_ker[1]];
int new_n = (pos_n+dim_ker[1]-1); int new_n = (pos_n+dim_ker[1]-1);
for (int k=0,last=new_n; k < dim_ker[1]; k++,last--) { for (int k=0,last=new_n; k < dim_ker[1]; k++,last--) {
sum+=idx_kern[k]*idx_in[last]; sum+=idx_kern[k]*idx_in[last];
} }
}//for j }//for j
out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum; out[iter_m*dim_zz[1]+iter_n] %(affectation)s sum;
...@@ -99,9 +99,9 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_ ...@@ -99,9 +99,9 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_
* We load from global memory to shared memory. The outer if is optimized away at compilation. * We load from global memory to shared memory. The outer if is optimized away at compilation.
*/ */
__device__ void load_to_shared(float * dst, const float * src, const int thread_id, __device__ void load_to_shared(float * dst, const float * src, const int thread_id,
int nb_thread, const int nb_col, const int nb_row, int nb_thread, const int nb_col, const int nb_row,
const int stride_col, const int stride_row, const int stride_col, const int stride_row,
const bool flipped=false, const bool c_contiguous=true){ const bool flipped=false, const bool c_contiguous=true){
if (c_contiguous) if (c_contiguous)
{ {
load_to_shared(dst, src, thread_id, nb_thread, nb_col*nb_row, flipped); load_to_shared(dst, src, thread_id, nb_thread, nb_col*nb_row, flipped);
...@@ -143,10 +143,10 @@ __device__ void fill(float * dst, int N, float value, int thread_id, int nb_thre ...@@ -143,10 +143,10 @@ __device__ void fill(float * dst, int N, float value, int thread_id, int nb_thre
* We put the image at the center of another one. Usefull to padd an image with 0. * We put the image at the center of another one. Usefull to padd an image with 0.
*/ */
__device__ void load_padded_col_to_shared(float * dst, const float * src, __device__ void load_padded_col_to_shared(float * dst, const float * src,
const int thread_id, const int nb_thread, const int thread_id, const int nb_thread,
const int nb_col, const int nb_row, const int nb_col, const int nb_row,
const int stride_col, const int stride_row, const int stride_col, const int stride_row,
const int wid_pad, const bool c_contiguous=true){ const int wid_pad, const bool c_contiguous=true){
if(c_contiguous){//flipped==false if(c_contiguous){//flipped==false
for(int i=thread_id;i<nb_col*nb_row;i+=nb_thread){ for(int i=thread_id;i<nb_col*nb_row;i+=nb_thread){
int col=i%nb_col; int col=i%nb_col;
...@@ -165,24 +165,24 @@ __device__ void load_padded_col_to_shared(float * dst, const float * src, ...@@ -165,24 +165,24 @@ __device__ void load_padded_col_to_shared(float * dst, const float * src,
} }
template<int i> __device__ float convolutionRowNoFlip(const float *data, template<int i> __device__ float convolutionRowNoFlip(const float *data,
const float *kern){ const float *kern){
return convolutionRowNoFlip<i/2>(data, kern)+ convolutionRowNoFlip<(i+1)/2>(data+i/2, kern+i/2) ; return convolutionRowNoFlip<i/2>(data, kern)+ convolutionRowNoFlip<(i+1)/2>(data+i/2, kern+i/2) ;
//return data[i-1] * kern[i-1] + convolutionRowNoFlip<i - 1>(data,kern); //return data[i-1] * kern[i-1] + convolutionRowNoFlip<i - 1>(data,kern);
} }
template<> __device__ float convolutionRowNoFlip<1>(const float *data, template<> __device__ float convolutionRowNoFlip<1>(const float *data,
const float *kern){ const float *kern){
return data[0]*kern[0]; return data[0]*kern[0];
} }
template<> __device__ float convolutionRowNoFlip<0>(const float *data, template<> __device__ float convolutionRowNoFlip<0>(const float *data,
const float *kern){ const float *kern){
return 0; return 0;
} }
template<int KERN_WIDTH> template<int KERN_WIDTH>
__device__ void convolutionRowNoFlip(float& sum, __device__ void convolutionRowNoFlip(float& sum,
const float *data, const float *data,
const float *kern, const int kern_wid){ const float *kern, const int kern_wid){
if(KERN_WIDTH>0) if(KERN_WIDTH>0)
sum+=convolutionRowNoFlip<KERN_WIDTH>(data,kern); sum+=convolutionRowNoFlip<KERN_WIDTH>(data,kern);
else else
...@@ -219,8 +219,8 @@ __device__ void store_or_accumulate(float& dst,const float value ){ ...@@ -219,8 +219,8 @@ __device__ void store_or_accumulate(float& dst,const float value ){
template<bool flipped_kern, int KERN_WIDTH, bool split> template<bool flipped_kern, int KERN_WIDTH, bool split>
__global__ void __global__ void
conv_patch( float* img, float* kern, float* out, conv_patch( float* img, float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid, int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack) int nkern, int nstack)
{ {
int __shared__ out_len, out_wid, nb_thread_id; int __shared__ out_len, out_wid, nb_thread_id;
out_len = img_len - kern_len + 1; out_len = img_len - kern_len + 1;
...@@ -255,24 +255,24 @@ conv_patch( float* img, float* kern, float* out, ...@@ -255,24 +255,24 @@ conv_patch( float* img, float* kern, float* out,
int out_row = ty;//output row int out_row = ty;//output row
float sum = 0.0f; float sum = 0.0f;
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col]; const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
} }
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
blockIdx.y*out_wid*out_len+//the output image blockIdx.y*out_wid*out_len+//the output image
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
}else{ }else{
for(int out_row=ty;out_row<out_len;out_row+=blockDim.y){ for(int out_row=ty;out_row<out_len;out_row+=blockDim.y){
float sum = 0.0f; float sum = 0.0f;
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col]; const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
} }
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image kern_id*out_wid*out_len+//the output image
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
} }
} }
} }
...@@ -302,12 +302,12 @@ conv_patch( float* img, float* kern, float* out, ...@@ -302,12 +302,12 @@ conv_patch( float* img, float* kern, float* out,
template<bool flipped_kern, bool accumulate, int KERN_WIDTH, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample> template<bool flipped_kern, bool accumulate, int KERN_WIDTH, bool img_c_contiguous_2d, bool kern_c_contiguous_2d, bool split, bool preload_full_kern, bool subsample>
__global__ void __global__ void
conv_patch_stack( float* img, float* kern, float* out, conv_patch_stack( float* img, float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid, int img_len, int img_wid, int kern_len, int kern_wid,
int out_len, int out_wid, int out_len, int out_wid,
int nkern, int nstack, int img_stride_col,int img_stride_row, int nkern, int nstack, int img_stride_col,int img_stride_row,
int img_stride_stack, int img_stride_batch, int img_stride_stack, int img_stride_batch,
int kern_stride_col, int kern_stride_row, int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_nkern, int dx, int dy) int kern_stride_stack, int kern_stride_nkern, int dx, int dy)
{ {
int __shared__ nb_thread_id; int __shared__ nb_thread_id;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x; nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
...@@ -333,43 +333,43 @@ conv_patch_stack( float* img, float* kern, float* out, ...@@ -333,43 +333,43 @@ conv_patch_stack( float* img, float* kern, float* out,
float sum = 0.0f; float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++,kern+=kern_stride_stack, for (int stack = 0;stack<nstack;stack++,kern+=kern_stride_stack,
img+=img_stride_stack){ img+=img_stride_stack){
load_to_shared(d_img,img,thread_id,nb_thread_id,img_wid,img_len, load_to_shared(d_img,img,thread_id,nb_thread_id,img_wid,img_len,
img_stride_col, img_stride_row, false, img_c_contiguous_2d); img_stride_col, img_stride_row, false, img_c_contiguous_2d);
if(preload_full_kern) if(preload_full_kern)
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid,kern_len, load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d); kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
__syncthreads(); __syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
if(!preload_full_kern){ if(!preload_full_kern){
__syncthreads(); __syncthreads();
int idx2; int idx2;
if(flipped_kern) idx2=(kern_len-row-1)*kern_stride_row; if(flipped_kern) idx2=(kern_len-row-1)*kern_stride_row;
else idx2=(row)*kern_stride_row; else idx2=(row)*kern_stride_row;
load_to_shared(d_kern, kern+idx2, thread_id, nb_thread_id, kern_wid,1, load_to_shared(d_kern, kern+idx2, thread_id, nb_thread_id, kern_wid,1,
kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d); kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
__syncthreads(); __syncthreads();
} }
const float* idx_kern; const float* idx_kern;
if(preload_full_kern) idx_kern=&d_kern[row*kern_wid]; if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
else idx_kern=d_kern; else idx_kern=d_kern;
const float* idx_in; const float* idx_in;
if(subsample) if(subsample)
idx_in=&d_img[(row+out_row*dx)*img_wid+out_col*dy]; idx_in=&d_img[(row+out_row*dx)*img_wid+out_col*dy];
else else
idx_in=&d_img[(row+out_row)*img_wid+out_col]; idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
} }
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
} }
store_or_accumulate<accumulate>( store_or_accumulate<accumulate>(
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*kern_id+//the output image out_wid*out_len*kern_id+//the output image
out_row*out_wid+out_col],sum); out_row*out_wid+out_col],sum);
}else{ }else{
float __shared__ *kern_, *img_; float __shared__ *kern_, *img_;
...@@ -383,54 +383,54 @@ conv_patch_stack( float* img, float* kern, float* out, ...@@ -383,54 +383,54 @@ conv_patch_stack( float* img, float* kern, float* out,
//TODO: inverse the out_row and stack loop to don't load the date as frequently! //TODO: inverse the out_row and stack loop to don't load the date as frequently!
//TODO: do this happen elsewhere? //TODO: do this happen elsewhere?
for(;out_row<out_len_max;out_row+=blockDim.y){ for(;out_row<out_len_max;out_row+=blockDim.y){
float sum = 0.0f; float sum = 0.0f;
for (int stack = 0;stack<nstack;stack++){ for (int stack = 0;stack<nstack;stack++){
//TODO: load only the part of the image needed or put the partial result in shared memory //TODO: load only the part of the image needed or put the partial result in shared memory
int idx1=img_stride_stack*stack; int idx1=img_stride_stack*stack;
load_to_shared(d_img,img_+idx1,thread_id,nb_thread_id,img_wid,img_len, load_to_shared(d_img,img_+idx1,thread_id,nb_thread_id,img_wid,img_len,
img_stride_col, img_stride_row, false, img_c_contiguous_2d); img_stride_col, img_stride_row, false, img_c_contiguous_2d);
if(preload_full_kern){ if(preload_full_kern){
int idx2=kern_stride_stack*stack; int idx2=kern_stride_stack*stack;
load_to_shared(d_kern, kern_+idx2, thread_id, nb_thread_id, kern_wid,kern_len, load_to_shared(d_kern, kern_+idx2, thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d); kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
} }
__syncthreads(); __syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
if(!preload_full_kern){ if(!preload_full_kern){
__syncthreads(); __syncthreads();
int idx2=kern_stride_stack*stack; int idx2=kern_stride_stack*stack;
if(flipped_kern) if(flipped_kern)
idx2+=(kern_len-row-1)*kern_stride_row; idx2+=(kern_len-row-1)*kern_stride_row;
else else
idx2+=(row)*kern_stride_row; idx2+=(row)*kern_stride_row;
load_to_shared(d_kern, kern_+idx2, thread_id, nb_thread_id, kern_wid,1, load_to_shared(d_kern, kern_+idx2, thread_id, nb_thread_id, kern_wid,1,
kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d); kern_stride_col, kern_stride_row, flipped_kern, kern_c_contiguous_2d);
__syncthreads(); __syncthreads();
} }
const float* idx_kern; const float* idx_kern;
if(preload_full_kern) idx_kern=&d_kern[row*kern_wid]; if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
else idx_kern=d_kern; else idx_kern=d_kern;
const float* idx_in; const float* idx_in;
if(subsample) if(subsample)
idx_in=&d_img[(row+out_row*dx)*img_wid+out_col*dy]; idx_in=&d_img[(row+out_row*dx)*img_wid+out_col*dy];
else else
idx_in=&d_img[(row+out_row)*img_wid+out_col]; idx_in=&d_img[(row+out_row)*img_wid+out_col];
//if needed as on Fermi as reading out of bound index from shared memory generate an error. //if needed as on Fermi as reading out of bound index from shared memory generate an error.
//Not needed on generation before as they worked anyway. Removing the if generate the good code //Not needed on generation before as they worked anyway. Removing the if generate the good code
//as we store the result of only the good thread. //as we store the result of only the good thread.
//This was with nvcc 3.0 on an GTX470 card. //This was with nvcc 3.0 on an GTX470 card.
if(out_row<out_len) if(out_row<out_len)
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
} }
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
} }
if(out_row<out_len) if(out_row<out_len)
store_or_accumulate<accumulate>( store_or_accumulate<accumulate>(
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*kern_id+//the output image out_wid*out_len*kern_id+//the output image
out_row*out_wid+out_col],sum); out_row*out_wid+out_col],sum);
} }
} }
...@@ -454,11 +454,11 @@ conv_patch_stack( float* img, float* kern, float* out, ...@@ -454,11 +454,11 @@ conv_patch_stack( float* img, float* kern, float* out,
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool preload_full_kern> template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool preload_full_kern>
__global__ void __global__ void
conv_patch_stack_reduce( float* img, float* kern, float* out, conv_patch_stack_reduce( float* img, float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid, int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack, int img_stride_col,int img_stride_row, int nkern, int nstack, int img_stride_col,int img_stride_row,
int img_stride_stack, int img_stride_batch, int img_stride_stack, int img_stride_batch,
int kern_stride_col, int kern_stride_row, int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_nkern) int kern_stride_stack, int kern_stride_nkern)
{ {
//int __shared__ out_len, out_wid, nb_thread_id; //int __shared__ out_len, out_wid, nb_thread_id;
//out_len = img_len - kern_len + 1; //out_len = img_len - kern_len + 1;
...@@ -496,12 +496,12 @@ conv_patch_stack_reduce( float* img, float* kern, float* out, ...@@ -496,12 +496,12 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
img+=img_stride_batch*batch_id;//the good batch img+=img_stride_batch*batch_id;//the good batch
for (int stack = 0;stack<nstack;stack++,kern+=kern_stride_stack, for (int stack = 0;stack<nstack;stack++,kern+=kern_stride_stack,
img+=img_stride_stack){ img+=img_stride_stack){
__syncthreads(); __syncthreads();
load_to_shared(d_img, img, thread_id, nb_thread_id, img_wid, img_len, load_to_shared(d_img, img, thread_id, nb_thread_id, img_wid, img_len,
img_stride_col, img_stride_row, false, c_contiguous); img_stride_col, img_stride_row, false, c_contiguous);
if(split && ! preload_full_kern){ if(split && ! preload_full_kern){
for(int first_row=0;first_row<kern_len;first_row+=blockDim.z){ for(int first_row=0;first_row<kern_len;first_row+=blockDim.z){
//N.B. - Jan 30, 2011 with CUDA 3.2 I found that without the explicit cast to //N.B. - Jan 30, 2011 with CUDA 3.2 I found that without the explicit cast to
// (int)blockDim.z, idx3 would sometimes be negative. I'm rusty on my signed vs. unsigned // (int)blockDim.z, idx3 would sometimes be negative. I'm rusty on my signed vs. unsigned
// details, but that seemed really weird. tricky bug to find too. // details, but that seemed really weird. tricky bug to find too.
...@@ -510,36 +510,36 @@ conv_patch_stack_reduce( float* img, float* kern, float* out, ...@@ -510,36 +510,36 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
: first_row; : first_row;
int len3 = min(blockDim.z, kern_len - first_row); int len3 = min(blockDim.z, kern_len - first_row);
__syncthreads(); __syncthreads();
load_to_shared(d_kern, kern+idx3*kern_stride_row, thread_id, nb_thread_id, kern_wid, len3, load_to_shared(d_kern, kern+idx3*kern_stride_row, thread_id, nb_thread_id, kern_wid, len3,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous); kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads(); __syncthreads();
const float* idx_kern=&d_kern[tz*kern_wid]; const float* idx_kern=&d_kern[tz*kern_wid];
const float* idx_in=&d_img[(first_row+tz+out_row)*img_wid+out_col]; const float* idx_in=&d_img[(first_row+tz+out_row)*img_wid+out_col];
float sum2 = 0; float sum2 = 0;
if(tz<len3) if(tz<len3)
convolutionRowNoFlip<KERN_WIDTH>(sum2,idx_in,idx_kern,kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum2,idx_in,idx_kern,kern_wid);
sum+=sum2; sum+=sum2;
} }
}else if(split){ }else if(split){
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len, load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous); kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads(); __syncthreads();
for(int row=tz;row<kern_len;row+=blockDim.z){ for(int row=tz;row<kern_len;row+=blockDim.z){
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col]; const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
} }
}else{ }else{
int row = tz;//The row of the kernel. int row = tz;//The row of the kernel.
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col]; const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len, load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous); kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads(); __syncthreads();
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
} }
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
} }
//reduce no sync because previous loop ends with sync //reduce no sync because previous loop ends with sync
...@@ -548,11 +548,11 @@ conv_patch_stack_reduce( float* img, float* kern, float* out, ...@@ -548,11 +548,11 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
if(thread_id<out_len*out_wid){ // blockDim.x==out_wid, blockDim.y==out_len if(thread_id<out_len*out_wid){ // blockDim.x==out_wid, blockDim.y==out_len
//sum=0; //sum=0;
for(int i=1;i<blockDim.z;i++){ for(int i=1;i<blockDim.z;i++){
sum+=d_reduce[thread_id+i*out_wid*out_len]; sum+=d_reduce[thread_id+i*out_wid*out_len];
} }
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*blockIdx.y+//the output image out_wid*out_len*blockIdx.y+//the output image
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
} }
} }
...@@ -570,12 +570,12 @@ conv_patch_stack_reduce( float* img, float* kern, float* out, ...@@ -570,12 +570,12 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
template<int KERN_WIDTH, bool c_contiguous> template<int KERN_WIDTH, bool c_contiguous>
__global__ void __global__ void
conv_rows( float* img, float* kern, float* out, conv_rows( float* img, float* kern, float* out,
int img_len, int img_wid, int kern_len, int kern_wid, int img_len, int img_wid, int kern_len, int kern_wid,
int nkern, int nstack, int nkern, int nstack,
int img_stride_col, int img_stride_row, int img_stride_col, int img_stride_row,
int img_stride_stack, int img_stride_batch, int img_stride_stack, int img_stride_batch,
int kern_stride_col, int kern_stride_row, int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_nkern) int kern_stride_stack, int kern_stride_nkern)
{ {
int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id; int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id;
float __shared__ *d_img, *d_kern; float __shared__ *d_img, *d_kern;
...@@ -599,9 +599,9 @@ conv_rows( float* img, float* kern, float* out, ...@@ -599,9 +599,9 @@ conv_rows( float* img, float* kern, float* out,
kern+=kern_stride_nkern*kern_id;//the good nkern kern+=kern_stride_nkern*kern_id;//the good nkern
load_to_shared(d_img,img,thread_id,nb_thread_id,img_wid,kern_len, load_to_shared(d_img,img,thread_id,nb_thread_id,img_wid,kern_len,
img_stride_col, img_stride_row, false, c_contiguous); img_stride_col, img_stride_row, false, c_contiguous);
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid,kern_len, load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, true, c_contiguous); kern_stride_col, kern_stride_row, true, c_contiguous);
__syncthreads(); __syncthreads();
float sum = 0.0f; float sum = 0.0f;
...@@ -613,8 +613,8 @@ conv_rows( float* img, float* kern, float* out, ...@@ -613,8 +613,8 @@ conv_rows( float* img, float* kern, float* out,
} }
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image kern_id*out_wid*out_len+//the output image
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
} }
/** /**
...@@ -631,12 +631,12 @@ conv_rows( float* img, float* kern, float* out, ...@@ -631,12 +631,12 @@ conv_rows( float* img, float* kern, float* out,
template<int KERN_WIDTH, bool c_contiguous> template<int KERN_WIDTH, bool c_contiguous>
__global__ void __global__ void
conv_rows_stack( float* img, float* kern, float* out, conv_rows_stack( float* img, float* kern, float* out,
const int img_len, const int img_wid, const int kern_len, const int kern_wid, const int img_len, const int img_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack, const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row, const int img_stride_col, const int img_stride_row,
const int img_stride_stack, const int img_stride_batch, const int img_stride_stack, const int img_stride_batch,
const int kern_stride_col, const int kern_stride_row, const int kern_stride_col, const int kern_stride_row,
const int kern_stride_stack, const int kern_stride_nkern) const int kern_stride_stack, const int kern_stride_nkern)
{ {
int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows; int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows;
float __shared__ *d_img, *d_kern; float __shared__ *d_img, *d_kern;
...@@ -698,21 +698,21 @@ conv_rows_stack( float* img, float* kern, float* out, ...@@ -698,21 +698,21 @@ conv_rows_stack( float* img, float* kern, float* out,
offset = kern_stride_nkern * kern_id + kern_stride_stack * stack; offset = kern_stride_nkern * kern_id + kern_stride_stack * stack;
load_to_shared(d_kern, kern+offset, thread_id, nb_thread_id, kern_wid,kern_len, load_to_shared(d_kern, kern+offset, thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, true, c_contiguous); kern_stride_col, kern_stride_row, true, c_contiguous);
__syncthreads(); __syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+shared_row)*img_wid+out_col]; const float* idx_in=&d_img[(row+shared_row)*img_wid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
} }
__syncthreads();//to be sure all thread have finished before we modif the shared memory. __syncthreads();//to be sure all thread have finished before we modif the shared memory.
} }
if (out_row < out_len) if (out_row < out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image kern_id*out_wid*out_len+//the output image
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
} }
/** /**
...@@ -729,12 +729,12 @@ conv_rows_stack( float* img, float* kern, float* out, ...@@ -729,12 +729,12 @@ conv_rows_stack( float* img, float* kern, float* out,
template<int KERN_WIDTH, bool c_contiguous, bool preload_full_kern> template<int KERN_WIDTH, bool c_contiguous, bool preload_full_kern>
__global__ void __global__ void
conv_rows_stack2( float* img, float* kern, float* out, conv_rows_stack2( float* img, float* kern, float* out,
const int img_len, const int img_wid, const int kern_len, const int kern_wid, const int img_len, const int img_wid, const int kern_len, const int kern_wid,
const int nkern, const int nstack, const int nkern, const int nstack,
const int img_stride_col, const int img_stride_row, const int img_stride_col, const int img_stride_row,
const int img_stride_stack, const int img_stride_batch, const int img_stride_stack, const int img_stride_batch,
const int kern_stride_col, const int kern_stride_row, const int kern_stride_col, const int kern_stride_row,
const int kern_stride_stack, const int kern_stride_nkern) const int kern_stride_stack, const int kern_stride_nkern)
{ {
int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows; int __shared__ out_len, out_wid, nb_thread_id, batch_id, kern_id, nb_rows;
float __shared__ *d_img, *d_kern; float __shared__ *d_img, *d_kern;
...@@ -763,54 +763,54 @@ conv_rows_stack2( float* img, float* kern, float* out, ...@@ -763,54 +763,54 @@ conv_rows_stack2( float* img, float* kern, float* out,
__syncthreads(); __syncthreads();
load_to_shared(d_img,img+_idx2,thread_id,nb_thread_id,img_wid,nb_rows-1, load_to_shared(d_img,img+_idx2,thread_id,nb_thread_id,img_wid,nb_rows-1,
img_stride_col, img_stride_row, false, c_contiguous); img_stride_col, img_stride_row, false, c_contiguous);
if(preload_full_kern) if(preload_full_kern)
load_to_shared(d_kern, kern+kern_stride_nkern*kern_id+kern_stride_stack*stack, load_to_shared(d_kern, kern+kern_stride_nkern*kern_id+kern_stride_stack*stack,
thread_id, nb_thread_id, kern_wid,kern_len, thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, true, c_contiguous); kern_stride_col, kern_stride_row, true, c_contiguous);
__syncthreads(); __syncthreads();
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
__syncthreads(); __syncthreads();
if((blockIdx.x*nb_rows+row+nb_rows-1)<img_len){ if((blockIdx.x*nb_rows+row+nb_rows-1)<img_len){
int _idx1=img_stride_batch*batch_id+img_stride_stack*stack;//selection the good image from the batch and stack int _idx1=img_stride_batch*batch_id+img_stride_stack*stack;//selection the good image from the batch and stack
_idx1+=(blockIdx.x*nb_rows)*img_stride_row;//select the good top row for the block of threads _idx1+=(blockIdx.x*nb_rows)*img_stride_row;//select the good top row for the block of threads
_idx1+=(row+nb_rows-1)*img_stride_row;//the current last row _idx1+=(row+nb_rows-1)*img_stride_row;//the current last row
load_to_shared(d_img+((row+nb_rows-1)%nb_rows)*img_wid, load_to_shared(d_img+((row+nb_rows-1)%nb_rows)*img_wid,
img+_idx1, thread_id, nb_thread_id, img_wid, 1, img+_idx1, thread_id, nb_thread_id, img_wid, 1,
img_stride_col, img_stride_row, false, c_contiguous);//we use d_img as a circular buffer. img_stride_col, img_stride_row, false, c_contiguous);//we use d_img as a circular buffer.
} }
if(!preload_full_kern){ if(!preload_full_kern){
int _idx3=kern_stride_nkern*kern_id+kern_stride_stack*stack;//selection the good kern from the batch and stack int _idx3=kern_stride_nkern*kern_id+kern_stride_stack*stack;//selection the good kern from the batch and stack
_idx3+=(kern_len-row-1)*kern_stride_row;//the current last row flipped _idx3+=(kern_len-row-1)*kern_stride_row;//the current last row flipped
load_to_shared(d_kern, kern+_idx3, load_to_shared(d_kern, kern+_idx3,
thread_id, nb_thread_id, kern_wid,1, thread_id, nb_thread_id, kern_wid,1,
kern_stride_col, kern_stride_row, true, c_contiguous); kern_stride_col, kern_stride_row, true, c_contiguous);
} }
__syncthreads(); __syncthreads();
//if needed as on Fermi as reading out of bound index from shared memory generate an error. //if needed as on Fermi as reading out of bound index from shared memory generate an error.
//Not needed on generation before as they worked anyway. Removing the if generate the good code //Not needed on generation before as they worked anyway. Removing the if generate the good code
//as we store the result of only the good thread. //as we store the result of only the good thread.
//This was with nvcc 3.0 on an GTX470 card. //This was with nvcc 3.0 on an GTX470 card.
if(out_row<out_len){ if(out_row<out_len){
const float* idx_kern; const float* idx_kern;
if(preload_full_kern) idx_kern=&d_kern[row*kern_wid]; if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];
else idx_kern=d_kern; else idx_kern=d_kern;
const float* idx_in=&d_img[((shared_row+row)%nb_rows)*img_wid+out_col]; const float* idx_in=&d_img[((shared_row+row)%nb_rows)*img_wid+out_col];
float sum_ =0.0f; float sum_ =0.0f;
convolutionRowNoFlip<KERN_WIDTH>(sum_,idx_in,idx_kern,kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum_,idx_in,idx_kern,kern_wid);
sum+=sum_;//We pass by an intermediate variable to have more precission. sum+=sum_;//We pass by an intermediate variable to have more precission.
} }
} }
} }
__syncthreads(); __syncthreads();
if(out_row<out_len) if(out_row<out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
kern_id*out_wid*out_len+//the output image kern_id*out_wid*out_len+//the output image
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
} }
/** /**
...@@ -854,8 +854,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen, ...@@ -854,8 +854,8 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
int img_rr = iR_logical + kern_len - 1 - rr; int img_rr = iR_logical + kern_len - 1 - rr;
int reduceIdx = threadIdx.x * blockDim.y + threadIdx.y; int reduceIdx = threadIdx.x * blockDim.y + threadIdx.y;
float sum = 0.0f; float sum = 0.0f;
if(stack_loop){ if(stack_loop){
for (; ss < stacklen; ss+=blockDim.x){ for (; ss < stacklen; ss+=blockDim.x){
float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R; float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C; float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
for (int cc = 0; cc < kern_wid; ++cc) for (int cc = 0; cc < kern_wid; ++cc)
...@@ -864,17 +864,17 @@ conv_valid_row_reduce(int nB, int nK, int stacklen, ...@@ -864,17 +864,17 @@ conv_valid_row_reduce(int nB, int nK, int stacklen,
kk_0 += kern_str_C; kk_0 += kern_str_C;
ii_0 -= img_str_C; ii_0 -= img_str_C;
} }
}
}else{
float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
for (int cc = 0; cc < kern_wid; ++cc)
{
sum += kk_0[0] * ii_0[0];
kk_0 += kern_str_C;
ii_0 -= img_str_C;
} }
} }else{
float * kk_0 = kern + iK*kern_str_K + ss*kern_str_S + rr*kern_str_R;
float * ii_0 = img + iB*img_str_B + ss*img_str_S + img_rr*img_str_R + (iC_logical + kern_wid - 1)*img_str_C;
for (int cc = 0; cc < kern_wid; ++cc)
{
sum += kk_0[0] * ii_0[0];
kk_0 += kern_str_C;
ii_0 -= img_str_C;
}
}
if (blockDim.x * blockDim.y == 1) if (blockDim.x * blockDim.y == 1)
{ {
...@@ -1030,3 +1030,13 @@ conv_reference_full(int nB, int nK, int stacklen, ...@@ -1030,3 +1030,13 @@ conv_reference_full(int nB, int nK, int stacklen,
} }
#endif // #ifndef CONV_KERNEL_CU #endif // #ifndef CONV_KERNEL_CU
/*
Local Variables:
mode:c++
c-basic-offset:4
c-file-style:"stroustrup"
indent-tabs-mode:nil
fill-column:79
End:
*/
// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
...@@ -4132,7 +4132,6 @@ void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self) ...@@ -4132,7 +4132,6 @@ void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
mode:c++ mode:c++
c-basic-offset:4 c-basic-offset:4
c-file-style:"stroustrup" c-file-style:"stroustrup"
c-file-offsets:((innamespace . 0)(inline-open . 0))
indent-tabs-mode:nil indent-tabs-mode:nil
fill-column:79 fill-column:79
End: End:
......
...@@ -347,7 +347,6 @@ static void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self); ...@@ -347,7 +347,6 @@ static void fprint_CudaNdarray(FILE * fd, const CudaNdarray *self);
mode:c++ mode:c++
c-basic-offset:4 c-basic-offset:4
c-file-style:"stroustrup" c-file-style:"stroustrup"
c-file-offsets:((innamespace . 0)(inline-open . 0))
indent-tabs-mode:nil indent-tabs-mode:nil
fill-column:79 fill-column:79
End: End:
......
...@@ -24,12 +24,13 @@ if cuda_ndarray.cuda_available == False: ...@@ -24,12 +24,13 @@ if cuda_ndarray.cuda_available == False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
#needed as the gpu conv don't have a perform implementation. #needed as the gpu conv don't have a perform implementation.
if theano.config.mode=='FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
theano_mode = theano.compile.mode.get_mode('FAST_RUN').including('gpu') theano_mode = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
else: else:
theano_mode = theano.compile.mode.get_default_mode().including('gpu') theano_mode = theano.compile.mode.get_default_mode().including('gpu')
cuda_tensor4 = cuda_ndarray.CudaNdarrayType([False]*4) cuda_tensor4 = cuda_ndarray.CudaNdarrayType([False] * 4)
def py_conv_valid_numpy(img, kern): def py_conv_valid_numpy(img, kern):
assert img.shape[1] == kern.shape[1] assert img.shape[1] == kern.shape[1]
...@@ -42,19 +43,27 @@ def py_conv_valid_numpy(img, kern): ...@@ -42,19 +43,27 @@ def py_conv_valid_numpy(img, kern):
for rr in xrange(out.shape[2]): for rr in xrange(out.shape[2]):
for cc in xrange(out.shape[3]): for cc in xrange(out.shape[3]):
#rr, cc is the upper-left corner of img patches #rr, cc is the upper-left corner of img patches
imgpatch = img[b,:,rr:rr+kern.shape[2], cc:cc+kern.shape[3]] imgpatch = img[b, :, rr:rr + kern.shape[2],
cc:cc + kern.shape[3]]
#print img.shape, kern.shape, imgpatch.shape, rr+kern.shape[2]-1, rr-1, -1 #print img.shape, kern.shape, imgpatch.shape, rr+kern.shape[2]-1, rr-1, -1
innerprod = (imgpatch[:,::-1,::-1] * kern[k,:,:,:]).sum() innerprod = (imgpatch[:, ::-1, ::-1] *
kern[k, :, :, :]).sum()
out[b, k, rr, cc] = innerprod out[b, k, rr, cc] = innerprod
return out return out
def py_conv_full_numpy(img, kern): def py_conv_full_numpy(img, kern):
# manually pad the img with zeros all around, and then run it through py_conv_valid # manually pad the img with zeros all around, and then run it
pad_rows = 2*(kern.shape[2]-1) + img.shape[2] # through py_conv_valid
pad_cols = 2*(kern.shape[3]-1) + img.shape[3] pad_rows = 2 * (kern.shape[2] - 1) + img.shape[2]
padded_img = numpy.zeros((img.shape[0], img.shape[1], pad_rows, pad_cols), dtype=img.dtype) pad_cols = 2 * (kern.shape[3] - 1) + img.shape[3]
padded_img[:,:,kern.shape[2]-1:kern.shape[2]-1+img.shape[2],kern.shape[3]-1:kern.shape[3]-1+img.shape[3]] = img padded_img = numpy.zeros((img.shape[0], img.shape[1], pad_rows, pad_cols),
dtype=img.dtype)
padded_img[:, :, kern.shape[2] - 1: kern.shape[2] - 1 + img.shape[2],
kern.shape[3] - 1: kern.shape[3] - 1 + img.shape[3]] = img
return py_conv_valid_numpy(padded_img, kern) return py_conv_valid_numpy(padded_img, kern)
def py_conv(img, kern, mode, subsample): def py_conv(img, kern, mode, subsample):
""" """
use a scipy or numpy implementation depending is scipy is available. use a scipy or numpy implementation depending is scipy is available.
...@@ -62,13 +71,16 @@ def py_conv(img, kern, mode, subsample): ...@@ -62,13 +71,16 @@ def py_conv(img, kern, mode, subsample):
""" """
if imported_scipy_convolve2d: if imported_scipy_convolve2d:
return py_conv_scipy(img, kern, mode, subsample) return py_conv_scipy(img, kern, mode, subsample)
elif mode=='valid': elif mode == 'valid':
return py_conv_valid_numpy(img,kern)[:,:,::subsample[0],::subsample[1]] return py_conv_valid_numpy(img, kern)[:, :, ::subsample[0],
elif mode=='full': ::subsample[1]]
return py_conv_full_numpy(img,kern)[:,:,::subsample[0],::subsample[1]] elif mode == 'full':
return py_conv_full_numpy(img, kern)[:, :, ::subsample[0],
::subsample[1]]
else: else:
raise Exception("Can't execute this kernel.") raise Exception("Can't execute this kernel.")
def py_conv_scipy(img, kern, mode, subsample): def py_conv_scipy(img, kern, mode, subsample):
assert img.shape[1] == kern.shape[1] assert img.shape[1] == kern.shape[1]
if mode == 'valid': if mode == 'valid':
...@@ -83,17 +95,20 @@ def py_conv_scipy(img, kern, mode, subsample): ...@@ -83,17 +95,20 @@ def py_conv_scipy(img, kern, mode, subsample):
for b in xrange(out.shape[0]): for b in xrange(out.shape[0]):
for k in xrange(out.shape[1]): for k in xrange(out.shape[1]):
for s in xrange(img.shape[1]): for s in xrange(img.shape[1]):
out[b,k,:,:] += convolve2d(img[b,s,:,:] out[b, k, :, :] += convolve2d(img[b, s, :, :],
, kern[k,s,:,:] kern[k, s, :, :],
, mode) mode)
return out[:,:,::subsample[0], ::subsample[1]] return out[:, :, ::subsample[0], ::subsample[1]]
def _params_allgood_header(): def _params_allgood_header():
print "ishape kshape #Mflops CPU Mflops GPU Mflops Speedup" print "ishape kshape #Mflops CPU Mflops GPU Mflops Speedup"
def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1),
kern_stride=(1,1), version=-1, verbose=0, random=True, print_=None, def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
id=None, rtol=1e-5, atol = 1e-8, nb_iter=0, ones=False, compile_kshp=None): kern_stride=(1, 1), version=-1, verbose=0, random=True,
print_=None, id=None, rtol=1e-5, atol=1e-8,
nb_iter=0, ones=False, compile_kshp=None):
# #
# This function is the core of several of the big unit-test drivers, # This function is the core of several of the big unit-test drivers,
# but it can also be used very directly on its own to test a specific # but it can also be used very directly on its own to test a specific
...@@ -111,22 +126,27 @@ def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1), ...@@ -111,22 +126,27 @@ def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1),
npy_img = theano._asarray(numpy.ones(ishape), dtype='float32') npy_img = theano._asarray(numpy.ones(ishape), dtype='float32')
npy_kern = -theano._asarray(numpy.ones(kshape), dtype='float32') npy_kern = -theano._asarray(numpy.ones(kshape), dtype='float32')
elif random: elif random:
npy_img = theano._asarray(numpy.random.rand(*ishape)+1, dtype='float32') npy_img = theano._asarray(numpy.random.rand(*ishape) + 1,
npy_kern = theano._asarray(numpy.random.rand(*kshape)-2, dtype='float32') dtype='float32')
npy_kern = theano._asarray(numpy.random.rand(*kshape) - 2,
dtype='float32')
else: else:
npy_img = theano._asarray(numpy.arange(numpy.prod(ishape)).reshape(ishape), dtype='float32')+1 npy_img = theano._asarray(numpy.arange(
npy_kern = -(theano._asarray(numpy.arange(numpy.prod(kshape)).reshape(kshape), dtype='float32')+1) numpy.prod(ishape)).reshape(ishape), dtype='float32') + 1
npy_kern = -(theano._asarray(numpy.arange(
numpy.prod(kshape)).reshape(kshape), dtype='float32') + 1)
img = cuda_ndarray.CudaNdarray(npy_img) img = cuda_ndarray.CudaNdarray(npy_img)
kern = cuda_ndarray.CudaNdarray(npy_kern) kern = cuda_ndarray.CudaNdarray(npy_kern)
#we take the stride after the transfert as we make c_contiguous data on the GPU. #we take the stride after the transfert as we make c_contiguous
if img_stride!=(1,1): #data on the GPU.
img=img[:,:,::img_stride[0],::img_stride[1]] if img_stride != (1, 1):
npy_img = npy_img[:,:,::img_stride[0],::img_stride[1]] img = img[:, :, ::img_stride[0], ::img_stride[1]]
if kern_stride!=(1,1): npy_img = npy_img[:, :, ::img_stride[0], ::img_stride[1]]
kern=kern[:,:,::kern_stride[0],::kern_stride[1]] if kern_stride != (1, 1):
npy_kern = npy_kern[:,:,::kern_stride[0],::kern_stride[1]] kern = kern[:, :, ::kern_stride[0], ::kern_stride[1]]
npy_kern = npy_kern[:, :, ::kern_stride[0], ::kern_stride[1]]
t2 = None t2 = None
rval = True rval = True
...@@ -139,20 +159,23 @@ def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1), ...@@ -139,20 +159,23 @@ def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1),
op = theano.sandbox.cuda.blas.GpuConv(border_mode=mode, op = theano.sandbox.cuda.blas.GpuConv(border_mode=mode,
subsample=subsample, subsample=subsample,
version=version, version=version,
verbose=verbose, kshp=compile_kshp)(i,k) verbose=verbose,
f=theano.function([i,k],op, mode=theano_mode) kshp=compile_kshp)(i, k)
gpuval = f(img,kern) f = theano.function([i, k], op, mode=theano_mode)
gpuval = f(img, kern)
t2 = time.time() t2 = time.time()
for i in range(nb_iter): for i in range(nb_iter):
gpuval2 = f(img,kern) gpuval2 = f(img, kern)
assert numpy.allclose(numpy.asarray(gpuval),numpy.asarray(gpuval2)) assert numpy.allclose(numpy.asarray(gpuval),
assert (numpy.asarray(gpuval)==numpy.asarray(gpuval2)).all() numpy.asarray(gpuval2))
assert (numpy.asarray(gpuval) == numpy.asarray(gpuval2)).all()
gpuval = numpy.asarray(gpuval) gpuval = numpy.asarray(gpuval)
if gpuval.shape != cpuval.shape: if gpuval.shape != cpuval.shape:
print >> sys.stdout, "ERROR: shape mismatch", gpuval.shape, cpuval.shape print >> sys.stdout, "ERROR: shape mismatch",
print >> sys.stdout, gpuval.shape, cpuval.shape
rval = False rval = False
if rval: if rval:
rval = numpy.allclose(cpuval, gpuval, rtol = rtol) rval = numpy.allclose(cpuval, gpuval, rtol=rtol)
assert numpy.all(numpy.isfinite(gpuval)) assert numpy.all(numpy.isfinite(gpuval))
except NotImplementedError, e: except NotImplementedError, e:
print >> sys.stdout, '_params_allgood Failed allclose', e print >> sys.stdout, '_params_allgood Failed allclose', e
...@@ -164,49 +187,52 @@ def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1), ...@@ -164,49 +187,52 @@ def _params_allgood(ishape, kshape, mode, subsample=(1,1), img_stride=(1,1),
else: else:
approx_fp = ishape[0] * kshape[0] * kshape[1] * kshape[2] * kshape[3] * ishape[2] * ishape[3] * 2 approx_fp = ishape[0] * kshape[0] * kshape[1] * kshape[2] * kshape[3] * ishape[2] * ishape[3] * 2
approx_fp /= 1e6 approx_fp /= 1e6
cpu_mflops = approx_fp / (t1-t0) cpu_mflops = approx_fp / (t1 - t0)
gpu_mflops = approx_fp / (t2-t1) gpu_mflops = approx_fp / (t2 - t1)
if verbose>0: if verbose > 0:
print >> sys.stdout, '%15s'% str(ishape), '%15s'% str(kshape), print >> sys.stdout, '%15s' % str(ishape), '%15s' % str(kshape),
print >> sys.stdout, '%12.5f %7.2f %7.2f %7.1f' % (approx_fp, print >> sys.stdout, '%12.5f %7.2f %7.2f %7.1f' % (approx_fp,
cpu_mflops, gpu_mflops,(t1-t0)/(t2-t1)) cpu_mflops, gpu_mflops, (t1 - t0) / (t2 - t1))
if not rval: if not rval:
print >> sys.stdout, 'test_'+mode+' id='+str(id)+' FAILED for ishape, kshape, mode, subsample, img_stride, kern_stride, version', ishape, kshape, mode, subsample, img_stride, kern_stride, version print >> sys.stdout, 'test_'+mode+' id='+str(id)+' FAILED for ishape, kshape, mode, subsample, img_stride, kern_stride, version', ishape, kshape, mode, subsample, img_stride, kern_stride, version
diff=cpuval-gpuval diff = cpuval - gpuval
diffabs=numpy.absolute(diff) diffabs = numpy.absolute(diff)
pr_diff=diffabs/numpy.absolute(cpuval) pr_diff = diffabs / numpy.absolute(cpuval)
nb_close=(diffabs <= (atol + rtol * numpy.absolute(gpuval))).sum() nb_close = (diffabs <= (atol + rtol * numpy.absolute(gpuval))).sum()
print "max absolute diff:",diffabs.max(),"avg abs diff:",numpy.average(diffabs) print "max absolute diff:",diffabs.max(),"avg abs diff:",numpy.average(diffabs)
print "median abs diff:", numpy.median(diffabs), "nb close:",nb_close, "/", diff.size print "median abs diff:", numpy.median(diffabs), "nb close:",nb_close, "/", diff.size
print "max relatif diff:",pr_diff.max(), "avg rel diff:", numpy.average(pr_diff) print "max relatif diff:",pr_diff.max(), "avg rel diff:", numpy.average(pr_diff)
if not rval and print_!=False: if not rval and print_ != False:
if npy_img.shape[0]>5: if npy_img.shape[0] > 5:
print "img",npy_img[0] print "img", npy_img[0]
print "kern",npy_kern[0] print "kern", npy_kern[0]
print "gpu",gpuval[0][0] print "gpu", gpuval[0][0]
print "cpu",cpuval[0][0] print "cpu", cpuval[0][0]
print "diff",diff[0][0] print "diff", diff[0][0]
else: else:
print "img",npy_img print "img", npy_img
print "kern",npy_kern print "kern", npy_kern
print "gpu",gpuval print "gpu", gpuval
print "cpu",cpuval print "cpu", cpuval
print "diff",diff print "diff", diff
return rval return rval
def exec_conv(version, shapes, verbose, random, mode, def exec_conv(version, shapes, verbose, random, mode,
print_=None, rtol=1e-5, ones=False): print_=None, rtol=1e-5, ones=False):
if verbose>0: if verbose > 0:
_params_allgood_header() _params_allgood_header()
nb_failed = 0 nb_failed = 0
nb_tests = 0 nb_tests = 0
failed_version=set() failed_version = set()
failed_id=[] failed_id = []
for ver in version:# I put -1 in case we forget to add version in the test to. # I put -1 in case we forget to add version in the test to.
for id,(ishape, kshape, subshape, istride, kstride) in enumerate(shapes): for ver in version:
ret=False for id, (ishape, kshape, subshape,
istride, kstride) in enumerate(shapes):
ret = False
try: try:
ret = _params_allgood(ishape, ret = _params_allgood(ishape,
kshape, kshape,
...@@ -222,19 +248,21 @@ def exec_conv(version, shapes, verbose, random, mode, ...@@ -222,19 +248,21 @@ def exec_conv(version, shapes, verbose, random, mode,
rtol=rtol, rtol=rtol,
ones=ones) ones=ones)
except Exception, e: except Exception, e:
print ver, id,(ishape, kshape, subshape, istride, kstride) print ver, id, (ishape, kshape, subshape, istride, kstride)
print e print e
pass pass
if not ret: if not ret:
failed_version.add(ver) failed_version.add(ver)
failed_id.append(id) failed_id.append(id)
nb_failed+=1 nb_failed += 1
nb_tests+=1 nb_tests += 1
if nb_failed>0: if nb_failed > 0:
print "nb_failed",nb_failed,"on",nb_tests, "failed_version",failed_version, "failed_id",failed_id print "nb_failed", nb_failed, "on", nb_tests,
assert nb_failed==0, nb_failed print "failed_version", failed_version, "failed_id", failed_id
assert nb_failed == 0, nb_failed
else: else:
print 'Executed',nb_tests,'different shapes' print 'Executed', nb_tests, 'different shapes'
def get_basic_shapes(): def get_basic_shapes():
return [ return [
...@@ -249,8 +277,12 @@ def get_basic_shapes(): ...@@ -249,8 +277,12 @@ def get_basic_shapes():
, ((1, 1, 4, 4), (1, 1, 3, 2), (1,1), (1,1), (1,1)) , ((1, 1, 4, 4), (1, 1, 3, 2), (1,1), (1,1), (1,1))
, ((1, 1, 4, 4), (1, 1, 2, 3), (1,1), (1,1), (1,1))] , ((1, 1, 4, 4), (1, 1, 2, 3), (1,1), (1,1), (1,1))]
def get_shapes(imshp=(1,1), kshp=(1,1), subsample=(1,1), img_stride=(1,1), kern_stride=(1,1)):
""" all possible case if we one or more of stack size, batch size, nkern. We use the gived image shape, kernel shape and subsmaple shape.""" def get_shapes(imshp=(1, 1), kshp=(1, 1), subsample=(1, 1),
img_stride=(1, 1), kern_stride=(1, 1)):
""" all possible case if we one or more of stack size, batch size,
nkern. We use the gived image shape, kernel shape and subsmaple
shape."""
return [ ((1, 2)+imshp, (1, 2)+kshp,subsample, img_stride, kern_stride)#stack only return [ ((1, 2)+imshp, (1, 2)+kshp,subsample, img_stride, kern_stride)#stack only
, ((3, 1)+imshp, (1, 1)+kshp,subsample, img_stride, kern_stride)#batch only , ((3, 1)+imshp, (1, 1)+kshp,subsample, img_stride, kern_stride)#batch only
, ((1, 1)+imshp, (2, 1)+kshp,subsample, img_stride, kern_stride)#nkern only , ((1, 1)+imshp, (2, 1)+kshp,subsample, img_stride, kern_stride)#nkern only
...@@ -260,7 +292,10 @@ def get_shapes(imshp=(1,1), kshp=(1,1), subsample=(1,1), img_stride=(1,1), kern_ ...@@ -260,7 +292,10 @@ def get_shapes(imshp=(1,1), kshp=(1,1), subsample=(1,1), img_stride=(1,1), kern_
, ((2, 2)+imshp, (2, 2)+kshp,subsample, img_stride, kern_stride)#batch, nkern and stack , ((2, 2)+imshp, (2, 2)+kshp,subsample, img_stride, kern_stride)#batch, nkern and stack
, ((3, 2)+imshp, (4, 2)+kshp,subsample, img_stride, kern_stride)#batch, nkern and stack , ((3, 2)+imshp, (4, 2)+kshp,subsample, img_stride, kern_stride)#batch, nkern and stack
] ]
def get_shapes2(scales_img=(1,1), scales_kern=(1,1), subsample=(1,1), img_stride=(1,1), kern_stride=(1,1)):
def get_shapes2(scales_img=(1, 1), scales_kern=(1, 1), subsample=(1, 1),
img_stride=(1, 1), kern_stride=(1, 1)):
#basic test of stack, batch and nkern paramter #basic test of stack, batch and nkern paramter
shapes =get_shapes((1*scales_img[0],1*scales_img[1]), shapes =get_shapes((1*scales_img[0],1*scales_img[1]),
(1*scales_kern[0],1*scales_kern[1]),subsample, img_stride, kern_stride) (1*scales_kern[0],1*scales_kern[1]),subsample, img_stride, kern_stride)
...@@ -284,19 +319,20 @@ def get_shapes2(scales_img=(1,1), scales_kern=(1,1), subsample=(1,1), img_stride ...@@ -284,19 +319,20 @@ def get_shapes2(scales_img=(1,1), scales_kern=(1,1), subsample=(1,1), img_stride
(2*scales_kern[0],3*scales_kern[1]),subsample, img_stride, kern_stride) (2*scales_kern[0],3*scales_kern[1]),subsample, img_stride, kern_stride)
return shapes return shapes
def get_valid_shapes(): def get_valid_shapes():
# img shape, kern shape, subsample shape # img shape, kern shape, subsample shape
shapes = get_basic_shapes() shapes = get_basic_shapes()
shapes +=get_shapes2() shapes += get_shapes2()
#test image stride #test image stride
shapes += get_shapes2(scales_img=(2,2),img_stride=(1,2)) shapes += get_shapes2(scales_img=(2, 2), img_stride=(1, 2))
shapes += get_shapes2(scales_img=(2,2),img_stride=(2,1)) shapes += get_shapes2(scales_img=(2, 2), img_stride=(2, 1))
shapes += get_shapes2(scales_img=(2,2),img_stride=(2,2)) shapes += get_shapes2(scales_img=(2, 2), img_stride=(2, 2))
shapes += get_shapes2(scales_img=(2,2),img_stride=(-1,-1)) shapes += get_shapes2(scales_img=(2, 2), img_stride=(-1, -1))
shapes += get_shapes2(scales_img=(2,2),kern_stride=(-1,-1)) shapes += get_shapes2(scales_img=(2, 2), kern_stride=(-1, -1))
#test subsample done in a separate fct #test subsample done in a separate fct
...@@ -333,161 +369,192 @@ def get_valid_shapes(): ...@@ -333,161 +369,192 @@ def get_valid_shapes():
] ]
return shapes return shapes
def test_valid_0_2(): def test_valid_0_2():
shapes = get_valid_shapes() shapes = get_valid_shapes()
version=[0,2] version = [0, 2]
verbose=0 verbose = 0
random = True random = True
print_ = False print_ = False
ones = False ones = False
if ones: if ones:
random = False random = False
shapes2=[] shapes2 = []
for id,(ishape, kshape, subshape, istride, kstride) in enumerate(shapes): for id, (ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
oshape=[ishape[0]]+[kshape[0]]+list(numpy.asarray(ishape[2:])-numpy.asarray(kshape[2:])+numpy.asarray([1,1])) oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
if oshape[3]> 512: numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1]))
if oshape[3] > 512:
continue continue
if ishape[1]>1: if ishape[1] > 1:
continue continue
if (numpy.prod(ishape[2:])+numpy.prod(kshape[2:]))*4>(16*1024-150): if ((numpy.prod(ishape[2:]) + numpy.prod(kshape[2:])) * 4 >
(16 * 1024 - 150)):
continue continue
if subshape==(1,1): if subshape == (1, 1):
shapes2.append((ishape, kshape, subshape, istride, kstride)) shapes2.append((ishape, kshape, subshape, istride, kstride))
shapes = shapes2 shapes = shapes2
exec_conv(version, shapes, verbose, random, 'valid', print_=print_, ones=ones, rtol=1.1e-5) exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5)
def test_valid_1_3_11_12(): def test_valid_1_3_11_12():
shapes = get_valid_shapes() shapes = get_valid_shapes()
version=[1,3,11,12] version = [1, 3, 11, 12]
verbose=0 verbose = 0
random = True random = True
print_ = False print_ = False
ones = False ones = False
if ones: if ones:
random = False random = False
shapes2=[] shapes2 = []
for id,(ishape, kshape, subshape, istride, kstride) in enumerate(shapes): for id, (ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
oshape=[ishape[0]]+[kshape[0]]+list(numpy.asarray(ishape[2:])-numpy.asarray(kshape[2:])+numpy.asarray([1,1])) oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
if oshape[3]> 512: numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1]))
if oshape[3] > 512:
continue continue
if (numpy.prod(ishape[2:])+numpy.prod(kshape[2:]))*4>(16*1024-150): if ((numpy.prod(ishape[2:]) + numpy.prod(kshape[2:])) * 4 >
(16 * 1024 - 150)):
continue continue
if subshape==(1,1): if subshape == (1, 1):
shapes2.append((ishape, kshape, subshape, istride, kstride)) shapes2.append((ishape, kshape, subshape, istride, kstride))
shapes = shapes2 shapes = shapes2
exec_conv(version, shapes, verbose, random, 'valid', print_=print_, ones=ones, rtol=1.1e-5) exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5)
def test_valid_4(): def test_valid_4():
shapes = get_valid_shapes() shapes = get_valid_shapes()
version=[4] version = [4]
verbose=0 verbose = 0
random = True random = True
print_ = False print_ = False
ones = False ones = False
if ones: if ones:
random = False random = False
shapes2=[] shapes2 = []
for id,(ishape, kshape, subshape, istride, kstride) in enumerate(shapes): for id, (ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
oshape=[ishape[0]]+[kshape[0]]+list(numpy.asarray(ishape[2:])-numpy.asarray(kshape[2:])+numpy.asarray([1,1])) oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
if oshape[3]> 512: numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1]))
if oshape[3] > 512:
continue continue
if ishape[1]>1: if ishape[1] > 1:
continue continue
if (kshape[2]*ishape[3]*4+numpy.prod(kshape[2:])*4)>(16*1024-150): if ((kshape[2] * ishape[3] * 4 + numpy.prod(kshape[2:]) * 4) >
(16 * 1024 - 150)):
continue continue
if subshape==(1,1): if subshape == (1, 1):
shapes2.append((ishape, kshape, subshape, istride, kstride)) shapes2.append((ishape, kshape, subshape, istride, kstride))
shapes = shapes2 shapes = shapes2
exec_conv(version, shapes, verbose, random, 'valid', print_=print_, ones=ones, rtol=1.1e-5) exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5)
def test_valid_5(): def test_valid_5():
shapes = get_valid_shapes() shapes = get_valid_shapes()
version=[5] version = [5]
verbose=0 verbose = 0
random = True random = True
print_ = False print_ = False
ones = False ones = False
if ones: if ones:
random = False random = False
shapes2=[] shapes2 = []
print len(shapes) print len(shapes)
for id,(ishape, kshape, subshape, istride, kstride) in enumerate(shapes): for id, (ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
oshape=[ishape[0]]+[kshape[0]]+list(numpy.asarray(ishape[2:])-numpy.asarray(kshape[2:])+numpy.asarray([1,1])) oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
if oshape[3]> 512: numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1]))
if oshape[3] > 512:
continue continue
if (kshape[2]*ishape[3]*4+numpy.prod(kshape[2:])*4)>(16*1024-150): if ((kshape[2] * ishape[3] * 4 + numpy.prod(kshape[2:]) * 4) >
(16 * 1024 - 150)):
continue continue
if subshape==(1,1): if subshape == (1, 1):
shapes2.append((ishape, kshape, subshape, istride, kstride)) shapes2.append((ishape, kshape, subshape, istride, kstride))
shapes = shapes2 shapes = shapes2
print len(shapes2) print len(shapes2)
exec_conv(version, shapes, verbose, random, 'valid', print_=print_, ones=ones, rtol=1.1e-5) exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5)
def test_valid_7_8_13(): def test_valid_7_8_13():
shapes = get_valid_shapes() shapes = get_valid_shapes()
# This is to test the "new" lower shared memory usage. # This is to test the "new" lower shared memory usage.
shapes.append(((10,30,60,60),(20,30,40,40), (1,1), (1,1), (1,1))) shapes.append(((10, 30, 60, 60), (20, 30, 40, 40),
version=[7,8,13] (1, 1), (1, 1), (1, 1)))
verbose=0 version = [7, 8, 13]
verbose = 0
random = True random = True
print_ = False print_ = False
ones = False ones = False
if ones: if ones:
random = False random = False
shapes2=[] shapes2 = []
print len(shapes) print len(shapes)
for id,(ishape, kshape, subshape, istride, kstride) in enumerate(shapes): for id, (ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
oshape=[ishape[0]]+[kshape[0]]+list(numpy.asarray(ishape[2:])-numpy.asarray(kshape[2:])+numpy.asarray([1,1])) oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
if oshape[2]*oshape[3]>512: numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1]))
if oshape[2] * oshape[3] > 512:
continue continue
if max(numpy.prod(ishape[2:])*4+2*kshape[3]*4, oshape[2]*oshape[3]*4*2)>(16*1024-150): if max(numpy.prod(ishape[2:]) * 4 + 2 * kshape[3] * 4,
oshape[2] * oshape[3] * 4 * 2) > (16 * 1024 - 150):
continue continue
if subshape==(1,1): if subshape == (1, 1):
shapes2.append((ishape, kshape, subshape, istride, kstride)) shapes2.append((ishape, kshape, subshape, istride, kstride))
shapes = shapes2 shapes = shapes2
print len(shapes2) print len(shapes2)
exec_conv(version, shapes, verbose, random, 'valid', print_=print_, ones=ones, rtol=1.1e-5) exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5)
def test_valid_9_10(): def test_valid_9_10():
shapes = get_valid_shapes() shapes = get_valid_shapes()
version=[9,10] version = [9, 10]
verbose=0 verbose = 0
random = True random = True
print_ = False print_ = False
ones = False ones = False
if ones: if ones:
random = False random = False
shapes2=[] shapes2 = []
print len(shapes) print len(shapes)
for id,(ishape, kshape, subshape, istride, kstride) in enumerate(shapes): for id, (ishape, kshape, subshape, istride, kstride) in enumerate(shapes):
oshape=[ishape[0]]+[kshape[0]]+list(numpy.asarray(ishape[2:])-numpy.asarray(kshape[2:])+numpy.asarray([1,1])) oshape = [ishape[0]] + [kshape[0]] + list(numpy.asarray(ishape[2:]) -
if oshape[3]> 512: numpy.asarray(kshape[2:]) +
numpy.asarray([1, 1]))
if oshape[3] > 512:
continue continue
if (kshape[3]*4+ishape[3])>(16*1024-150): if (kshape[3] * 4 + ishape[3]) > (16 * 1024 - 150):
continue continue
if subshape==(1,1): if subshape == (1, 1):
shapes2.append((ishape, kshape, subshape, istride, kstride)) shapes2.append((ishape, kshape, subshape, istride, kstride))
shapes = shapes2 shapes = shapes2
print len(shapes2) print len(shapes2)
exec_conv(version, shapes, verbose, random, 'valid', print_=print_, ones=ones, rtol=1.1e-5) exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5)
def test_valid(): def test_valid():
shapes = get_valid_shapes() shapes = get_valid_shapes()
...@@ -495,8 +562,8 @@ def test_valid(): ...@@ -495,8 +562,8 @@ def test_valid():
#shapes=shapes[400:426] #shapes=shapes[400:426]
# I put -1 in case we forget to add version in the test to. # I put -1 in case we forget to add version in the test to.
# I put -2 to test the reference version. # I put -2 to test the reference version.
version=[-2,-1,6] version = [-2, -1, 6]
verbose=0 verbose = 0
# version=[1] # version=[1]
random = True random = True
...@@ -505,17 +572,19 @@ def test_valid(): ...@@ -505,17 +572,19 @@ def test_valid():
if ones: if ones:
random = False random = False
exec_conv(version, shapes, verbose, random, 'valid', print_=print_, ones=ones, rtol=1.1e-5) exec_conv(version, shapes, verbose, random, 'valid',
print_=print_, ones=ones, rtol=1.1e-5)
def test_full(): def test_full():
shapes = get_basic_shapes() shapes = get_basic_shapes()
shapes +=get_shapes2() shapes += get_shapes2()
#test image stride #test image stride
shapes += get_shapes2(scales_img=(2,2),img_stride=(1,2)) shapes += get_shapes2(scales_img=(2, 2), img_stride=(1, 2))
shapes += get_shapes2(scales_img=(2,2),img_stride=(2,1)) shapes += get_shapes2(scales_img=(2, 2), img_stride=(2, 1))
shapes += get_shapes2(scales_img=(2,2),img_stride=(2,2)) shapes += get_shapes2(scales_img=(2, 2), img_stride=(2, 2))
shapes += get_shapes2(scales_img=(2,2),img_stride=(-1,-1)) shapes += get_shapes2(scales_img=(2, 2), img_stride=(-1, -1))
shapes += get_shapes2(scales_img=(2,2),kern_stride=(-1,-1)) shapes += get_shapes2(scales_img=(2, 2), kern_stride=(-1, -1))
#test subsample done in a separate fct #test subsample done in a separate fct
...@@ -557,13 +626,14 @@ def test_full(): ...@@ -557,13 +626,14 @@ def test_full():
] ]
# shapes=shapes[:277] # shapes=shapes[:277]
version=[-2,-1,0,1,2,3,4,5] version = [-2, -1, 0, 1, 2, 3, 4, 5]
verbose=0 verbose = 0
# version=[4] # version=[4]
random=True random = True
exec_conv(version, shapes, verbose, random, 'full') exec_conv(version, shapes, verbose, random, 'full')
def test_subsample(): def test_subsample():
# implement when # implement when
shapes = [ shapes = [
...@@ -573,14 +643,14 @@ def test_subsample(): ...@@ -573,14 +643,14 @@ def test_subsample():
, ((4, 2, 10, 10), (3, 2, 2, 2), (3, 3), (1,1), (1,1)) , ((4, 2, 10, 10), (3, 2, 2, 2), (3, 3), (1,1), (1,1))
, ((4, 2, 10, 10), (3, 2, 2, 2), (3, 1), (1,1), (1,1)) , ((4, 2, 10, 10), (3, 2, 2, 2), (3, 1), (1,1), (1,1))
] ]
shapes += get_shapes2(scales_img=(2,2),subsample=(1,1)) shapes += get_shapes2(scales_img=(2, 2), subsample=(1, 1))
shapes += get_shapes2(scales_img=(2,2),subsample=(1,2)) shapes += get_shapes2(scales_img=(2, 2), subsample=(1, 2))
shapes += get_shapes2(scales_img=(2,2),subsample=(2,1)) shapes += get_shapes2(scales_img=(2, 2), subsample=(2, 1))
shapes += get_shapes2(scales_img=(2,2),subsample=(2,2)) shapes += get_shapes2(scales_img=(2, 2), subsample=(2, 2))
#We put only the version that implement the subsample to make the test faster. #We put only the version that implement the subsample to make the test faster.
version_valid = [-2,-1,1,3,11,12] version_valid = [-2, -1, 1, 3, 11, 12]
version_full = [-2,-1] version_full = [-2, -1]
verbose = 0 verbose = 0
random = True random = True
print_ = False print_ = False
...@@ -588,8 +658,10 @@ def test_subsample(): ...@@ -588,8 +658,10 @@ def test_subsample():
if ones: if ones:
random = False random = False
exec_conv(version_valid, shapes, verbose, random, 'valid', print_=print_, ones=ones) exec_conv(version_valid, shapes, verbose, random, 'valid',
exec_conv(version_full, shapes, verbose, random, 'full', print_=print_, ones=ones) print_=print_, ones=ones)
exec_conv(version_full, shapes, verbose, random, 'full',
print_=print_, ones=ones)
## See #616 ## See #616
#def test_logical_shapes(): #def test_logical_shapes():
...@@ -614,7 +686,8 @@ class TestConv2DGPU(unittest.TestCase): ...@@ -614,7 +686,8 @@ class TestConv2DGPU(unittest.TestCase):
theano_mode_orig = theano_mode theano_mode_orig = theano_mode
try: try:
if theano.config.mode in ['DebugMode', 'DEBUG_MODE']: if theano.config.mode in ['DebugMode', 'DEBUG_MODE']:
theano_mode = theano.compile.mode.get_mode('FAST_RUN').including('gpu') theano_mode = theano.compile.mode.get_mode(
'FAST_RUN').including('gpu')
for mode in ['valid', 'full']: for mode in ['valid', 'full']:
for shapes in [((3, 2, 8, 8), (4, 2, 5, 5), (8, 8)), for shapes in [((3, 2, 8, 8), (4, 2, 5, 5), (8, 8)),
((3, 2, 8, 8), (4, 2, 5, 5), (5, 8)), ((3, 2, 8, 8), (4, 2, 5, 5), (5, 8)),
...@@ -622,16 +695,21 @@ class TestConv2DGPU(unittest.TestCase): ...@@ -622,16 +695,21 @@ class TestConv2DGPU(unittest.TestCase):
# We use only the number of columns. # We use only the number of columns.
]: ]:
self.assertRaises(ValueError, _params_allgood, shapes[0], shapes[1], self.assertRaises(ValueError, _params_allgood,
verbose=verbose, random=random, mode=mode, shapes[0], shapes[1],
print_=print_, ones=ones, compile_kshp=shapes[2]) verbose=verbose, random=random,
mode=mode,
print_=print_, ones=ones,
compile_kshp=shapes[2])
finally: finally:
theano_mode = theano_mode_orig theano_mode = theano_mode_orig
def _test_dummy(): def _test_dummy():
ishape = (1, 1, 5, 5) ishape = (1, 1, 5, 5)
kshape = (1, 1, 3, 3) kshape = (1, 1, 3, 3)
mode = 'valid' mode = 'valid'
subsample = (1,1) subsample = (1, 1)
npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32') npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32') npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')
...@@ -696,14 +774,14 @@ def benchmark(): ...@@ -696,14 +774,14 @@ def benchmark():
,((2, 30,116,116), (20, 30, 9,9), (1,1), (1,1), (1,1))#full conv_reference_full ,((2, 30,116,116), (20, 30, 9,9), (1,1), (1,1), (1,1))#full conv_reference_full
] ]
# shapes_valid=shapes_valid[-1:] # shapes_valid=shapes_valid[-1:]
# shapes_full=shapes_full[-1:] # shapes_full=shapes_full[-1:]
version=[-1] version = [-1]
verbose=1 verbose = 1
random=True random = True
exec_conv(version, shapes_valid, verbose, random, 'valid', print_=None, rtol=1e-3) exec_conv(version, shapes_valid, verbose, random, 'valid',
print_=None, rtol=1e-3)
exec_conv(version, shapes_full, verbose, random, 'full') exec_conv(version, shapes_full, verbose, random, 'full')
...@@ -719,5 +797,3 @@ def test_stack_rows_segfault_070312(): ...@@ -719,5 +797,3 @@ def test_stack_rows_segfault_070312():
nkern=1, bsize=1) nkern=1, bsize=1)
f = theano.function([], [], updates={out: op(img, kern)}) f = theano.function([], [], updates={out: op(img, kern)})
f() f()
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论