for (int col=0; col < kern_wid; col++) {//loop over col
sum+=data[col]*kern[col];
}
}
/**
 * Cooperatively fill the first N elements of dst with `value`.
 *
 * Each participating thread passes its own index `thread_id` in
 * [0, nb_thread) and the total participant count `nb_thread`; thread
 * `thread_id` writes elements thread_id, thread_id + nb_thread, ...
 * No synchronization is performed here; callers that need the buffer
 * visible to other threads must barrier afterwards.
 */
__device__ void fill(float * dst, int N, float value, int thread_id, int nb_thread){
    int idx = thread_id;
    while (idx < N) {
        dst[idx] = value;
        idx += nb_thread;
    }
}
/**
 * Integer division of a by b, rounded up (ceiling) for non-negative
 * operands.  Used to size grid/block splits.  Assumes b != 0; negative
 * operands follow C++ truncating-division rules and are not expected here.
 *
 * Fix: the original read `a %% b`, which is not valid C/C++.  `%%` is the
 * escaped form of `%` inside a Python `%`-template string (this file is
 * generated code); the emitted source must contain a single `%`.
 */
template <typename T>
static T ceil_intdiv(T a, T b)
{
    return (a / b) + ((a % b) ? 1 : 0);
}
/**
* As conv_patch_stack, but used for the full convolution by padding the image in shared memory.
* I keep it separated from conv_patch as we take 19-20 registers, which is more than the 10/16 max for each thread, and thus this could lower the occupancy.
* Implementation of the valid convolution that keep the full image and the full kernel in shared memory
* each thread compute only one value for the output if split is true. Otherwise compute ceil((float)out_len/N) pixel.
* thread block size=out_wid, nb_rows (optimized value is ceil(out_len/N))
* grid block size=batch_id, nkern
* dynamic shared memory: full mem: (img_len+2*kern_len-2)*(img_wid+2*kern_wid-2)+kern_len*kern_wid
const int len_to_load=min(%(kern_len)s+nb_rows,%(img_len)s-out_row_iter*nb_rows);//nb rows to load, min(nb_rows for this iter, nb rows left in the image)
const int empty_row = max(%(kern_len)s-1-out_row_iter*nb_rows,0);//number of empty row at the start
//we need to reload some rows because, when we move to the next out_row, we lose the last load due to the stack.
const int previous_row = min(out_row_iter*nb_rows,%(kern_len)s-1);//number of row from last out_row iteration to reload
ifFalseandself.subsample==(1,1)andself.border_mode=='full'andself.versionin[3,4,5,-1]andout_dim_3<=512and((self.logical_kern_hw[0]+2*self.logical_kern_hw[0]-2)*img_wid_padded*4+self.logical_kern_hw[0]*self.logical_kern_hw[1]*4<(16*1024-128))andout_.dtype=='float32'andkern_.dtype=='float32'andimg_.dtype=='float32':#-128 as this is the number of shared memory used statically
int version = %(version)s;
return"""
int verbose = %(verbose)s;
int dx = %(dx)s;
int dy = %(dy)s;
CudaNdarray* img = %(img)s;
int mode;
CudaNdarray* kern = %(kern)s;
if (strcmp(mode_str, "full") == 0)
CudaNdarray* out_ = %(out)s;
CudaNdarray* out = out;
int version = %(version)s;
const int verbose = %(verbose)s;
if (!img || img->nd != 4)
{
{
PyErr_SetString(PyExc_ValueError, "required img of 4D");
mode = ConvMode_FULL;
return -1;
}
}
if (! kern || kern->nd != 4)
else if (strcmp(mode_str, "valid") == 0)
{
{
PyErr_SetString(PyExc_ValueError, "required kern of 4D");
mode = ConvMode_VALID;
return -1;
}
int out_dim[4]={CudaNdarray_HOST_DIMS(img)[0], CudaNdarray_HOST_DIMS(kern)[0],
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
//Max of 16k of shared memory
if(version==5)
while ((((%(kern_len)s+ceil_intdiv(out_len,nb_split)-1)+2*%(kern_len)s-2)*%(img_wid_padded)s*sizeof(float) + kern_size_byte)>16*1024) nb_split++;
//327 as we use 25 register
//version 5 will have only 1 block running at a time, so we can use 32 registers per thread, but there is some other stuff that forces the limit to be lower than 512.
int max_thread = (version!=5?327:450);
while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++;
if(version==-1 && out_size>512)version=4;
if(version==-1)version=3;
if(version==-1 && nb_split>1) version=4;
else if(version==-1) version=3;
else if(version==3 && nb_split!=1) version=4;//we force version 4 when we need more then 1 split as to be always execute.
PyObject * CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray * out, const int mode, const int subsample_rows, const int subsample_cols, const int version, const int verbose);
//if we remove the restriction img_size_byte+kern_size_byte>8*1024, we can enter in condition where we will lower the occupency due to shared memory and/or registers.
if ((version == -1) && (out_size<64 || img_size_byte+kern_size_byte>8*1024) && out_size<=256){
//condition for exec
if(!subsample &&
out_contiguous &&
out_size<512 &&//Maximum of 512 theads by block
(img_size_byte+2*kern_wid*sizeof(float)+out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
!work_complete)
version = 7; //conv_patch_stack_reduce, switch to version 8/13 automatically if needed.
}
if (!subsample && c_contiguous &&
(version==0||version==2||version==-1) &&
out_wid<512 &&//Maximum of 512 theads by block
nstack == 1 &&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_patch
{
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if(version==2 && out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>512) nb_split++;
case 1: CONV_PATCH_SPECIAL(1); break;//test_conv.py:test_valid
case 2: CONV_PATCH_SPECIAL(2); break;//test_conv.py:test_valid
case 3: CONV_PATCH_SPECIAL(3); break;//test_conv.py:test_valid
case 4: CONV_PATCH_SPECIAL(4); break;
case 5: CONV_PATCH_SPECIAL(5); break;
case 6: CONV_PATCH_SPECIAL(6); break;
case 7: CONV_PATCH_SPECIAL(7); break;
case 10: CONV_PATCH_SPECIAL(10); break;
#endif
default:
if(!msgdisplayed_conv_patch__kern_width) {
printf("OPTIMISATION WARNING: conv_patch template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
case 12: CONV_PATCH_STACK_SPECIAL(12); break;//on cifar10
case 21: CONV_PATCH_STACK_SPECIAL(21); break;//on cifar10
case 23: CONV_PATCH_STACK_SPECIAL(23); break;//test_nnet.py:test_lenet_64
case 24: CONV_PATCH_STACK_SPECIAL(24); break;//on cifar10
case 25: CONV_PATCH_STACK_SPECIAL(25); break;//on cifar10
case 28: CONV_PATCH_STACK_SPECIAL(28); break;
case 32: CONV_PATCH_STACK_SPECIAL(32); break;// Alex speed example
case 45: CONV_PATCH_STACK_SPECIAL(45); break;//used by test_nnet.py:test_lenet_108
#endif
//////// default case
default:
if(!msgdisplayed_conv_patch_stack__kern_width) {
printf("OPTIMISATION HINT: conv_patch_stack template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
if (verbose) printf("INFO: impl 'conv_patch_stack' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
if (!subsample && out_contiguous &&
(version==4||version==-1) &&
out_wid<512 &&//Maximum of 512 threads by block
nstack == 1 &&// don't implement the stack in the kernel.
kern_len*img_wid*sizeof(float)+kern_size_byte<shared_avail &&//their is only 16k of shared memory
!work_complete) //conv_rows
{
dim3 threads(out_wid);
dim3 grid(out_len, nbatch*nkern);
int shared_size=(kern_len*img_wid + kern_size)*sizeof(float);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
#define CONV_ROWS_SPECIAL(kern_wid) \
if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows<kern_wid, false>;\
else f = conv_rows<kern_wid, true>;\
switch(kern_wid){
#ifdef UNROLL_LOOP
case 1: CONV_ROWS_SPECIAL(1); break;//test_conv.py:test_valid
case 2: CONV_ROWS_SPECIAL(2); break;//test_conv.py:test_valid
case 3: CONV_ROWS_SPECIAL(3); break;//test_conv.py:test_valid
case 4: CONV_ROWS_SPECIAL(4); break;//test_conv.py:test_valid
case 5: CONV_ROWS_SPECIAL(5); break;//test_conv.py:test_valid
// case 6: CONV_ROWS_SPECIAL(6); break;
case 7: CONV_ROWS_SPECIAL(7); break;//used by test_nnet.py:test_lenet_108
// case 8: CONV_ROWS_SPECIAL(8); break;
case 9: CONV_ROWS_SPECIAL(9); break;//used by test_nnet.py:test_lenet_256
case 10: CONV_ROWS_SPECIAL(10); break;//test_conv.py:test_valid
//////// Special cases
case 28: CONV_ROWS_SPECIAL(28); break;
#endif
//////// default case
default:
if(!msgdisplayed_conv_rows__kern_width){
printf("OPTIMISATION HINT: conv_rows template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
int shared_size=((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
#define CONV_ROWS_STACK_SPECIAL(kern_wid) \
if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_stack<kern_wid, false>;\
else f = conv_rows_stack<kern_wid, true>;\
switch(kern_wid){
#ifdef UNROLL_LOOP
case 1: CONV_ROWS_STACK_SPECIAL(1); break;//test_conv.py:test_valid
case 2: CONV_ROWS_STACK_SPECIAL(2); break;
case 3: CONV_ROWS_STACK_SPECIAL(3); break;//test_conv.py:test_valid
case 4: CONV_ROWS_STACK_SPECIAL(4); break;//test_conv.py:test_valid
case 5: CONV_ROWS_STACK_SPECIAL(5); break;//test_conv.py:test_valid
case 6: CONV_ROWS_STACK_SPECIAL(6); break;//test_conv.py:test_valid
case 7: CONV_ROWS_STACK_SPECIAL(7); break;//test_nnet.py:test_lenet_108
case 8: CONV_ROWS_STACK_SPECIAL(8); break;//test_conv.py:test_valid
case 9: CONV_ROWS_STACK_SPECIAL(9); break;//test_nnet.py:test_lenet_256
case 10: CONV_ROWS_STACK_SPECIAL(10); break;//test_conv.py:test_valid
//////// Special cases
case 23: CONV_ROWS_STACK_SPECIAL(23); break;//test_conv.py:test_valid
case 24: CONV_ROWS_STACK_SPECIAL(24); break;//test_conv.py:test_valid
case 28: CONV_ROWS_STACK_SPECIAL(28); break;//test_conv.py:test_valid
case 45: CONV_ROWS_STACK_SPECIAL(45); break;//test_nnet.py:test_lenet_64
case 102: CONV_ROWS_STACK_SPECIAL(102); break;//test_nnet.py:test_lenet_108
#endif
//////// default case
default:
if(!msgdisplayed_conv_rows_stack__kern_width){
printf("OPTIMISATION HINT: conv_rows_stack template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
if (verbose) printf("INFO: used 'conv_rows_stack2' version %s with %d row(s).\n",(version==9?"'load full kernel'":"'load 1 kern row at a time'"),nb_row);
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i version=%d\n",
if (verbose) printf("INFO: impl 'conv_rows_stack2' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
//version 8 is the same but we force the split. The split is need in case we have too much threads. This happen frequently if the kernel length is big. Big kernel is frequent in the gradient.
//version 8 need a minimum of kernel length as we force the split.
//version 8 is needed to test more easily this kernel template parameter.
(version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split.
(version!=13||kern_len>1) && //version 13 need a minimal kernel length as big as the split.
(img_size_byte+2*kern_wid*sizeof(float)+out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
!work_complete) //conv_patch_stack_reduce
{
int nb_split=1;
int full_kern=true;
if(version==8||version==13) nb_split++;//force the split.
if(version==13)full_kern=false;
while(ceil_intdiv(kern_len,nb_split)>64)nb_split++;//device 1.3 have a max of 64 thread in z
printf("OPTIMISATION HINT: conv_patch_stack_reduce template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray * out, int subsample_rows, int subsample_cols, int version = -1, int verbose=0)
{
const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file.
int work_complete = 0;
if (img->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required img of 4D");
return -1;
}
if (kern->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required kern of 4D");
return -1;
}
if (out->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required out of 4D");
return -1;
}
if (0)
{
//TODO: rethink these to use physical / logical dimensions, subsampling, offsets, etc.
bool img_batch_stack_contiguous = (img_stride_stack==img_stride_row*img_len) && (img_stride_batch==img_stride_stack*nstack);//don't support stride for nbatch and nstack
//if the lower 2 dims are c_contiguous but flipped, unflipping the stride and not flipping the kernel in shared memroy
//allow to use a version that use less registers(so is faster)
//the unflipped version of variable have the original value when we don't need to unflip it, but have the new value when we unflip it.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
//Max of 16k of shared memory
if(version==5)
while ((((kern_len+ceil_intdiv(out_len,nb_split)-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte)>shared_avail) nb_split++;
//327 as we use 25 register
//version 5 will have only 1 block running at a time, so we can use 32 registers per threads, but their is some other stuff that for the limit to bu lower then 512.
int max_thread = (version!=5?327:450);
while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++;
if(version==-1 && out_size>512)version=4;
if(version==-1)version=3;
if(version==-1 && nb_split>1) version=4;
else if(version==-1) version=3;
else if(version==3 && nb_split!=1) version=4;//we force version 4 when we need more then 1 split as to be always execute.
printf("OPTIMISATION HINT: conv_full_patch_stack_padded template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
if (verbose) printf("INFO: impl 'conv_full_patch' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
if (false && !subsample && //disabled as test fail for this kernel
(version==1||version==-1) &&
out_size<512 &&//Maximum of 512 theads by block
(nbatch > 20 || version==1) && // we only launch nbatch blocks, so make sure there is enough to be worth it, but if we specify the version, this check should not be done to allow testing.
nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_full_load_everything
{
dim3 threads(out_wid, out_len);
dim3 grid(nbatch);
int shared_size=(img_size + kern_size)*nstack*sizeof(float);
//TODO assert c_continious for img, kern and out in the 2 inner dimensions.
//case 10: f = conv_full_load_everything<10>; break;
//case 30: f = conv_full_load_everything<30>; break; //This is actually slower than the general version??
#endif
default:
printf("OPTIMISATION HINT: conv_full_load_everything template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
f = conv_full_load_everything<0>;
};
f<<< grid, threads, shared_size>>>
(img->devdata,
kern->devdata,
out->devdata,
img_len, img_wid,
kern_len, kern_wid,
nkern, nstack,
CudaNdarray_HOST_STRIDES(img)[3],
CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(img)[0],
CudaNdarray_HOST_STRIDES(kern)[3],
CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[0]
);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
if (verbose) printf("INFO: used 'conv_full_load_everything' version\n");
{"dot", CudaNdarray_Dot, METH_VARARGS, "Returns the matrix product of two CudaNdarray arguments."},
{"dot", CudaNdarray_Dot, METH_VARARGS, "Returns the matrix product of two CudaNdarray arguments."},
{"conv", (PyCFunction)CudaNdarray_Conv_VARARGS, METH_VARARGS|METH_KEYWORDS, "Returns the 2D convolution of one CudaNdarray argument with another. WRITEME"},
{"gpu_init", CudaNdarray_gpu_init, METH_VARARGS, "Allow to select the gpu card to use."},
{"gpu_init", CudaNdarray_gpu_init, METH_VARARGS, "Allow to select the gpu card to use."},
{"filter", filter, METH_VARARGS, "no doc!"},
{"filter", filter, METH_VARARGS, "no doc!"},
{NULL, NULL, NULL, NULL} /* Sentinel */
{NULL, NULL, NULL, NULL} /* Sentinel */
...
@@ -2379,1273 +2283,3 @@ CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pattern
...
@@ -2379,1273 +2283,3 @@ CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pattern
//if we remove the restriction img_size_byte+kern_size_byte>8*1024, we can enter in condition where we will lower the occupency due to shared memory and/or registers.
if ((version == -1) && (out_size<64 || img_size_byte+kern_size_byte>8*1024) && out_size<=256){
//condition for exec
if(!subsample &&
out_contiguous &&
out_size<512 &&//Maximum of 512 theads by block
(img_size_byte+2*kern_wid*sizeof(float)+out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
!work_complete)
version = 7; //conv_patch_stack_reduce, switch to version 8/13 automatically if needed.
}
if (!subsample && c_contiguous &&
(version==0||version==2||version==-1) &&
out_wid<512 &&//Maximum of 512 theads by block
nstack == 1 &&// don't implement the stack in the kernel.
img_size_byte+kern_size_byte<shared_avail && //their is only 16k of shared memory
!work_complete) //conv_patch
{
int nb_split=1;//The number of split (i.e. the number of output pixel each thread compute.)
if(version==2 && out_len>1)nb_split++;//to force the use of split=true when testing.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
while (ceil_intdiv(out_len,nb_split)*out_wid>512) nb_split++;
case 1: CONV_PATCH_SPECIAL(1); break;//test_conv.py:test_valid
case 2: CONV_PATCH_SPECIAL(2); break;//test_conv.py:test_valid
case 3: CONV_PATCH_SPECIAL(3); break;//test_conv.py:test_valid
case 4: CONV_PATCH_SPECIAL(4); break;
case 5: CONV_PATCH_SPECIAL(5); break;
case 6: CONV_PATCH_SPECIAL(6); break;
case 7: CONV_PATCH_SPECIAL(7); break;
case 10: CONV_PATCH_SPECIAL(10); break;
#endif
default:
if(!msgdisplayed_conv_patch__kern_width) {
printf("OPTIMISATION WARNING: conv_patch template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
case 12: CONV_PATCH_STACK_SPECIAL(12); break;//on cifar10
case 21: CONV_PATCH_STACK_SPECIAL(21); break;//on cifar10
case 23: CONV_PATCH_STACK_SPECIAL(23); break;//test_nnet.py:test_lenet_64
case 24: CONV_PATCH_STACK_SPECIAL(24); break;//on cifar10
case 25: CONV_PATCH_STACK_SPECIAL(25); break;//on cifar10
case 28: CONV_PATCH_STACK_SPECIAL(28); break;
case 32: CONV_PATCH_STACK_SPECIAL(32); break;// Alex speed example
case 45: CONV_PATCH_STACK_SPECIAL(45); break;//used by test_nnet.py:test_lenet_108
#endif
//////// default case
default:
if(!msgdisplayed_conv_patch_stack__kern_width) {
printf("OPTIMISATION HINT: conv_patch_stack template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
if (verbose) printf("INFO: impl 'conv_patch_stack' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
if (!subsample && out_contiguous &&
(version==4||version==-1) &&
out_wid<512 &&//Maximum of 512 threads by block
nstack == 1 &&// don't implement the stack in the kernel.
kern_len*img_wid*sizeof(float)+kern_size_byte<shared_avail &&//their is only 16k of shared memory
!work_complete) //conv_rows
{
dim3 threads(out_wid);
dim3 grid(out_len, nbatch*nkern);
int shared_size=(kern_len*img_wid + kern_size)*sizeof(float);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
#define CONV_ROWS_SPECIAL(kern_wid) \
if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows<kern_wid, false>;\
else f = conv_rows<kern_wid, true>;\
switch(kern_wid){
#ifdef UNROLL_LOOP
case 1: CONV_ROWS_SPECIAL(1); break;//test_conv.py:test_valid
case 2: CONV_ROWS_SPECIAL(2); break;//test_conv.py:test_valid
case 3: CONV_ROWS_SPECIAL(3); break;//test_conv.py:test_valid
case 4: CONV_ROWS_SPECIAL(4); break;//test_conv.py:test_valid
case 5: CONV_ROWS_SPECIAL(5); break;//test_conv.py:test_valid
// case 6: CONV_ROWS_SPECIAL(6); break;
case 7: CONV_ROWS_SPECIAL(7); break;//used by test_nnet.py:test_lenet_108
// case 8: CONV_ROWS_SPECIAL(8); break;
case 9: CONV_ROWS_SPECIAL(9); break;//used by test_nnet.py:test_lenet_256
case 10: CONV_ROWS_SPECIAL(10); break;//test_conv.py:test_valid
//////// Special cases
case 28: CONV_ROWS_SPECIAL(28); break;
#endif
//////// default case
default:
if(!msgdisplayed_conv_rows__kern_width){
printf("OPTIMISATION HINT: conv_rows template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
int shared_size=((kern_len+nb_row-1)*img_wid + kern_size)*sizeof(float);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int, int, int,
int, int);
#define CONV_ROWS_STACK_SPECIAL(kern_wid) \
if(!img_contiguous_2d || !kern_contiguous_2d) f = conv_rows_stack<kern_wid, false>;\
else f = conv_rows_stack<kern_wid, true>;\
switch(kern_wid){
#ifdef UNROLL_LOOP
case 1: CONV_ROWS_STACK_SPECIAL(1); break;//test_conv.py:test_valid
case 2: CONV_ROWS_STACK_SPECIAL(2); break;
case 3: CONV_ROWS_STACK_SPECIAL(3); break;//test_conv.py:test_valid
case 4: CONV_ROWS_STACK_SPECIAL(4); break;//test_conv.py:test_valid
case 5: CONV_ROWS_STACK_SPECIAL(5); break;//test_conv.py:test_valid
case 6: CONV_ROWS_STACK_SPECIAL(6); break;//test_conv.py:test_valid
case 7: CONV_ROWS_STACK_SPECIAL(7); break;//test_nnet.py:test_lenet_108
case 8: CONV_ROWS_STACK_SPECIAL(8); break;//test_conv.py:test_valid
case 9: CONV_ROWS_STACK_SPECIAL(9); break;//test_nnet.py:test_lenet_256
case 10: CONV_ROWS_STACK_SPECIAL(10); break;//test_conv.py:test_valid
//////// Special cases
case 23: CONV_ROWS_STACK_SPECIAL(23); break;//test_conv.py:test_valid
case 24: CONV_ROWS_STACK_SPECIAL(24); break;//test_conv.py:test_valid
case 28: CONV_ROWS_STACK_SPECIAL(28); break;//test_conv.py:test_valid
case 45: CONV_ROWS_STACK_SPECIAL(45); break;//test_nnet.py:test_lenet_64
case 102: CONV_ROWS_STACK_SPECIAL(102); break;//test_nnet.py:test_lenet_108
#endif
//////// default case
default:
if(!msgdisplayed_conv_rows_stack__kern_width){
printf("OPTIMISATION HINT: conv_rows_stack template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
if (verbose) printf("INFO: used 'conv_rows_stack2' version %s with %d row(s).\n",(version==9?"'load full kernel'":"'load 1 kern row at a time'"),nb_row);
}
else
{
if (verbose) printf("threads.x=%i, threads.y=%i, grid.x=%i, grid.y=%i, shared_size=%i, nb_threads=%i version=%d\n",
if (verbose) printf("INFO: impl 'conv_rows_stack2' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
//version 8 is the same but we force the split. The split is need in case we have too much threads. This happen frequently if the kernel length is big. Big kernel is frequent in the gradient.
//version 8 need a minimum of kernel length as we force the split.
//version 8 is needed to test more easily this kernel template parameter.
(version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split.
(version!=13||kern_len>1) && //version 13 need a minimal kernel length as big as the split.
(img_size_byte+2*kern_wid*sizeof(float)+out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
!work_complete) //conv_patch_stack_reduce
{
int nb_split=1;
int full_kern=true;
if(version==8||version==13) nb_split++;//force the split.
if(version==13)full_kern=false;
while(ceil_intdiv(kern_len,nb_split)>64)nb_split++;//device 1.3 have a max of 64 thread in z
printf("OPTIMISATION HINT: conv_patch_stack_reduce template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
CudaNdarray_conv_full(const CudaNdarray *img, const CudaNdarray * kern, CudaNdarray * out, int subsample_rows, int subsample_cols, int version = -1, int verbose=0)
{
const int shared_avail = SHARED_SIZE-150;//144 is the biggest static shared size used with compiling this file.
int work_complete = 0;
if (img->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required img of 4D");
return -1;
}
if (kern->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required kern of 4D");
return -1;
}
if (out->nd != 4)
{
PyErr_SetString(PyExc_ValueError, "required out of 4D");
return -1;
}
if (0)
{
//TODO: rethink these to use physical / logical dimensions, subsampling, offsets, etc.
bool img_batch_stack_contiguous = (img_stride_stack==img_stride_row*img_len) && (img_stride_batch==img_stride_stack*nstack);//don't support stride for nbatch and nstack
//if the lower 2 dims are c_contiguous but flipped, unflipping the stride and not flipping the kernel in shared memroy
//allow to use a version that use less registers(so is faster)
//the unflipped version of variable have the original value when we don't need to unflip it, but have the new value when we unflip it.
//we pass by ceil_intdiv in case the out_len is not a multiple of nb_split, we want nb_split the number of iteration.
//Max of 16k of shared memory
if(version==5)
while ((((kern_len+ceil_intdiv(out_len,nb_split)-1)+2*kern_len-2)*img_wid_padded*sizeof(float) + kern_size_byte)>shared_avail) nb_split++;
//327 as we use 25 register
//version 5 will have only 1 block running at a time, so we can use 32 registers per threads, but their is some other stuff that for the limit to bu lower then 512.
int max_thread = (version!=5?327:450);
while (ceil_intdiv(out_len,nb_split)*out_wid>max_thread) nb_split++;
if(version==-1 && out_size>512)version=4;
if(version==-1)version=3;
if(version==-1 && nb_split>1) version=4;
else if(version==-1) version=3;
else if(version==3 && nb_split!=1) version=4;//we force version 4 when we need more then 1 split as to be always execute.
printf("OPTIMISATION HINT: conv_full_patch_stack_padded template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
if (verbose) printf("INFO: impl 'conv_full_patch' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
}
if (false && !subsample && //disabled as test fail for this kernel
(version==1||version==-1) &&
out_size<512 &&//Maximum of 512 theads by block
(nbatch > 20 || version==1) && // we only launch nbatch blocks, so make sure there is enough to be worth it, but if we specify the version, this check should not be done to allow testing.
nstack*img_size_byte+nstack*kern_size_byte<shared_avail && //there is only 16k of shared memory
!work_complete) //conv_full_load_everything
{
dim3 threads(out_wid, out_len);
dim3 grid(nbatch);
int shared_size=(img_size + kern_size)*nstack*sizeof(float);
//TODO assert c_continious for img, kern and out in the 2 inner dimensions.
//case 10: f = conv_full_load_everything<10>; break;
//case 30: f = conv_full_load_everything<30>; break; //This is actually slower than the general version??
#endif
default:
printf("OPTIMISATION HINT: conv_full_load_everything template default add kern_wid=%d in %s at line %i to have an optimized version for your kern_wid\n", kern_wid, __FILE__, __LINE__);
f = conv_full_load_everything<0>;
};
f<<< grid, threads, shared_size>>>
(img->devdata,
kern->devdata,
out->devdata,
img_len, img_wid,
kern_len, kern_wid,
nkern, nstack,
CudaNdarray_HOST_STRIDES(img)[3],
CudaNdarray_HOST_STRIDES(img)[2],
CudaNdarray_HOST_STRIDES(img)[1],
CudaNdarray_HOST_STRIDES(img)[0],
CudaNdarray_HOST_STRIDES(kern)[3],
CudaNdarray_HOST_STRIDES(kern)[2],
CudaNdarray_HOST_STRIDES(kern)[1],
CudaNdarray_HOST_STRIDES(kern)[0]
);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
if (verbose) printf("INFO: used 'conv_full_load_everything' version\n");