提交 2d1cdeaa authored 作者: James Bergstra's avatar James Bergstra

GpuConv - new refcounting rules, and rewrite of valid version=13 aka conv_patch_stack_reduce

- fixes errors in conv_patch_stack_reduce when the entire kernel doesn't fit into shared memory - lowers the shared memory requirement of conv_patch_stack_reduce
上级 616089ff
...@@ -363,9 +363,10 @@ class GpuConv(Op): ...@@ -363,9 +363,10 @@ class GpuConv(Op):
return ['cuda_ndarray.cuh','<stdio.h>'] return ['cuda_ndarray.cuh','<stdio.h>']
def c_code_cache_version(self): def c_code_cache_version(self):
return (0,8) return (0,9) # raise this whenever modifying any of the support_code_files
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of these files
return open(os.path.join(os.path.split(__file__)[0],'conv_kernel.cu')).read()+\ return open(os.path.join(os.path.split(__file__)[0],'conv_kernel.cu')).read()+\
open(os.path.join(os.path.split(__file__)[0],'conv_full_kernel.cu')).read()+\ open(os.path.join(os.path.split(__file__)[0],'conv_full_kernel.cu')).read()+\
open(os.path.join(os.path.split(__file__)[0],'conv.cu')).read() open(os.path.join(os.path.split(__file__)[0],'conv.cu')).read()
...@@ -405,8 +406,7 @@ class GpuConv(Op): ...@@ -405,8 +406,7 @@ class GpuConv(Op):
CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s, %(out)s, CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s, %(out)s,
mode, dx, dy, version, verbose); mode, dx, dy, version, verbose);
if(%(out)s && %(out)s==out2) Py_XDECREF(%(out)s);
Py_DECREF(out2);//CudaNdarray_Conv incremented the count to out
%(out)s = out2; %(out)s = out2;
"""%sub """%sub
......
// REMEMBER TO RAISE c_code_cache_version when changing this file
//
enum { ConvMode_FULL, ConvMode_VALID }; enum { ConvMode_FULL, ConvMode_VALID };
PyObject * CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray * out, const int mode, const int subsample_rows, const int subsample_cols, const int version, const int verbose); PyObject * CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, CudaNdarray * out, const int mode, const int subsample_rows, const int subsample_cols, const int version, const int verbose);
...@@ -425,7 +427,10 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -425,7 +427,10 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
} }
} }
//version 8 is the same but we force the split. The split is need in case we have too much threads. This happen frequently if the kernel length is big. Big kernel is frequent in the gradient. //version 8 is the same but we force the split.
// The split is need in case we have too much threads.
// This happen frequently if the kernel length is big.
// Big kernel is frequent in the gradient.
//version 8 need a minimum of kernel length as we force the split. //version 8 need a minimum of kernel length as we force the split.
//version 8 is needed to test more easily this kernel template parameter. //version 8 is needed to test more easily this kernel template parameter.
//version 13 load only 1 kernel row at a time. //version 13 load only 1 kernel row at a time.
...@@ -434,8 +439,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -434,8 +439,8 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
out_size<512 &&//Maximum of 512 theads by block out_size<512 &&//Maximum of 512 theads by block
(version==7||version==8||version==13||version==-1) && (version==7||version==8||version==13||version==-1) &&
(version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split. (version!=8||kern_len>1) && //version 8 need a minimal kernel length as big as the split.
(version!=13||kern_len>1) && //version 13 need a minimal kernel length as big as the split. //version 13 need a minimal kernel length as big as the split.
(img_size_byte+2*kern_wid*sizeof(float)+out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce! (version!=13||kern_len>1) &&
!work_complete) //conv_patch_stack_reduce !work_complete) //conv_patch_stack_reduce
{ {
int nb_split=1; int nb_split=1;
...@@ -443,83 +448,99 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -443,83 +448,99 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
if(version==8||version==13) nb_split++;//force the split. if(version==8||version==13) nb_split++;//force the split.
if(version==13)full_kern=false; if(version==13)full_kern=false;
while(ceil_intdiv(kern_len,nb_split)>64)nb_split++;//device 1.3 have a max of 64 thread in z
while(out_size*ceil_intdiv(kern_len,nb_split)>512)nb_split++;
int shared_size=(img_size + kern_size + out_size*kern_len)*sizeof(float);
if(shared_size>=shared_avail){
//if we can't fit the kernel in shared memory, we can split it more.
full_kern=false;
assert((img_size+kern_wid*2+out_size*2)*sizeof(float)<=shared_avail);
shared_size=(img_size + kern_wid*ceil_intdiv(kern_len,nb_split) + out_size*ceil_intdiv(kern_len,nb_split))*sizeof(float);
while(shared_size>=shared_avail || ceil_intdiv(kern_len,nb_split)>64){
nb_split++;
shared_size=(img_size + kern_wid*ceil_intdiv(kern_len,nb_split) + out_size*ceil_intdiv(kern_len,nb_split))*sizeof(float);
}
}
int thread_z=ceil_intdiv(kern_len,nb_split); //thread_z is going to be ceil_intdiv(kern_len, nb_split)
assert(thread_z>0);//should not happen, but in case... // we need enough splits so that
assert(shared_size<=shared_avail); // a) thread_z fits in the 'z' threadIdx (i.e. is less than 64)
if(!full_kern) // b) thread_z * out_len * out_wid fits in the thread count
assert(thread_z!=kern_len); // c) the kernel doesn't need too much shared memory
dim3 threads(out_wid, out_len, thread_z); // constraint (a)
dim3 grid(nbatch,nkern); // device 1.3 have a max of 64 thread in z
while(ceil_intdiv(kern_len,nb_split)>64) nb_split++;
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int,
int, int,
int, int);
const bool split=thread_z!=kern_len; // constraint (b)
const bool ccontig=img_contiguous_2d && kern_contiguous_2d_unflipped; // (TODO: read the number of threads per block from the device)
while(out_size*ceil_intdiv(kern_len,nb_split)>512) nb_split++;
//printf("kern_flipped=%d, ccontig=%d, split=%d, full_kern=%d\n",kern_flipped,ccontig,split,full_kern); // tentative estimates (prior to contraint c)
//We will always be split when we don't load the full kernel int thread_z=ceil_intdiv(kern_len,nb_split);
#define CONV_PATCH_STACK_REDUCE_SPECIAL(kern_wid) \ int shared_size = sizeof(float)*(full_kern
if (kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, true>;\ ? std::max(img_size + kern_size, out_size*thread_z)
else if(kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, true>;\ : std::max(img_size + thread_z*kern_wid, out_size*thread_z));
else if(kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, true>;\
else if(kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, true>;\ // constraint (c)
else if(!kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, true>;\ while ((shared_size >= shared_avail) && (nb_split <= kern_len)){
else if(!kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, true>;\ //if we can't fit the kernel in shared memory, we must split it more.
else if(!kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, true>;\ nb_split++;
else if(!kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, true>;\ thread_z=ceil_intdiv(kern_len,nb_split);
/*else if(kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, false>;*/\ shared_size=sizeof(float)*std::max(
/*else if(kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, false>;*/\ img_size + kern_wid*thread_z,
else if(kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, false>;\ out_size*thread_z);
else if(kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, false>;\
/*else if(!kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, false>;*/\
/*else if(!kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, false>;*/\
else if(!kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, false>;\
else if(!kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, false>;
CONV_PATCH_STACK_REDUCE_SPECIAL(THEANO_KERN_WID);
if (verbose) printf("INFO: using 'conv_patch_stack_reduce' version kern_flipped=%i ccontig=%i nb_split=%d, preload_full_kern=%d\n",
kern_flipped,ccontig,nb_split,full_kern);
if (verbose>1) printf("threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, threads.z, grid.x, grid.y,
shared_size, threads.x * threads.y * threads.z);
f<<< grid, threads, shared_size>>>(img->devdata, kern_data_unflipped, out->devdata,
img_len, img_wid, kern_len, kern_wid,
nkern, nstack,
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch,
kern_stride_col_unflipped, kern_stride_row_unflipped,
kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
} }
else if (nb_split <= kern_len)
{ {
if (verbose) printf("threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z); assert(thread_z>0);//should not happen, but in case...
if (verbose) printf("INFO: impl 'conv_patch_stack_reduce' failed (%s), trying next implementation\n", if(!full_kern) assert(thread_z!=kern_len);
cudaGetErrorString(sts));
} dim3 threads(out_wid, out_len, thread_z);
dim3 grid(nbatch,nkern);
void (*f)(float*, float*, float*,
int, int, int, int,
int, int, int, int,
int, int,
int, int,
int, int);
const bool split=thread_z!=kern_len;
const bool ccontig=img_contiguous_2d && kern_contiguous_2d_unflipped;
//printf("kern_flipped=%d, ccontig=%d, split=%d, full_kern=%d\n",kern_flipped,ccontig,split,full_kern);
//We will always be split when we don't load the full kernel
#define CONV_PATCH_STACK_REDUCE_SPECIAL(kern_wid) \
if (kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, true>;\
else if(kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, true>;\
else if(kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, true>;\
else if(kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, true>;\
else if(!kern_flipped && ccontig && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, true>;\
else if(!kern_flipped && !ccontig && !split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, true>;\
else if(!kern_flipped && ccontig && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, true>;\
else if(!kern_flipped && !ccontig && split && full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, true>;\
/*else if(kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, false, false>;*/\
/*else if(kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, false, false>;*/\
else if(kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,true, true, false>;\
else if(kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<true,kern_wid,false, true, false>;\
/*else if(!kern_flipped && ccontig && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, false, false>;*/\
/*else if(!kern_flipped && !ccontig && !split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, false, false>;*/\
else if(!kern_flipped && ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,true, true, false>;\
else if(!kern_flipped && !ccontig && split && !full_kern) f=conv_patch_stack_reduce<false,kern_wid,false, true, false>;
CONV_PATCH_STACK_REDUCE_SPECIAL(THEANO_KERN_WID);
if (verbose) fprintf(stderr, "INFO: using 'conv_patch_stack_reduce' version kern_flipped=%i ccontig=%i nb_split=%d, preload_full_kern=%d\n",
kern_flipped,ccontig,nb_split,full_kern);
if (verbose>1) fprintf(stderr, "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i\n",
threads.x, threads.y, threads.z, grid.x, grid.y,
shared_size, threads.x * threads.y * threads.z);
f<<< grid, threads, shared_size>>>(img->devdata, kern_data_unflipped, out->devdata,
img_len, img_wid, kern_len, kern_wid,
nkern, nstack,
img_stride_col, img_stride_row, img_stride_stack, img_stride_batch,
kern_stride_col_unflipped, kern_stride_row_unflipped,
kern_stride_stack, kern_stride_nkern);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess == sts)
{
work_complete = true;
}
else
{
if (verbose) fprintf(stderr, "threads.x=%i, threads.y=%i, threads.z=%i, grid.x=%i, grid.y=%i,shared_size=%i, nb_threads=%i\n", threads.x, threads.y, threads.z, grid.x, grid.y, shared_size, threads.x * threads.y * threads.z);
if (verbose) fprintf(stderr, "INFO: impl 'conv_patch_stack_reduce' failed (%s), trying next implementation\n",
cudaGetErrorString(sts));
}
} // else no good nb_splits was found
} }
if (1 && (version==6||version==-1) && if (1 && (version==6||version==-1) &&
...@@ -647,6 +668,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -647,6 +668,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
return -1; return -1;
} }
} }
assert (work_complete);
return 0; return 0;
//PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kExp", cudaGetErrorString(err)); //PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kExp", cudaGetErrorString(err));
...@@ -1043,6 +1065,9 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, ...@@ -1043,6 +1065,9 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
const int subsample_rows, const int subsample_cols, const int subsample_rows, const int subsample_cols,
const int version, const int verbose) const int version, const int verbose)
{ {
// Re-use the out object if possible. If the out object it not used, then its refcount is not modified.
// If the out object is re-used then it is returned, and its refcount is incremented by 1.
//
if (img->nd != 4) { PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required"); return NULL;} if (img->nd != 4) { PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required"); return NULL;}
if (kern->nd != 4) { PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required"); return NULL;} if (kern->nd != 4) { PyErr_SetString(PyExc_ValueError, "CudaNdarray 4-D tensor required"); return NULL;}
...@@ -1062,33 +1087,36 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern, ...@@ -1062,33 +1087,36 @@ CudaNdarray_Conv(CudaNdarray *img, CudaNdarray * kern,
} }
out_dim[2] = ceil_intdiv(logical_rows, subsample_rows); out_dim[2] = ceil_intdiv(logical_rows, subsample_rows);
out_dim[3] = ceil_intdiv(logical_cols, subsample_cols); out_dim[3] = ceil_intdiv(logical_cols, subsample_cols);
CudaNdarray * rval = out; CudaNdarray * rval = NULL;
if(!(out && out->nd==4 && CudaNdarray_is_c_contiguous(out)
if ( out
&& out->nd==4
&& CudaNdarray_is_c_contiguous(out)
&& CudaNdarray_HOST_DIMS(out)[0]==out_dim[0] && CudaNdarray_HOST_DIMS(out)[0]==out_dim[0]
&& CudaNdarray_HOST_DIMS(out)[1]==out_dim[1] && CudaNdarray_HOST_DIMS(out)[1]==out_dim[1]
&& CudaNdarray_HOST_DIMS(out)[2]==out_dim[2] && CudaNdarray_HOST_DIMS(out)[2]==out_dim[2]
&& CudaNdarray_HOST_DIMS(out)[3]==out_dim[3])){ && CudaNdarray_HOST_DIMS(out)[3]==out_dim[3])
if (out) {
{ rval = out;
fprintf(stderr, "Warning: Conv is ignoring 'out' argument with wrong structure.\n"); Py_INCREF(rval);
Py_DECREF(out); }
} else
{
if (verbose) fprintf(stderr, "INFO: Conv is ignoring 'out' argument with wrong structure.\n");
rval = (CudaNdarray*)CudaNdarray_NewDims(4,out_dim); rval = (CudaNdarray*)CudaNdarray_NewDims(4,out_dim);
//rval might be null
} }
if ((rval==NULL) if ((rval==NULL)
|| ((mode==ConvMode_VALID) && CudaNdarray_conv_valid(img, kern, rval, subsample_rows, subsample_cols, version, verbose)) || ((mode==ConvMode_VALID) && CudaNdarray_conv_valid(img, kern, rval, subsample_rows, subsample_cols, version, verbose))
|| ((mode==ConvMode_FULL) && CudaNdarray_conv_full(img, kern, rval, subsample_rows, subsample_cols, version, verbose)) || ((mode==ConvMode_FULL) && CudaNdarray_conv_full(img, kern, rval, subsample_rows, subsample_cols, version, verbose))
) )
{ {
// if rval is something we just allocated, // if rval is something we just allocated,
// and there was a problem, then we have to free it. // and there was a problem, then we have to free it.
if (rval != out) Py_XDECREF(rval); Py_XDECREF(rval);
return NULL; return NULL;
} }
//TODO: Get refcount story clearer!
// This function does a weird thing as work-around with Conv_VARARGS
if (rval == out) Py_INCREF(rval);
return (PyObject*)rval; return (PyObject*)rval;
} }
// REMEMBER TO RAISE c_code_cache_version when changing this file
//
//implement the valid convolution only //implement the valid convolution only
/* /*
...@@ -38,6 +40,8 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) { ...@@ -38,6 +40,8 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) {
#define BS(i, j) Bs[i][j] #define BS(i, j) Bs[i][j]
#endif #endif
*/ */
#define MAX(a,b) ((a)>(b)?(a):(b))
#define MIN(a,b) ((a)<(b)?(a):(b))
const unsigned long int COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers const unsigned long int COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers
#define MASKED_OFFSET(src) (((int)((unsigned long int)src - (((unsigned long int)src) & COALESCED_ALIGN))) / sizeof(float)) #define MASKED_OFFSET(src) (((int)((unsigned long int)src - (((unsigned long int)src) & COALESCED_ALIGN))) / sizeof(float))
...@@ -45,8 +49,9 @@ const unsigned long int COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the tr ...@@ -45,8 +49,9 @@ const unsigned long int COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the tr
__device__ void load_to_shared(float * dst, const float * src, const int thread_id, int nb_thread, const int N, const bool flipped=false){ __device__ void load_to_shared(float * dst, const float * src, const int thread_id, int nb_thread, const int N, const bool flipped=false){
if (nb_thread < 64) if (nb_thread < 64)
{ {
if(flipped) if(flipped)
//TODO very slow on device before 1.3. make access to kern sequential and access to d_kern flipped. //TODO very slow on device before 1.3.
// make access to kern sequential and access to d_kern flipped.
for(int i=thread_id;i<N;i+=nb_thread) for(int i=thread_id;i<N;i+=nb_thread)
dst[i]=src[N - 1 - i]; dst[i]=src[N - 1 - i];
//dst[N-1-i]=src[i]; //dst[N-1-i]=src[i];
...@@ -88,10 +93,9 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_ ...@@ -88,10 +93,9 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_
const bool flipped=false, const bool c_contiguous=true){ const bool flipped=false, const bool c_contiguous=true){
if(flipped && ! c_contiguous){ if(flipped && ! c_contiguous){
for(int i=thread_id;i<nb_row*nb_col;i+=nb_thread) for(int i=thread_id;i<nb_row*nb_col;i+=nb_thread)
dst[nb_row*nb_col-1-i]=src[i/nb_col*stride_row+i%nb_col*stride_col]; dst[nb_row*nb_col-1-i]=src[(i/nb_col)*stride_row+(i%nb_col)*stride_col];
}else if(c_contiguous){ }else if(c_contiguous){
load_to_shared(dst, src, thread_id, nb_thread, nb_col*nb_row, flipped); load_to_shared(dst, src, thread_id, nb_thread, nb_col*nb_row, flipped);
}else if(flipped){//c_contiguous==true }else if(flipped){//c_contiguous==true
//TODO very slow on device before 1.3. make access to kern sequential and access to d_kern flipped. //TODO very slow on device before 1.3. make access to kern sequential and access to d_kern flipped.
int N=nb_col*nb_row; int N=nb_col*nb_row;
...@@ -440,10 +444,12 @@ conv_patch_stack_reduce( float* img, float* kern, float* out, ...@@ -440,10 +444,12 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
int kern_stride_col, int kern_stride_row, int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_nkern) int kern_stride_stack, int kern_stride_nkern)
{ {
int __shared__ out_len, out_wid, nb_thread_id; //int __shared__ out_len, out_wid, nb_thread_id;
out_len = img_len - kern_len + 1; //out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1; //out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x; const int out_wid = blockDim.x;
const int out_len = blockDim.y;
const int nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[]; extern __shared__ float s_data[];
...@@ -458,9 +464,16 @@ conv_patch_stack_reduce( float* img, float* kern, float* out, ...@@ -458,9 +464,16 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
int out_row = ty;//output row int out_row = ty;//output row
const int thread_id = tz*blockDim.y*blockDim.x+ty*blockDim.x+tx; const int thread_id = tz*blockDim.y*blockDim.x+ty*blockDim.x+tx;
float * d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID]; //d_img size [IMAGE_LEN * IMAGE_WID];
float * d_kern=&s_data[img_len * img_wid];//size of [(preload_full_kern?KERNEL_LEN:blockDim.z) * KERNEL_WID]; float * d_img=&s_data[0];
float * d_reduce=&s_data[img_len*img_wid+(preload_full_kern?kern_len:blockDim.z)*kern_wid];
//d_kern size[(preload_full_kern?KERNEL_LEN:blockDim.z) * KERNEL_WID]
float * d_kern=&s_data[img_len * img_wid];
//d_reduce size [n_threads]
//N.B. this overlaps with d_img and d_kern!
float * d_reduce=&s_data[0];
float sum = 0.0f; float sum = 0.0f;
kern+=kern_stride_nkern*blockIdx.y;//the good nkern kern+=kern_stride_nkern*blockIdx.y;//the good nkern
...@@ -471,30 +484,31 @@ conv_patch_stack_reduce( float* img, float* kern, float* out, ...@@ -471,30 +484,31 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
__syncthreads(); __syncthreads();
load_to_shared(d_img, img, thread_id, nb_thread_id, img_wid, img_len, load_to_shared(d_img, img, thread_id, nb_thread_id, img_wid, img_len,
img_stride_col, img_stride_row, false, c_contiguous); img_stride_col, img_stride_row, false, c_contiguous);
if(!(split && ! preload_full_kern))
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
if(split && ! preload_full_kern){ if(split && ! preload_full_kern){
for(int first_row=0, row=tz;first_row<kern_len;row+=blockDim.z, first_row+=blockDim.z){ for(int first_row=0;first_row<kern_len;first_row+=blockDim.z){
int idx3; //N.B. - Jan 30, 2011 with CUDA 3.2 I found that without the explicit cast to
//TODO: test/check for flipped_kern // (int)blockDim.z, idx3 would sometimes be negative. I'm rusty on my signed vs. unsigned
if(flipped_kern) // details, but that seemed really weird. tricky bug to find too.
idx3=(kern_len-(first_row)-blockDim.z);//the current last row flipped int idx3 = flipped_kern
else ? max((kern_len - (int)blockDim.z - first_row),0)
idx3=first_row; : first_row;
int len3 = min(blockDim.z, kern_len - first_row);
__syncthreads(); __syncthreads();
load_to_shared(d_kern, kern+idx3*kern_stride_row, thread_id, nb_thread_id, kern_wid, blockDim.z, load_to_shared(d_kern, kern+idx3*kern_stride_row, thread_id, nb_thread_id, kern_wid, len3,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous); kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads(); __syncthreads();
const float* idx_kern=&d_kern[tz*kern_stride_row]; const float* idx_kern=&d_kern[tz*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col]; const float* idx_in=&d_img[(first_row+tz+out_row)*img_wid+out_col];
float sum2 = 0; float sum2 = 0;
if(row<kern_len) if(tz<len3)
convolutionRowNoFlip<KERN_WIDTH>(sum2,idx_in,idx_kern,kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum2,idx_in,idx_kern,kern_wid);
sum+=sum2; sum+=sum2;
} }
}else if(split){ }else if(split){
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
for(int row=tz;row<kern_len;row+=blockDim.z){ for(int row=tz;row<kern_len;row+=blockDim.z){
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col]; const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
...@@ -504,18 +518,21 @@ conv_patch_stack_reduce( float* img, float* kern, float* out, ...@@ -504,18 +518,21 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
int row = tz;//The row of the kernel. int row = tz;//The row of the kernel.
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col]; const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid); convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
} }
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
} }
//reduce //reduce no sync because previous loop ends with sync
d_reduce[thread_id]=sum; d_reduce[thread_id]=sum;
__syncthreads(); __syncthreads();
if(thread_id<out_len*out_wid){ if(thread_id<out_len*out_wid){ // blockDim.x==out_wid, blockDim.y==out_len
sum=0; //sum=0;
for(int i=0;i<blockDim.z;i++){ for(int i=1;i<blockDim.z;i++){
sum+=d_reduce[thread_id+i*blockDim.x*blockDim.y]; sum+=d_reduce[thread_id+i*out_wid*out_len];
} }
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*blockIdx.y+//the output image out_wid*out_len*blockIdx.y+//the output image
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论