提交 2d1cdeaa authored 作者: James Bergstra's avatar James Bergstra

GpuConv - new refcounting rules, and rewrite of valid version=13 aka conv_patch_stack_reduce

- fixes errors in conv_patch_stack_reduce when the entire kernel doesn't fit into shared memory - lowers the shared memory requirement of conv_patch_stack_reduce
上级 616089ff
......@@ -363,9 +363,10 @@ class GpuConv(Op):
return ['cuda_ndarray.cuh','<stdio.h>']
def c_code_cache_version(self):
return (0,8)
return (0,9) # raise this whenever modifying any of the support_code_files
def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of these files
return open(os.path.join(os.path.split(__file__)[0],'conv_kernel.cu')).read()+\
open(os.path.join(os.path.split(__file__)[0],'conv_full_kernel.cu')).read()+\
open(os.path.join(os.path.split(__file__)[0],'conv.cu')).read()
......@@ -405,8 +406,7 @@ class GpuConv(Op):
CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s, %(out)s,
mode, dx, dy, version, verbose);
if(%(out)s && %(out)s==out2)
Py_DECREF(out2);//CudaNdarray_Conv incremented the count to out
Py_XDECREF(%(out)s);
%(out)s = out2;
"""%sub
......
// REMEMBER TO RAISE c_code_cache_version when changing this file
//
//implement the valid convolution only
/*
......@@ -38,6 +40,8 @@ for (int iter_m=0; iter_m < Os[0]; iter_m++) {
#define BS(i, j) Bs[i][j]
#endif
*/
#define MAX(a,b) ((a)>(b)?(a):(b))
#define MIN(a,b) ((a)<(b)?(a):(b))
const unsigned long int COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the trailing bits of pointers
#define MASKED_OFFSET(src) (((int)((unsigned long int)src - (((unsigned long int)src) & COALESCED_ALIGN))) / sizeof(float))
......@@ -45,8 +49,9 @@ const unsigned long int COALESCED_ALIGN = 0xFFFFFFFFFFFFFF00; // zero-out the tr
__device__ void load_to_shared(float * dst, const float * src, const int thread_id, int nb_thread, const int N, const bool flipped=false){
if (nb_thread < 64)
{
if(flipped)
//TODO very slow on device before 1.3. make access to kern sequential and access to d_kern flipped.
if(flipped)
//TODO very slow on device before 1.3.
// make access to kern sequential and access to d_kern flipped.
for(int i=thread_id;i<N;i+=nb_thread)
dst[i]=src[N - 1 - i];
//dst[N-1-i]=src[i];
......@@ -88,10 +93,9 @@ __device__ void load_to_shared(float * dst, const float * src, const int thread_
const bool flipped=false, const bool c_contiguous=true){
if(flipped && ! c_contiguous){
for(int i=thread_id;i<nb_row*nb_col;i+=nb_thread)
dst[nb_row*nb_col-1-i]=src[i/nb_col*stride_row+i%nb_col*stride_col];
dst[nb_row*nb_col-1-i]=src[(i/nb_col)*stride_row+(i%nb_col)*stride_col];
}else if(c_contiguous){
load_to_shared(dst, src, thread_id, nb_thread, nb_col*nb_row, flipped);
}else if(flipped){//c_contiguous==true
//TODO very slow on device before 1.3. make access to kern sequential and access to d_kern flipped.
int N=nb_col*nb_row;
......@@ -440,10 +444,12 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
int kern_stride_col, int kern_stride_row,
int kern_stride_stack, int kern_stride_nkern)
{
int __shared__ out_len, out_wid, nb_thread_id;
out_len = img_len - kern_len + 1;
out_wid = img_wid - kern_wid + 1;
nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
//int __shared__ out_len, out_wid, nb_thread_id;
//out_len = img_len - kern_len + 1;
//out_wid = img_wid - kern_wid + 1;
const int out_wid = blockDim.x;
const int out_len = blockDim.y;
const int nb_thread_id = blockDim.z*blockDim.y*blockDim.x;
extern __shared__ float s_data[];
......@@ -458,9 +464,16 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
int out_row = ty;//output row
const int thread_id = tz*blockDim.y*blockDim.x+ty*blockDim.x+tx;
float * d_img=&s_data[0];//size of [IMAGE_LEN * IMAGE_WID];
float * d_kern=&s_data[img_len * img_wid];//size of [(preload_full_kern?KERNEL_LEN:blockDim.z) * KERNEL_WID];
float * d_reduce=&s_data[img_len*img_wid+(preload_full_kern?kern_len:blockDim.z)*kern_wid];
//d_img size [IMAGE_LEN * IMAGE_WID];
float * d_img=&s_data[0];
//d_kern size[(preload_full_kern?KERNEL_LEN:blockDim.z) * KERNEL_WID]
float * d_kern=&s_data[img_len * img_wid];
//d_reduce size [n_threads]
//N.B. this overlaps with d_img and d_kern!
float * d_reduce=&s_data[0];
float sum = 0.0f;
kern+=kern_stride_nkern*blockIdx.y;//the good nkern
......@@ -471,30 +484,31 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
__syncthreads();
load_to_shared(d_img, img, thread_id, nb_thread_id, img_wid, img_len,
img_stride_col, img_stride_row, false, c_contiguous);
if(!(split && ! preload_full_kern))
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
if(split && ! preload_full_kern){
for(int first_row=0, row=tz;first_row<kern_len;row+=blockDim.z, first_row+=blockDim.z){
int idx3;
//TODO: test/check for flipped_kern
if(flipped_kern)
idx3=(kern_len-(first_row)-blockDim.z);//the current last row flipped
else
idx3=first_row;
for(int first_row=0;first_row<kern_len;first_row+=blockDim.z){
//N.B. - Jan 30, 2011 with CUDA 3.2 I found that without the explicit cast to
// (int)blockDim.z, idx3 would sometimes be negative. I'm rusty on my signed vs. unsigned
// details, but that seemed really weird. tricky bug to find too.
int idx3 = flipped_kern
? max((kern_len - (int)blockDim.z - first_row),0)
: first_row;
int len3 = min(blockDim.z, kern_len - first_row);
__syncthreads();
load_to_shared(d_kern, kern+idx3*kern_stride_row, thread_id, nb_thread_id, kern_wid, blockDim.z,
load_to_shared(d_kern, kern+idx3*kern_stride_row, thread_id, nb_thread_id, kern_wid, len3,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
const float* idx_kern=&d_kern[tz*kern_stride_row];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
const float* idx_kern=&d_kern[tz*kern_wid];
const float* idx_in=&d_img[(first_row+tz+out_row)*img_wid+out_col];
float sum2 = 0;
if(row<kern_len)
if(tz<len3)
convolutionRowNoFlip<KERN_WIDTH>(sum2,idx_in,idx_kern,kern_wid);
sum+=sum2;
}
}else if(split){
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
for(int row=tz;row<kern_len;row+=blockDim.z){
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
......@@ -504,18 +518,21 @@ conv_patch_stack_reduce( float* img, float* kern, float* out,
int row = tz;//The row of the kernel.
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid+out_col];
load_to_shared(d_kern, kern, thread_id, nb_thread_id, kern_wid, kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads();
convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
}
__syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
}
//reduce
//reduce no sync because previous loop ends with sync
d_reduce[thread_id]=sum;
__syncthreads();
if(thread_id<out_len*out_wid){
sum=0;
for(int i=0;i<blockDim.z;i++){
sum+=d_reduce[thread_id+i*blockDim.x*blockDim.y];
if(thread_id<out_len*out_wid){ // blockDim.x==out_wid, blockDim.y==out_len
//sum=0;
for(int i=1;i<blockDim.z;i++){
sum+=d_reduce[thread_id+i*out_wid*out_len];
}
out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*blockIdx.y+//the output image
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论