提交 0d5cffbe authored 作者: Sean Lee's avatar Sean Lee

Force instantiate kernel templates

上级 89f584bc
......@@ -122,7 +122,7 @@ conv_full_patch( const float* img, const float* kern, float* out,
//template c_contiguous: if true, the img and kern have are column and row contiguous else we use the stride value from the param. The image need to be c_contiguous in the nbatch and nstack dimensions.
template<bool img_c_contiguous_2d, bool kern_c_contiguous_2d>
__global__ void
__device__ inline void
conv_full_patch_stack( const float* img, const float* kern, float* out,
int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack,
......@@ -182,6 +182,31 @@ conv_full_patch_stack( const float* img, const float* kern, float* out,
out_row*out_wid+out_col] = sum;
}
/* C-linkage wrappers that force instantiation of the conv_full_patch_stack
 * device template for every <img_c_contiguous_2d, kern_c_contiguous_2d>
 * combination, exporting un-mangled __global__ symbols — presumably so the
 * kernels can be looked up by name at load time (TODO confirm against caller).
 * Suffix encoding (see the instantiation list below):
 *   suffix = img_c_contiguous_2d * 2 + kern_c_contiguous_2d
 * NOTE(review): macro renamed from __INSTANTIATE_CONV_FULL_PATCH_STACK —
 * identifiers starting with a double underscore are reserved for the
 * implementation in C++ ([lex.name]). The macro is #undef'd at the end of
 * this block, and the generated kernel names are unchanged, so callers are
 * unaffected.
 */
extern "C" {
#define INSTANTIATE_CONV_FULL_PATCH_STACK(suffix, ...) \
__global__ void \
conv_full_patch_stack_##suffix( \
    const float *img, const float *kern, float *out, \
    int img_len, int img_wid, \
    int kern_len, int kern_wid, int nkern, int nstack, \
    int img_stride_col, int img_stride_row, \
    int kern_stride_col, int kern_stride_row, \
    int kern_stride_stack, int kern_stride_nkern) \
{ \
    conv_full_patch_stack<__VA_ARGS__>( \
        img, kern, out, img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
        img_stride_col, img_stride_row, kern_stride_col, kern_stride_row, \
        kern_stride_stack, kern_stride_nkern); \
}
INSTANTIATE_CONV_FULL_PATCH_STACK(0, false, false)
INSTANTIATE_CONV_FULL_PATCH_STACK(1, false, true)
INSTANTIATE_CONV_FULL_PATCH_STACK(2, true, false)
INSTANTIATE_CONV_FULL_PATCH_STACK(3, true, true)
#undef INSTANTIATE_CONV_FULL_PATCH_STACK
}
/**
* As conv_patch_stack, but used for the full convolution by padding the image in shared memory.
* I keep it separated from conv_patch as we take 19-20 register which is more than the 10/16 max for each thread and thus this could lower the occupency.
......@@ -200,8 +225,8 @@ conv_full_patch_stack( const float* img, const float* kern, float* out,
* template low_mem: if true, as split but with use less dynamic shared memory but use more registers.
* if you set split and low_mem to true, we will use the low_mem version!
*/
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool low_mem >
__global__ void
template<bool flipped_kern, bool c_contiguous, bool split, bool low_mem >
__device__ inline void
conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const int img_len, const int img_wid,
const int kern_len, const int kern_wid,
......@@ -257,7 +282,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
}
}
out[batch_id*out_wid*out_len*nkern+//the good batch
......@@ -292,7 +317,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
}
if(out_row<out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch
......@@ -340,7 +365,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row-out_row_iter*nb_rows)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
}
}
if(out_row<out_len)
......@@ -351,6 +376,42 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
}
}
/* C-linkage wrappers that force instantiation of the
 * conv_full_patch_stack_padded device template for each supported
 * <flipped_kern, c_contiguous, split, low_mem> combination, exporting
 * un-mangled __global__ symbols — presumably so the kernels can be looked
 * up by name at load time (TODO confirm against caller).
 * Suffix encoding (bit-packed template arguments):
 *   suffix = flipped_kern*8 + c_contiguous*4 + split*2 + low_mem
 * Suffixes 3, 7, 11 and 15 (split && low_mem) are intentionally omitted:
 * per the template's own doc comment, when both flags are set the low_mem
 * path is used, so those combinations are redundant.
 * NOTE(review): macro renamed from __INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED —
 * identifiers starting with a double underscore are reserved for the
 * implementation in C++ ([lex.name]). The macro is #undef'd at the end of
 * this block, and the generated kernel names are unchanged, so callers are
 * unaffected.
 */
extern "C" {
#define INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(suffix, ...) \
__global__ void \
conv_full_patch_stack_padded_##suffix( \
    const float *img, const float *kern, float *out, \
    const int img_len, const int img_wid, \
    const int kern_len, const int kern_wid, \
    const int nkern, const int nstack, \
    const int img_stride_col, const int img_stride_row, \
    const int img_stride_stack, const int img_stride_batch, \
    const int kern_stride_col, const int kern_stride_row, \
    const int kern_stride_stack, const int kern_stride_nkern) \
{ \
    conv_full_patch_stack_padded<__VA_ARGS__>( \
        img, kern, out, img_len, img_wid, kern_len, kern_wid, nkern, nstack, \
        img_stride_col, img_stride_row, img_stride_stack, img_stride_batch, \
        kern_stride_col, kern_stride_row, \
        kern_stride_stack, kern_stride_nkern); \
}
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(0, false, false, false, false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(1, false, false, false, true)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(2, false, false, true, false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(4, false, true, false, false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(5, false, true, false, true)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(6, false, true, true, false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(8, true, false, false, false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(9, true, false, false, true)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(10, true, false, true, false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(12, true, true, false, false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(13, true, true, false, true)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(14, true, true, true, false)
#undef INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED
}
template <int i> __device__ float everything_dot(const float * x, const int sx, const float * y, const int sy)
{
return everything_dot<i/2>(x, sx, y, sy) + everything_dot<(i+1)/2>(x+sy*(i/2), sx, y+sy*(i/2), sy) ;
......@@ -364,7 +425,6 @@ template <> __device__ float everything_dot<1>(const float * x, const int sx, co
{
return x[0] * y[0];
}
template<int NSTACK>
__global__ void
conv_full_load_everything( const float* img, const float* kern, float* out,
int img_len, int img_wid,
......@@ -423,9 +483,9 @@ conv_full_load_everything( const float* img, const float* kern, float* out,
{
int icol = out_col - kern_wid+1+col;
if (icol < 0 || icol > img_wid) continue;
if (NSTACK > 0)
if (THEANO_KERN_WID > 0)
{
sum += everything_dot<NSTACK>(d_img + irow*img_wid + icol, img_len*img_wid,
sum += everything_dot<THEANO_KERN_WID>(d_img + irow*img_wid + icol, img_len*img_wid,
d_kern + row*kern_wid+col, kern_len*kern_wid);
}
else
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论