提交 0d5cffbe authored 作者: Sean Lee's avatar Sean Lee

Force instantiate kernel templates

上级 89f584bc
...@@ -122,7 +122,7 @@ conv_full_patch( const float* img, const float* kern, float* out, ...@@ -122,7 +122,7 @@ conv_full_patch( const float* img, const float* kern, float* out,
//template c_contiguous: if true, the img and kern are column and row contiguous; otherwise we use the stride values from the parameters. The image needs to be c_contiguous in the nbatch and nstack dimensions. //template c_contiguous: if true, the img and kern are column and row contiguous; otherwise we use the stride values from the parameters. The image needs to be c_contiguous in the nbatch and nstack dimensions.
template<bool img_c_contiguous_2d, bool kern_c_contiguous_2d> template<bool img_c_contiguous_2d, bool kern_c_contiguous_2d>
__global__ void __device__ inline void
conv_full_patch_stack( const float* img, const float* kern, float* out, conv_full_patch_stack( const float* img, const float* kern, float* out,
int img_len, int img_wid, int img_len, int img_wid,
int kern_len, int kern_wid, int nkern, int nstack, int kern_len, int kern_wid, int nkern, int nstack,
...@@ -182,6 +182,31 @@ conv_full_patch_stack( const float* img, const float* kern, float* out, ...@@ -182,6 +182,31 @@ conv_full_patch_stack( const float* img, const float* kern, float* out,
out_row*out_wid+out_col] = sum; out_row*out_wid+out_col] = sum;
} }
extern "C" {
// Concrete __global__ entry points for the conv_full_patch_stack<> device
// template. Each wrapper simply forwards all of its arguments to one fixed
// instantiation of the template, which forces that instantiation to be
// compiled. Because the wrappers are declared inside extern "C", their
// symbol names are not C++-mangled (presumably so they can be looked up by
// name from the host side -- confirm against the caller).
//
// The numeric suffix selects the template arguments
// <img_c_contiguous_2d, kern_c_contiguous_2d>:
//   _0 -> <false, false>
//   _1 -> <false, true>
//   _2 -> <true,  false>
//   _3 -> <true,  true>
//
// The helper macro deliberately does NOT start with a double underscore:
// such identifiers are reserved for the compiler / standard library.
#define INSTANTIATE_CONV_FULL_PATCH_STACK(suffix, ...)                        \
    __global__ void                                                           \
    conv_full_patch_stack_##suffix(                                           \
        const float *img, const float *kern, float *out,                      \
        int img_len, int img_wid,                                             \
        int kern_len, int kern_wid, int nkern, int nstack,                    \
        int img_stride_col, int img_stride_row,                               \
        int kern_stride_col, int kern_stride_row,                             \
        int kern_stride_stack, int kern_stride_nkern)                         \
    {                                                                         \
        conv_full_patch_stack<__VA_ARGS__>(                                   \
            img, kern, out, img_len, img_wid, kern_len, kern_wid,             \
            nkern, nstack,                                                    \
            img_stride_col, img_stride_row,                                   \
            kern_stride_col, kern_stride_row,                                 \
            kern_stride_stack, kern_stride_nkern);                            \
    }

INSTANTIATE_CONV_FULL_PATCH_STACK(0, false, false)
INSTANTIATE_CONV_FULL_PATCH_STACK(1, false, true)
INSTANTIATE_CONV_FULL_PATCH_STACK(2, true, false)
INSTANTIATE_CONV_FULL_PATCH_STACK(3, true, true)

// The macro is only a local code generator; do not leak it to the rest of
// the translation unit.
#undef INSTANTIATE_CONV_FULL_PATCH_STACK
}
/** /**
* As conv_patch_stack, but used for the full convolution by padding the image in shared memory. * As conv_patch_stack, but used for the full convolution by padding the image in shared memory.
* I keep it separated from conv_patch as we take 19-20 registers, which is more than the 10/16 max for each thread, and thus this could lower the occupancy. * I keep it separated from conv_patch as we take 19-20 registers, which is more than the 10/16 max for each thread, and thus this could lower the occupancy.
...@@ -200,8 +225,8 @@ conv_full_patch_stack( const float* img, const float* kern, float* out, ...@@ -200,8 +225,8 @@ conv_full_patch_stack( const float* img, const float* kern, float* out,
* template low_mem: if true, same as split but uses less dynamic shared memory and more registers. * template low_mem: if true, same as split but uses less dynamic shared memory and more registers.
* if you set split and low_mem to true, we will use the low_mem version! * if you set split and low_mem to true, we will use the low_mem version!
*/ */
template<bool flipped_kern, int KERN_WIDTH, bool c_contiguous, bool split, bool low_mem > template<bool flipped_kern, bool c_contiguous, bool split, bool low_mem >
__global__ void __device__ inline void
conv_full_patch_stack_padded( const float* img, const float* kern, float* out, conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const int img_len, const int img_wid, const int img_len, const int img_wid,
const int kern_len, const int kern_wid, const int kern_len, const int kern_wid,
...@@ -257,7 +282,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out, ...@@ -257,7 +282,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col]; const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid); convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
} }
} }
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
...@@ -292,7 +317,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out, ...@@ -292,7 +317,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col]; const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid); convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
} }
if(out_row<out_len) if(out_row<out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
...@@ -340,7 +365,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out, ...@@ -340,7 +365,7 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row-out_row_iter*nb_rows)*img_wid_valid+out_col]; const float* idx_in=&d_img[(row+out_row-out_row_iter*nb_rows)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid); convolutionRowNoFlip(sum, idx_kern, idx_in, kern_wid);
} }
} }
if(out_row<out_len) if(out_row<out_len)
...@@ -351,6 +376,42 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out, ...@@ -351,6 +376,42 @@ conv_full_patch_stack_padded( const float* img, const float* kern, float* out,
} }
} }
extern "C" {
// Concrete __global__ entry points for the conv_full_patch_stack_padded<>
// device template. Each wrapper forwards all of its arguments unchanged to
// one fixed instantiation of the template, which forces that instantiation
// to be compiled. Because the wrappers are declared inside extern "C",
// their symbol names are not C++-mangled (presumably so they can be looked
// up by name from the host side -- confirm against the caller).
//
// The numeric suffix encodes the template arguments
// <flipped_kern, c_contiguous, split, low_mem> as a 4-bit number:
//   suffix = 8*flipped_kern + 4*c_contiguous + 2*split + 1*low_mem
//
// Suffixes 3, 7, 11 and 15 (split and low_mem both true) are intentionally
// not generated: the template's documentation states that when both are
// set the low_mem version is used, so those combinations would duplicate
// the low_mem-only variants.
//
// The helper macro deliberately does NOT start with a double underscore:
// such identifiers are reserved for the compiler / standard library.
#define INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(suffix, ...)                 \
    __global__ void                                                           \
    conv_full_patch_stack_padded_##suffix(                                    \
        const float *img, const float *kern, float *out,                      \
        const int img_len, const int img_wid,                                 \
        const int kern_len, const int kern_wid,                               \
        const int nkern, const int nstack,                                    \
        const int img_stride_col, const int img_stride_row,                   \
        const int img_stride_stack, const int img_stride_batch,               \
        const int kern_stride_col, const int kern_stride_row,                 \
        const int kern_stride_stack, const int kern_stride_nkern)             \
    {                                                                         \
        conv_full_patch_stack_padded<__VA_ARGS__>(                            \
            img, kern, out, img_len, img_wid, kern_len, kern_wid,             \
            nkern, nstack,                                                    \
            img_stride_col, img_stride_row,                                   \
            img_stride_stack, img_stride_batch,                               \
            kern_stride_col, kern_stride_row,                                 \
            kern_stride_stack, kern_stride_nkern);                            \
    }

// flipped_kern = false, c_contiguous = false
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(0, false, false, false, false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(1, false, false, false, true)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(2, false, false, true, false)
// flipped_kern = false, c_contiguous = true
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(4, false, true, false, false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(5, false, true, false, true)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(6, false, true, true, false)
// flipped_kern = true, c_contiguous = false
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(8, true, false, false, false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(9, true, false, false, true)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(10, true, false, true, false)
// flipped_kern = true, c_contiguous = true
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(12, true, true, false, false)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(13, true, true, false, true)
INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED(14, true, true, true, false)

// The macro is only a local code generator; do not leak it to the rest of
// the translation unit.
#undef INSTANTIATE_CONV_FULL_PATCH_STACK_PADDED
}
template <int i> __device__ float everything_dot(const float * x, const int sx, const float * y, const int sy) template <int i> __device__ float everything_dot(const float * x, const int sx, const float * y, const int sy)
{ {
return everything_dot<i/2>(x, sx, y, sy) + everything_dot<(i+1)/2>(x+sy*(i/2), sx, y+sy*(i/2), sy) ; return everything_dot<i/2>(x, sx, y, sy) + everything_dot<(i+1)/2>(x+sy*(i/2), sx, y+sy*(i/2), sy) ;
...@@ -364,7 +425,6 @@ template <> __device__ float everything_dot<1>(const float * x, const int sx, co ...@@ -364,7 +425,6 @@ template <> __device__ float everything_dot<1>(const float * x, const int sx, co
{ {
return x[0] * y[0]; return x[0] * y[0];
} }
template<int NSTACK>
__global__ void __global__ void
conv_full_load_everything( const float* img, const float* kern, float* out, conv_full_load_everything( const float* img, const float* kern, float* out,
int img_len, int img_wid, int img_len, int img_wid,
...@@ -423,9 +483,9 @@ conv_full_load_everything( const float* img, const float* kern, float* out, ...@@ -423,9 +483,9 @@ conv_full_load_everything( const float* img, const float* kern, float* out,
{ {
int icol = out_col - kern_wid+1+col; int icol = out_col - kern_wid+1+col;
if (icol < 0 || icol > img_wid) continue; if (icol < 0 || icol > img_wid) continue;
if (NSTACK > 0) if (THEANO_KERN_WID > 0)
{ {
sum += everything_dot<NSTACK>(d_img + irow*img_wid + icol, img_len*img_wid, sum += everything_dot<THEANO_KERN_WID>(d_img + irow*img_wid + icol, img_len*img_wid,
d_kern + row*kern_wid+col, kern_len*kern_wid); d_kern + row*kern_wid+col, kern_len*kern_wid);
} }
else else
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论