提交 9e4f9e45 authored 作者: James Bergstra's avatar James Bergstra

gpu conv - template-unrolling depth logarithmic in kernel size rather than linear.

上级 8b9676e3
......@@ -348,12 +348,17 @@ conv_full_patch_stack_padded( float* img, float* kern, float* out,
// Compile-time unrolled strided dot product of `i` element pairs:
//     sum over k in [0, i) of x[k*sx] * y[k*sy]
//
// x, y   : pointers to the first element of each operand.
// sx, sy : element strides applied to x and y respectively.
//
// The range is split into a first part of i/2 terms and a second part of
// (i+1)/2 terms (i/2 + (i+1)/2 == i for non-negative i), so the template
// recursion depth grows logarithmically with i instead of linearly.
// Recursion terminates at the explicit specializations for i == 0 and i == 1.
//
// NOTE: the balanced split changes the floating-point summation order
// relative to the former linear form
//     x[0] * y[0] + everything_dot<i-1>(x+sx, sx, y+sy, sy)
// so results may differ in the last bits.
template <int i> __device__ float everything_dot(const float * x, const int sx, const float * y, const int sy)
{
    // The second part starts i/2 elements further along; each operand must be
    // advanced by its OWN stride (x by sx, y by sy).
    return everything_dot<i/2>(x, sx, y, sy)
         + everything_dot<(i+1)/2>(x + sx*(i/2), sx, y + sy*(i/2), sy);
}
// Specialization for zero terms: an empty dot product is 0.
// None of the arguments are read.
template <> __device__ float everything_dot<0>(const float * x, const int sx, const float * y, const int sy)
{
    return 0.0f;
}
// Specialization for a single term: the product of the first element of each
// operand. The strides sx and sy are not used because no further element is
// visited.
template <> __device__ float everything_dot<1>(const float * x, const int sx, const float * y, const int sy)
{
    const float product = (*x) * (*y);
    return product;
}
template<int NSTACK>
__global__ void
conv_full_load_everything( float* img, float* kern, float* out,
......
......@@ -156,9 +156,14 @@ __device__ void load_padded_col_to_shared(float * dst, const float * src,
// Compile-time unrolled dot product of `i` contiguous elements:
//     sum over k in [0, i) of data[k] * kern[k]
// The kernel is applied in the order given (no index reversal).
//
// The range is split into a first part of i/2 terms and a second part of
// (i+1)/2 terms (i/2 + (i+1)/2 == i for non-negative i), so the template
// recursion depth grows logarithmically with i instead of linearly.
// Recursion terminates at the explicit specializations for i == 0 and i == 1.
//
// NOTE: the balanced split changes the floating-point summation order
// relative to the former linear form
//     data[i-1] * kern[i-1] + convolutionRowNoFlip<i - 1>(data, kern)
// so results may differ in the last bits.
template<int i> __device__ float convolutionRowNoFlip(const float *data,
                                                      const float *kern){
    // Second part starts i/2 elements into both arrays.
    return convolutionRowNoFlip<i/2>(data, kern)
         + convolutionRowNoFlip<(i+1)/2>(data + i/2, kern + i/2);
}
// Specialization for a single term: the product of the first data element
// and the first kernel element.
template<> __device__ float convolutionRowNoFlip<1>(const float *data,
                                                    const float *kern){
    const float product = (*data) * (*kern);
    return product;
}
template<> __device__ float convolutionRowNoFlip<0>(const float *data,
const float *kern){
return 0;
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论