提交 04bee10f authored 作者: Frederic Bastien's avatar Frederic Bastien

Fix some cases where GpuConv fails on GTX470. I still don't understand it completely.

上级 823cffaa
...@@ -191,22 +191,22 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern, ...@@ -191,22 +191,22 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
int, int); int, int);
#define CONV_PATCH_STACK_SPECIAL(kern_wid) \ #define CONV_PATCH_STACK_SPECIAL(kern_wid) \
if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,true,true,false,true>;\ if(preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,true,true,false,true>;} \
if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,true,false,false,true>;\ else if(preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,true,false,false,true>;} \
if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,false,true,false,true>;\ else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,false,true,false,true>;}\
if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,false,false,false,true>;\ else if(preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,false,false,false,true>;}\
if(preload_full_kernel && img_contiguous_2d && kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,true,true,true,true>;\ else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,true,true,true,true>;}\
if(preload_full_kernel && img_contiguous_2d && !kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,true,false,true,true>;\ else if(preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,true,false,true,true>;}\
if(preload_full_kernel && !img_contiguous_2d && kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,false,true,true,true>;\ else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,false,true,true,true>;}\
if(preload_full_kernel && !img_contiguous_2d && !kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,false,false,true,true>;\ else if(preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,false,false,true,true>;}\
if(nb_split==1 && img_contiguous_2d && kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,true,true,false,false>;\ else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,true,true,false,false>;}\
if(nb_split==1 && img_contiguous_2d && !kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,true,false,false,false>;\ else if(!preload_full_kernel && nb_split==1 && img_contiguous_2d && !kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,true,false,false,false>;}\
if(nb_split==1 && !img_contiguous_2d && kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,false,true,false,false>;\ else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,false,true,false,false>;}\
if(nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,false,false,false,false>;\ else if(!preload_full_kernel && nb_split==1 && !img_contiguous_2d && !kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,false,false,false,false>;}\
if(img_contiguous_2d && kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,true,true,true,false>;\ else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,true,true,true,false>;} \
if(img_contiguous_2d && !kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,true,false,true,false>;\ else if(!preload_full_kernel && nb_split!=1 && img_contiguous_2d && !kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,true,false,true,false>;} \
if(!img_contiguous_2d && kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,false,true,true,false>;\ else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,false,true,true,false>;} \
if(!img_contiguous_2d && !kern_contiguous_2d) f=conv_patch_stack<true,false,kern_wid,false,false,true,false>; else if(!preload_full_kernel && nb_split!=1 && !img_contiguous_2d && !kern_contiguous_2d){ f=conv_patch_stack<true,false,kern_wid,false,false,true,false>;}
CONV_PATCH_STACK_SPECIAL(THEANO_KERN_WID); CONV_PATCH_STACK_SPECIAL(THEANO_KERN_WID);
f<<< grid, threads, shared_size>>> f<<< grid, threads, shared_size>>>
......
...@@ -332,6 +332,7 @@ conv_patch_stack( float* img, float* kern, float* out, ...@@ -332,6 +332,7 @@ conv_patch_stack( float* img, float* kern, float* out,
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
if(!preload_full_kern){ if(!preload_full_kern){
__syncthreads();
int idx2; int idx2;
if(flipped_kern) idx2=(kern_len-row-1)*kern_stride_row; if(flipped_kern) idx2=(kern_len-row-1)*kern_stride_row;
else idx2=(row)*kern_stride_row; else idx2=(row)*kern_stride_row;
...@@ -381,6 +382,7 @@ conv_patch_stack( float* img, float* kern, float* out, ...@@ -381,6 +382,7 @@ conv_patch_stack( float* img, float* kern, float* out,
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
if(!preload_full_kern){ if(!preload_full_kern){
__syncthreads();
int idx2=kern_stride_stack*stack; int idx2=kern_stride_stack*stack;
if(flipped_kern) if(flipped_kern)
idx2+=(kern_len-row-1)*kern_stride_row; idx2+=(kern_len-row-1)*kern_stride_row;
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论