提交 ca383b23 authored 作者: Frederic Bastien's avatar Frederic Bastien

Fix GPU convolution crash on Fermi GPUs.

上级 b55c5864
...@@ -336,7 +336,7 @@ class GpuConv(Op): ...@@ -336,7 +336,7 @@ class GpuConv(Op):
return ['cuda_ndarray.cuh','<stdio.h>'] return ['cuda_ndarray.cuh','<stdio.h>']
def c_code_cache_version(self): def c_code_cache_version(self):
return (0,5) return (0,6)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
return open(os.path.join(os.path.split(__file__)[0],'conv_kernel.cu')).read()+\ return open(os.path.join(os.path.split(__file__)[0],'conv_kernel.cu')).read()+\
......
...@@ -282,13 +282,17 @@ conv_full_patch_stack_padded( float* img, float* kern, float* out, ...@@ -282,13 +282,17 @@ conv_full_patch_stack_padded( float* img, float* kern, float* out,
thread_id, nb_thread_id, kern_wid,kern_len, thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous); kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads(); __syncthreads();
//The if is needed on Fermi, as reading an out-of-bounds index from shared memory generates an error.
for (int row=0; row < kern_len; row++) {//loop over row //Not needed on earlier GPU generations, as they worked anyway. Removing the if would still generate correct results,
const float* idx_kern=&d_kern[row*kern_wid]; //as we store the results of only the valid threads.
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col]; //This was with nvcc 3.0 on a GTX470 card.
if(out_row<out_len)
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid); for (int row=0; row < kern_len; row++) {//loop over row
} const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
}
if(out_row<out_len) if(out_row<out_len)
out[batch_id*out_wid*out_len*nkern+//the good batch out[batch_id*out_wid*out_len*nkern+//the good batch
out_wid*out_len*kern_id+//the output image out_wid*out_len*kern_id+//the output image
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论