提交 ca383b23 authored 作者: Frederic Bastien's avatar Frederic Bastien

Fix GPU convolution crash on Fermi GPUs.

上级 b55c5864
...@@ -336,7 +336,7 @@ class GpuConv(Op): ...@@ -336,7 +336,7 @@ class GpuConv(Op):
return ['cuda_ndarray.cuh','<stdio.h>'] return ['cuda_ndarray.cuh','<stdio.h>']
def c_code_cache_version(self): def c_code_cache_version(self):
return (0,5) return (0,6)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
return open(os.path.join(os.path.split(__file__)[0],'conv_kernel.cu')).read()+\ return open(os.path.join(os.path.split(__file__)[0],'conv_kernel.cu')).read()+\
......
...@@ -282,7 +282,11 @@ conv_full_patch_stack_padded( float* img, float* kern, float* out, ...@@ -282,7 +282,11 @@ conv_full_patch_stack_padded( float* img, float* kern, float* out,
thread_id, nb_thread_id, kern_wid,kern_len, thread_id, nb_thread_id, kern_wid,kern_len,
kern_stride_col, kern_stride_row, flipped_kern, c_contiguous); kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
__syncthreads(); __syncthreads();
//The if is needed because on Fermi, reading an out-of-bound index from shared memory generates an error.
//Not needed on earlier generations, as they worked anyway. Removing the if still generates correct code
//because only the valid threads store their results.
//This was observed with nvcc 3.0 on a GTX470 card.
if(out_row<out_len)
for (int row=0; row < kern_len; row++) {//loop over row for (int row=0; row < kern_len; row++) {//loop over row
const float* idx_kern=&d_kern[row*kern_wid]; const float* idx_kern=&d_kern[row*kern_wid];
const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col]; const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论