Fix GPU convolution crash on Fermi GPUs.

ca383b23 · Frederic Bastien · b55c5864 · ca383b23 · ca383b23
--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -336,7 +336,7 @@ class GpuConv(Op):
        return ['cuda_ndarray.cuh','<stdio.h>']
    def c_code_cache_version(self):
-        return (0,5)
+        return (0,6)
    def c_support_code_apply(self, node, nodename):
        return open(os.path.join(os.path.split(__file__)[0],'conv_kernel.cu')).read()+\

--- a/theano/sandbox/cuda/conv_full_kernel.cu
+++ b/theano/sandbox/cuda/conv_full_kernel.cu
@@ -282,13 +282,17 @@ conv_full_patch_stack_padded( float* img, float* kern, float* out,
 			 thread_id, nb_thread_id, kern_wid,kern_len,
 			 kern_stride_col, kern_stride_row, flipped_kern, c_contiguous);
 	  __syncthreads();
+	  //The if is needed because on Fermi, reading an out-of-bounds index from shared memory generates an error.
-	  for (int row=0; row < kern_len; row++) {//loop over row
+	  //It was not needed on earlier generations, as they worked anyway. Removing the if still gives correct results
-	    const float* idx_kern=&d_kern[row*kern_wid];
+	  //as we store the result of only the in-bounds threads.
-	    const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
+	  //This was observed with nvcc 3.0 on a GTX470 card.
+	  if(out_row<out_len)
-	    convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
+	    for (int row=0; row < kern_len; row++) {//loop over row
-	  }
+	      const float* idx_kern=&d_kern[row*kern_wid];
+	      const float* idx_in=&d_img[(row+out_row)*img_wid_valid+out_col];
+	      convolutionRowNoFlip<KERN_WIDTH>(sum, idx_kern, idx_in, kern_wid);
+	    }
 	  if(out_row<out_len)
 	    out[batch_id*out_wid*out_len*nkern+//the good batch
 		out_wid*out_len*kern_id+//the output image