comment change.

f6ecd8bd · Frederic Bastien · 03e07dbc · f6ecd8bd
--- a/theano/sandbox/cuda/conv_kernel.cu
+++ b/theano/sandbox/cuda/conv_kernel.cu
@@ -400,7 +400,7 @@ conv_patch_stack( float* img, float* kern, float* out,
 	    //as we store the result of only the good thread.
 	    //This was with nvcc 3.0 on an GTX470 card.
 	    if(out_row<out_len)
-	      convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);//idx_in,idx_in fail, idx_kern, idx_kern, wrong answer!
+	      convolutionRowNoFlip<KERN_WIDTH>(sum,idx_in,idx_kern,kern_wid);
 	  }
 	  __syncthreads(); // ensure calculations have completed before any thread starts changing the shared memory
 	}
@@ -724,6 +724,11 @@ conv_rows_stack2( float* img, float* kern, float* out,

 	}
 	__syncthreads();
+
+	//The if is needed on Fermi, as reading an out-of-bounds index from shared memory generates an error.
+	//It was not needed on earlier generations, as they worked anyway. Removing the if still generates correct code,
+	//as we store the result of only the valid threads.
+	//This was observed with nvcc 3.0 on a GTX470 card.
 	if(out_row<out_len){
 	  const float* idx_kern;
 	  if(preload_full_kern) idx_kern=&d_kern[row*kern_wid];