Commit c1463e24 authored by Frederic Bastien

Reenable the automatic detection of when not to load the full kernel for the…

Reenable the automatic detection of when not to load the full kernel for the GPU conv conv_patch_stack_reduce. This just re-enables a speed-up.
parent fcc96a54
@@ -363,7 +363,7 @@ class GpuConv(Op):
         return ['cuda_ndarray.cuh','<stdio.h>']
     def c_code_cache_version(self):
-        return (0,10) # raise this whenever modifying any of the support_code_files
+        return (0,11) # raise this whenever modifying any of the support_code_files
     def c_support_code_apply(self, node, nodename):
         # REMEMBER TO RAISE c_code_cache_version when changing any of these files
...
@@ -449,6 +449,11 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
     if(version==8||version==13) nb_split++;//force the split.
     if(version==13)full_kern=false;
+    //check if we can fit the full kernel in the shared memory
+    if(sizeof(float)*std::max(img_size + kern_size, out_size*2) > shared_avail){
+        full_kern = false;
+    }
     //thread_z is going to be ceil_intdiv(kern_len, nb_split)
     // we need enough splits so that
     // a) thread_z fits in the 'z' threadIdx (i.e. is less than 64)
...