Enable the modified GPU conv kernel that uses less shared memory.

198b22ea · Frederic Bastien · e2122bf4 · 198b22ea · 198b22ea · 198b22ea
--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -363,7 +363,7 @@ class GpuConv(Op):
        return ['cuda_ndarray.cuh','<stdio.h>']

    def c_code_cache_version(self):
-        return (0,12) # raise this whenever modifying any of the support_code_files
+        return (0,13) # raise this whenever modifying any of the support_code_files

    def c_support_code_apply(self, node, nodename):
        # REMEMBER TO RAISE c_code_cache_version when changing any of these files

--- a/theano/sandbox/cuda/conv.cu
+++ b/theano/sandbox/cuda/conv.cu
@@ -116,7 +116,7 @@ CudaNdarray_conv_valid(const CudaNdarray *img, const CudaNdarray * kern,
      if(!subsample &&
 	out_contiguous &&
 	out_size<512 &&//Maximum of 512 theads by block
-	(img_size_byte+2*kern_wid*sizeof(float)+out_size_byte*2)<shared_avail && //their is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
+	 std::max(int(img_size_byte+2*kern_wid*sizeof(float)), out_size_byte*2)<shared_avail && //there is only 16k of shared memory and if we can't have the output at least twice in shared mem, we won't have any reduce!
 	!work_complete)
 	version = 7; //conv_patch_stack_reduce, switch to version 8/13 automatically if needed.
    }

--- a/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
+++ b/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
@@ -422,6 +422,8 @@ def test_valid_5():

 def test_valid_7_8_13():
    shapes = get_valid_shapes()
+    # This is to test the "new" lower shared memory usage.
+    shapes.append(((10,30,60,60),(20,30,40,40), (1,1), (1,1), (1,1)))
    version=[7,8,13]
    verbose=0

@@ -437,7 +439,7 @@ def test_valid_7_8_13():
        oshape=[ishape[0]]+[kshape[0]]+list(numpy.asarray(ishape[2:])-numpy.asarray(kshape[2:])+numpy.asarray([1,1]))
        if oshape[2]*oshape[3]>512:
            continue
-        if (numpy.prod(ishape[2:])*4+2*kshape[3]*4+oshape[2]*oshape[3]*4*2)>(16*1024-150):
+        if max(numpy.prod(ishape[2:])*4+2*kshape[3]*4, oshape[2]*oshape[3]*4*2)>(16*1024-150):
            continue
        if subshape==(1,1):
            shapes2.append((ishape, kshape, subshape, istride, kstride))