Fix GpuSum patterns 01, 011 and 0111 when the outer dimension is bigger than 4096.

609593b4 · Frederic Bastien · 791b8bcf · 609593b4 · 609593b4
--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -1062,7 +1062,7 @@ class GpuSum(Op):
        """ %locals()
    def c_code_cache_version(self):
-        return (15,)
+        return (16,)
    def c_support_code_apply(self, node, nodename):
@@ -1174,7 +1174,7 @@ class GpuSum(Op):
                for_i2 = "for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)"
                for_i3 = "for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)"
-            reducebuf = self._k_reduce_buf('Z[blockIdx.x * sZ0]')
+            reducebuf = self._k_reduce_buf('Z[i0 * sZ0]')
            param_dim = ",".join(["const int d%(i)s"%locals() for i in range(nd_in)])
            param_strides = ",".join(["const int sA%(i)s"%locals() for i in range(nd_in)])
            decl = self._k_decl(node,nodename)
@@ -1182,15 +1182,18 @@ class GpuSum(Op):
            print >> sio, """
            %(decl)s{
                %(init)s
-                %(for_i1)s{
+                for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
-                  %(for_i2)s{
+                  mysum = 0;
-                    %(for_i3)s{
+                  %(for_i1)s{
-                      float Ai = A[i3 * sA3 + i2 * sA2 + i1 * sA1 + blockIdx.x * sA0];
+                    %(for_i2)s{
-                      mysum += Ai;
+                      %(for_i3)s{
+                        float Ai = A[i3 * sA3 + i2 * sA2 + i1 * sA1 + i0 * sA0];
+                        mysum += Ai;
+                      }
                    }
                  }
+                  %(reducebuf)s
                }
-                %(reducebuf)s
            }
            """ %locals()
        if self.reduce_mask == (1,0):

--- a/theano/sandbox/cuda/tests/test_basic_ops.py
+++ b/theano/sandbox/cuda/tests/test_basic_ops.py
@@ -38,7 +38,27 @@ def test_sum():
                           ((0,0),[0,1]),((1,0),[0,1]),((5,4),[0,1]),((33,31),[0,1]),((5,4),[1]),((5,4),[0]),#need something bigger then 32 for some opt test.
                           ((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[1,2]),((5,4,3),[0,1,2]),
                           ((0,0,0,0),[0,1,2,3]),
-                           ((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3])]:
+                           ((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
+                           #test shapes bigger than 4096 on each dimension to make sure that we work correctly when we don't have enough threads/blocks in each dimension
+                           ((4100,3),[0]),((3,4101),[0]),#10
+                           ((4100,3),[1]),((3,4101),[1]),#01
+                           ((4100,3),[0,1]),((3,4101),[0,1]),#11
+                           ((4100,4,3),[0]),((5,4100,3),[0]),((5,4,4100),[0]),#100
+                           #((4100,4,3),[1]),((5,4100,3),[1]),((5,4,4100),[1]),#010 ##not implemented
+                           ((4100,4,3),[2]),((5,4100,3),[2]),((5,4,4100),[2]),#001
+                           ((4100,4,3),[0,1]),((5,4100,3),[0,1]),((5,4,4100),[0,1]),#110
+                           ((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
+                           #((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
+                           ((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111
+                           ((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
+                           ((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
+                           ((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
+                           ((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),#1111
+                           ]:
        a = tensor.TensorType('float32',(False,)*len(shape))()
        b = T.Sum(pattern)(a)
        val = numpy.random.rand(numpy.prod(shape)).reshape(shape)