Small optimization to GpuSum: skip the per-step bounds checks in the final warp reduction when threadCount > 32

4e1e0f8a · Frederic Bastien · 5c4a7c72 · 4e1e0f8a · 4e1e0f8a
--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -636,7 +636,20 @@ class GpuSum(Op):
            }
            buf[threadNum] = mysum;
            // no sync because only one warp is running
-            if (threadNum < 16)
+            if(threadCount >32)
+            {
+                buf[threadNum] += buf[threadNum+16];
+                buf[threadNum] += buf[threadNum+8];
+                buf[threadNum] += buf[threadNum+4];
+                buf[threadNum] += buf[threadNum+2];
+                buf[threadNum] += buf[threadNum+1];
+                if (threadNum == 0)
+                {
+                    %(z_pos)s = buf[0];
+                }
+            }
+            else if (threadNum < 16)
            {
                //reduce so that threadNum 0 has the sum of everything
                if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
@@ -979,7 +992,7 @@ class GpuSum(Op):
    def c_code_cache_version(self):
        #return ()
-        return (9,)
+        return (10,)
    def c_support_code_apply(self, node, nodename):

--- a/theano/sandbox/cuda/tests/test_basic_ops.py
+++ b/theano/sandbox/cuda/tests/test_basic_ops.py
@@ -36,7 +36,7 @@ def test_sum():
    """
    for shape, pattern in [((5,),[0]),
-                           ((5,4),[0,1]),((5,4),[0]),
+                           ((5,4),[0,1]),((33,31),[0,1]),((5,4),[0]),# need something bigger than 32 for some opt test.
                           ((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
                           ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
        a = tensor.TensorType('float32',(False,)*len(shape))()