bump version number following bugfix and remove code duplication.

bab14bae · Frederic Bastien · a59d9a54 · bab14bae
--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -924,7 +924,7 @@ class GpuSum(Op):
    def c_code_cache_version(self):
        #return ()
-        return (7,)
+        return (8,)
    def c_support_code_apply(self, node, nodename):
@@ -932,6 +932,7 @@ class GpuSum(Op):
        if self.reduce_mask == (1,):
            #this kernel is ok for up to a few thousand elements, but 
            # it only runs on ONE multiprocessor
+            reducebuf = self._k_reduce_buf('Z[0]')
            print >> sio, """
            static __global__ void kernel_reduce_sum_1_%(nodename)s(
                    const unsigned int d0,
@@ -953,31 +954,7 @@ class GpuSum(Op):
                    float Ai = A[i0 * sA0];
                    mysum += Ai;
                }
-                buf[threadNum] = mysum;
+                %(reducebuf)s
-                __syncthreads();
-                // rest of function is handled by one warp
-                if (threadNum < warpSize)
-                {
-                    for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
-                    {
-                        mysum += buf[i];
-                    }
-                    buf[threadNum] = mysum;
-                    if (threadNum < 16)
-                    {
-                        //reduce so that threadNum 0 has the sum of everything
-                        if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
-                        if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
-                        if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
-                        if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
-                        if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
-                        if (threadNum == 0)
-                        {
-                            Z[0] = buf[0];
-                        }
-                    }
-                }
            }
            """ %locals()
        if self.reduce_mask == (1,1):