simplified code.

a59d9a54 · Frederic Bastien · adb994dd · a59d9a54
--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -1019,6 +1019,7 @@ class GpuSum(Op):
            #TODO: This kernel is pretty inefficient in terms of reading, because if A is
            #      c_contiguous (typical case) then each warp is accessing non-contigous
            #      memory (a segment of a column).
+            reducebuf = self._k_reduce_buf('Z[blockIdx.x * sZ0]')
            print >> sio, """
            static __global__ void kernel_reduce_sum_10_%(nodename)s(
                    const int d0,
@@ -1041,31 +1042,7 @@ class GpuSum(Op):
                    float Ai = A[i0 * sA0 + blockIdx.x * sA1];
                    mysum += Ai;
                }
-                buf[threadNum] = mysum;
+                %(reducebuf)s
-                __syncthreads();
-                // rest of function is handled by one warp
-                if (threadNum < warpSize)
-                {
-                    for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
-                    {
-                        mysum += buf[i];
-                    }
-                    buf[threadNum] = mysum;
-                    if (threadNum < 16)
-                    {
-                        //reduce so that threadNum 0 has the sum of everything
-                        if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
-                        if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
-                        if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
-                        if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
-                        if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
-                        if (threadNum == 0)
-                        {
-                            Z[blockIdx.x * sZ0] = buf[0];
-                        }
-                    }
-                }
            }
            """ %locals()
        if self.reduce_mask == (1,1,0):
@@ -1155,6 +1132,7 @@ class GpuSum(Op):
        if self.reduce_mask == (0,0,1):
            # this kernel uses one block for each row, 
            # threads per block for each element per row.
+            reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]')
            print >> sio, """
            static __global__ void kernel_reduce_sum_001_%(nodename)s(
                    const int d0,
@@ -1181,31 +1159,7 @@ class GpuSum(Op):
                        {
                            mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
                        }
-                        buf[threadNum] = mysum;
+                        %(reducebuf)s
-                        __syncthreads();
-                        // rest of function is handled by one warp
-                        if (threadNum < warpSize)
-                        {
-                            for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
-                            {
-                                mysum += buf[i];
-                            }
-                            buf[threadNum] = mysum;
-                            if (threadNum < 16)
-                            {
-                                //reduce so that threadNum 0 has the sum of everything
-                                if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
-                                if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
-                                if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
-                                if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
-                                if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
-                                if (threadNum == 0)
-                                {
-                                    Z[i0 * sZ0 + i1 * sZ1] = buf[0];
-                                }
-                            }
-                        }
                    }
                }
            }
@@ -1234,6 +1188,7 @@ class GpuSum(Op):
            }
            """ %locals()
        if self.reduce_mask == (1,0,1,1):
+            reducebuf = self._k_reduce_buf('Z[blockIdx.x*sZ0]')
            print >> sio, """
            static __global__ void kernel_reduce_sum_1011_%(nodename)s(
                    const unsigned int d0,
@@ -1264,31 +1219,7 @@ class GpuSum(Op):
                        }
                    }
                }
-                buf[threadNum] = mysum;
+                %(reducebuf)s
-                __syncthreads();
-                // rest of function is handled by one warp
-                if (threadNum < warpSize)
-                {
-                    for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
-                    {
-                        mysum += buf[i];
-                    }
-                    buf[threadNum] = mysum;
-                    if (threadNum < 16)
-                    {
-                        //reduce so that threadNum 0 has the sum of everything
-                        if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
-                        if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
-                        if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
-                        if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
-                        if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
-                        if (threadNum == 0)
-                        {
-                            Z[blockIdx.x*sZ0] = buf[0];
-                        }
-                    }
-                }
            }
            """ %locals()
        return sio.getvalue()