Commit 9b3168ae authored by James Bergstra

special cases for GpuSum

Parent 3248be43
@@ -337,6 +337,26 @@ class GpuDimShuffle(Op):
            return (1,0)
class GpuSum(Op):
    """GpuSum is a reduction along some dimensions by summation.

    The dimensions along which to sum are specified by the `reduce_mask` that you pass to
    the constructor.  The `reduce_mask` is a tuple of booleans (actually integers 0 or 1)
    that specifies, for each input dimension, whether to reduce it (1) or not (0).

    For example:
      - reduce_mask == (1,) sums a vector to a scalar
      - reduce_mask == (1,0) computes the sum of each column in a matrix
      - reduce_mask == (0,1) computes the sum of each row in a matrix
      - reduce_mask == (1,1,1) computes the sum of all elements in a 3-tensor

    :note: a reduce_mask of all zeros is a sort of 'copy', and may be removed during
        graph optimization.
    """
    def __init__(self, reduce_mask):
        self.reduce_mask = tuple(reduce_mask)
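
As a plain-NumPy restatement of these semantics (a cross-check sketch, not part of the commit; `sum_with_mask` is a hypothetical name), summing over every axis whose mask entry is 1 reproduces each case above:

    import numpy as np

    def sum_with_mask(a, reduce_mask):
        # Sum over every axis whose mask entry is 1; keep the others.
        axes = tuple(i for i, m in enumerate(reduce_mask) if m)
        return a.sum(axis=axes) if axes else a.copy()

    x = np.arange(6, dtype='float32').reshape(2, 3)
    assert sum_with_mask(x, (1, 0)).shape == (3,)  # column sums
    assert sum_with_mask(x, (0, 1)).shape == (2,)  # row sums
    assert sum_with_mask(x, (1, 1)).shape == ()    # sum of all elements
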
@@ -420,9 +440,11 @@ class GpuSum(Op):
        # Now perform the reduction
        #
        if self.reduce_mask == (1,):
            self.c_code_reduce_1(sio, node, name, x, z, fail)
        elif self.reduce_mask == (1,1):
            self.c_code_reduce_11(sio, node, name, x, z, fail)
        elif self.reduce_mask == (1,0):
            self.c_code_reduce_10(sio, node, name, x, z, fail)
        elif self.reduce_mask == (1,0,1,1):
            self.c_code_reduce_1011(sio, node, name, x, z, fail)
        else:
@@ -430,42 +452,130 @@ class GpuSum(Op):
            assert 0
        return sio.getvalue()
    def c_code_reduce_1(self, sio, node, name, x, z, fail):
        print >> sio, """
        {
            int verbose = 0;
            dim3 n_threads(
                    std::min(CudaNdarray_HOST_DIMS(cnda_%(x)s)[0],
                        NUM_VECTOR_OP_THREADS_PER_BLOCK));
            dim3 n_blocks(1);
            if (verbose) printf("running kernel_reduce_sum_1_%(name)s\\n");
            int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
            kernel_reduce_sum_1_%(name)s<<<n_blocks, n_threads, n_shared>>>(
                    CudaNdarray_HOST_DIMS(cnda_%(x)s)[0],
                    CudaNdarray_DEV_DATA(cnda_%(x)s),
                    CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0],
                    CudaNdarray_DEV_DATA(cnda_%(z)s));
            CNDA_THREAD_SYNC;
            if (cudaSuccess != cudaGetLastError())
            {
                %(fail)s;
            }
        }
        """ %locals()
    def c_code_reduce_11(self, sio, node, name, x, z, fail):
        print >> sio, """
        {
            int verbose = 0;
            dim3 n_threads(
                    std::min(CudaNdarray_HOST_DIMS(cnda_%(x)s)[1],
                        NUM_VECTOR_OP_THREADS_PER_BLOCK));
            while (n_threads.y * n_threads.x < NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.y;
            n_threads.y -= 1;
            if (n_threads.y > CudaNdarray_HOST_DIMS(cnda_%(x)s)[0])
                n_threads.y = CudaNdarray_HOST_DIMS(cnda_%(x)s)[0];
            dim3 n_blocks(1);
            if (verbose) printf("running kernel_reduce_sum_11_%(name)s\\n");
            int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
            kernel_reduce_sum_11_%(name)s<<<n_blocks, n_threads, n_shared>>>(
                    CudaNdarray_HOST_DIMS(cnda_%(x)s)[0],
                    CudaNdarray_HOST_DIMS(cnda_%(x)s)[1],
                    CudaNdarray_DEV_DATA(cnda_%(x)s),
                    CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0],
                    CudaNdarray_HOST_STRIDES(cnda_%(x)s)[1],
                    CudaNdarray_DEV_DATA(cnda_%(z)s));
            CNDA_THREAD_SYNC;
            if (cudaSuccess != cudaGetLastError())
            {
                %(fail)s;
            }
        }
        """ %locals()
    def c_code_reduce_10(self, sio, node, name, x, z, fail):
        print >> sio, """
        {
            int verbose = 0;
            dim3 n_threads(
                    std::min(CudaNdarray_HOST_DIMS(cnda_%(x)s)[0],
                        NUM_VECTOR_OP_THREADS_PER_BLOCK));
            dim3 n_blocks(CudaNdarray_HOST_DIMS(cnda_%(x)s)[1]);
            if (verbose) printf("running kernel_reduce_sum_10_%(name)s\\n");
            int n_shared = sizeof(float) * n_threads.x;
            kernel_reduce_sum_10_%(name)s<<<n_blocks, n_threads, n_shared>>>(
                    CudaNdarray_HOST_DIMS(cnda_%(x)s)[0],
                    CudaNdarray_HOST_DIMS(cnda_%(x)s)[1],
                    CudaNdarray_DEV_DATA(cnda_%(x)s),
                    CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0],
                    CudaNdarray_HOST_STRIDES(cnda_%(x)s)[1],
                    CudaNdarray_DEV_DATA(cnda_%(z)s),
                    CudaNdarray_HOST_STRIDES(cnda_%(z)s)[0]);
            CNDA_THREAD_SYNC;
            if (cudaSuccess != cudaGetLastError())
            {
                %(fail)s;
            }
        }
        """ %locals()
    def c_code_reduce_1011(self, sio, node, name, x, z, fail):
        print >> sio, """
        {
            int verbose = 0;
            dim3 n_threads(
                    std::min(CudaNdarray_HOST_DIMS(cnda_%(x)s)[3],
                        NUM_VECTOR_OP_THREADS_PER_BLOCK));
            while (n_threads.y * n_threads.x < NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.y;
            n_threads.y -= 1;
            if (n_threads.y > CudaNdarray_HOST_DIMS(cnda_%(x)s)[2])
                n_threads.y = CudaNdarray_HOST_DIMS(cnda_%(x)s)[2];
            while (n_threads.x * n_threads.y * n_threads.z < NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.z;
            n_threads.z -= 1;
            if (n_threads.z > 64)
                n_threads.z = 64;
            if (n_threads.z > CudaNdarray_HOST_DIMS(cnda_%(x)s)[0])
                n_threads.z = CudaNdarray_HOST_DIMS(cnda_%(x)s)[0];
            dim3 n_blocks(CudaNdarray_HOST_DIMS(cnda_%(x)s)[1]);
            if (verbose) printf("running kernel_reduce_sum_1011_%(name)s\\n");
            if (verbose) fprint_CudaNdarray(stdout, cnda_%(x)s);
            if (verbose) fprint_CudaNdarray(stdout, cnda_%(z)s);
            int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
            kernel_reduce_sum_1011_%(name)s<<<n_blocks, n_threads, n_shared>>>(
                    CudaNdarray_HOST_DIMS(cnda_%(x)s)[0],
                    CudaNdarray_HOST_DIMS(cnda_%(x)s)[1],
                    CudaNdarray_HOST_DIMS(cnda_%(x)s)[2],
                    CudaNdarray_HOST_DIMS(cnda_%(x)s)[3],
                    CudaNdarray_DEV_DATA(cnda_%(x)s),
                    CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0],
                    CudaNdarray_HOST_STRIDES(cnda_%(x)s)[1],
                    CudaNdarray_HOST_STRIDES(cnda_%(x)s)[2],
                    CudaNdarray_HOST_STRIDES(cnda_%(x)s)[3],
                    CudaNdarray_DEV_DATA(cnda_%(z)s),
                    CudaNdarray_HOST_STRIDES(cnda_%(z)s)[0]);
            CNDA_THREAD_SYNC;
            if (cudaSuccess != cudaGetLastError())
            {
                %(fail)s;
            }
        }
        """ %locals()
@@ -475,57 +585,226 @@ class GpuSum(Op):
    def c_support_code_apply(self, node, nodename):
        sio = StringIO.StringIO()
        if self.reduce_mask == (1,):
            # this kernel is ok for up to a few thousand elements, but
            # it only runs on ONE multiprocessor
            print >> sio, """
            static __global__ void kernel_reduce_sum_1_%(nodename)s(
                    const unsigned int d0,
                    const float *A, const int sA0,
                    float * Z)
            {
                const int threadCount = blockDim.x;
                const int threadNum = threadIdx.x;
                extern __shared__ float buf[];
                float mysum = 0.0f;

                if (warpSize != 32)
                {
                    return;  //TODO: set error code
                }

                for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
                {
                    float Ai = A[i0 * sA0];
                    mysum += Ai;
                }
                buf[threadNum] = mysum;
                __syncthreads();

                // rest of function is handled by one warp
                if (threadNum < warpSize)
                {
                    for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
                    {
                        mysum += buf[i];
                    }
                    buf[threadNum] = mysum;
                    if (threadNum < 16)
                    {
                        //reduce so that threadNum 0 has the sum of everything
                        if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
                        if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
                        if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
                        if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
                        if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
                        if (threadNum == 0)
                        {
                            Z[0] = buf[0];
                        }
                    }
                }
            }
            """ %locals()
        if self.reduce_mask == (1,1):
            # this kernel is ok for up to a few thousand elements, but
            # it only runs on ONE multiprocessor
            print >> sio, """
            static __global__ void kernel_reduce_sum_11_%(nodename)s(
                    const int d0,
                    const int d1,
                    const float *A, const int sA0, const int sA1,
                    float * Z)
            {
                const int threadCount = blockDim.x * blockDim.y;
                const int threadNum = threadIdx.y*blockDim.x + threadIdx.x;
                extern __shared__ float buf[];
                float mysum = 0.0f;

                if (warpSize != 32)
                {
                    return;  //TODO: set error code
                }

                for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y)
                {
                    for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
                    {
                        float Ai = A[i0 * sA0 + i1 * sA1];
                        mysum += Ai;
                    }
                }
                buf[threadNum] = mysum;
                __syncthreads();

                // rest of function is handled by one warp
                if (threadNum < warpSize)
                {
                    for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
                    {
                        mysum += buf[i];
                    }
                    buf[threadNum] = mysum;
                    if (threadNum < 16)
                    {
                        //reduce so that threadNum 0 has the sum of everything
                        if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
                        if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
                        if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
                        if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
                        if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
                        if (threadNum == 0)
                        {
                            Z[0] = buf[0];
                        }
                    }
                }
            }
            """ %locals()
        if self.reduce_mask == (1,0):
            # this kernel uses one block for each column, and the threads
            # in each block sum that column's elements
            #TODO: This kernel is pretty inefficient in terms of reading, because if A is
            #      c_contiguous (typical case) then each warp is accessing non-contiguous
            #      memory (a segment of a column).
            print >> sio, """
            static __global__ void kernel_reduce_sum_10_%(nodename)s(
                    const int d0,
                    const int d1,
                    const float *A, const int sA0, const int sA1,
                    float * Z, const int sZ0)
            {
                const int threadCount = blockDim.x;
                const int threadNum = threadIdx.x;
                extern __shared__ float buf[];
                float mysum = 0.0f;

                if (warpSize != 32)
                {
                    return;  //TODO: set error code
                }

                for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
                {
                    float Ai = A[i0 * sA0 + blockIdx.x * sA1];
                    mysum += Ai;
                }
                buf[threadNum] = mysum;
                __syncthreads();

                // rest of function is handled by one warp
                if (threadNum < warpSize)
                {
                    for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
                    {
                        mysum += buf[i];
                    }
                    buf[threadNum] = mysum;
                    if (threadNum < 16)
                    {
                        //reduce so that threadNum 0 has the sum of everything
                        if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
                        if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
                        if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
                        if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
                        if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
                        if (threadNum == 0)
                        {
                            Z[blockIdx.x * sZ0] = buf[0];
                        }
                    }
                }
            }
            """ %locals()
        if self.reduce_mask == (1,0,1,1):
            print >> sio, """
            static __global__ void kernel_reduce_sum_1011_%(nodename)s(
                    const unsigned int d0,
                    const unsigned int d1,
                    const unsigned int d2,
                    const unsigned int d3,
                    const float *A, const int sA0, const int sA1, const int sA2, const int sA3,
                    float * Z, const int sZ0)
            {
                const int threadCount = blockDim.x * blockDim.y * blockDim.z;
                const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
                extern __shared__ float buf[];
                float mysum = 0.0f;

                if (warpSize != 32)
                {
                    return;  //TODO: set error code
                }

                for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z)
                {
                    for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
                    {
                        for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
                        {
                            float Ai = A[i0 * sA0 + blockIdx.x * sA1 + i2 * sA2 + i3 * sA3];
                            mysum += Ai;
                        }
                    }
                }
                buf[threadNum] = mysum;
                __syncthreads();

                // rest of function is handled by one warp
                if (threadNum < warpSize)
                {
                    for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
                    {
                        mysum += buf[i];
                    }
                    buf[threadNum] = mysum;
                    if (threadNum < 16)
                    {
                        //reduce so that threadNum 0 has the sum of everything
                        if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
                        if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
                        if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
                        if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
                        if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
                        if (threadNum == 0)
                        {
                            Z[blockIdx.x*sZ0] = buf[0];
                        }
                    }
                }
            }
            """ %locals()
        return sio.getvalue()
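
For testing, a NumPy oracle restricted to the four masks special-cased above makes a convenient cross-check (a sketch, independent of this commit; `gpu_sum_reference` is a hypothetical name):

    import numpy as np

    def gpu_sum_reference(A, reduce_mask):
        # NumPy oracle for the masks special-cased above.
        assert reduce_mask in [(1,), (1, 1), (1, 0), (1, 0, 1, 1)]
        axes = tuple(i for i, m in enumerate(reduce_mask) if m)
        return A.sum(axis=axes)

    for shape, mask in [((100,), (1,)), ((8, 9), (1, 1)),
                        ((8, 9), (1, 0)), ((2, 3, 4, 5), (1, 0, 1, 1))]:
        A = np.random.rand(*shape).astype('float32')
        assert gpu_sum_reference(A, mask).shape == tuple(
            d for d, m in zip(shape, mask) if not m)
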
class GpuReshape(tensor.Reshape):