提交 9b3168ae authored 作者: James Bergstra's avatar James Bergstra

special cases for GpuSum

上级 3248be43
...@@ -337,6 +337,26 @@ class GpuDimShuffle(Op): ...@@ -337,6 +337,26 @@ class GpuDimShuffle(Op):
return (1,0) return (1,0)
class GpuSum(Op):
    """GpuSum is a Reduction along some dimensions by summation.

    The dimensions along which to sum are specified by the `reduce_mask` that
    you pass to the constructor.  The `reduce_mask` is a tuple of booleans
    (actually integers 0 or 1) that specify for each input dimension, whether
    to reduce it (1) or not (0).

    For example:
      - reduce_mask == (1,) sums a vector to a scalar
      - reduce_mask == (1,0) computes the sum of each column in a matrix
      - reduce_mask == (0,1) computes the sum of each row in a matrix
      - reduce_mask == (1,1,1) computes the sum of all elements in a 3-tensor.

    :note: any reduce_mask of all zeros is a sort of 'copy', and may be
           removed during graph optimization.
    """
    def __init__(self, reduce_mask):
        # Normalize to a tuple so the mask is immutable and instances with
        # equal masks compare/hash consistently.
        self.reduce_mask = tuple(reduce_mask)
...@@ -420,9 +440,11 @@ class GpuSum(Op): ...@@ -420,9 +440,11 @@ class GpuSum(Op):
# Now perform the reduction # Now perform the reduction
# #
if self.reduce_mask == (1,): if self.reduce_mask == (1,):
self.c_code_reduce_1(sio) self.c_code_reduce_1(sio, node, name, x, z, fail)
elif self.reduce_mask == (1,1):
self.c_code_reduce_11(sio, node, name, x, z, fail)
elif self.reduce_mask == (1,0): elif self.reduce_mask == (1,0):
self.c_code_reduce_10(sio) self.c_code_reduce_10(sio, node, name, x, z, fail)
elif self.reduce_mask == (1,0,1,1): elif self.reduce_mask == (1,0,1,1):
self.c_code_reduce_1011(sio, node, name, x, z, fail) self.c_code_reduce_1011(sio, node, name, x, z, fail)
else: else:
...@@ -430,44 +452,132 @@ class GpuSum(Op): ...@@ -430,44 +452,132 @@ class GpuSum(Op):
assert 0 assert 0
return sio.getvalue() return sio.getvalue()
def c_code_reduce_1(self, sio, node, name, x, z, fail):
    """Emit C code launching the reduce_mask == (1,) kernel (vector -> scalar).

    Writes to `sio` a block that launches kernel_reduce_sum_1_<name> with one
    block and at most NUM_VECTOR_OP_THREADS_PER_BLOCK threads.  `x` and `z`
    are the C variable-name suffixes of the input/output CudaNdarrays
    (cnda_<x>, cnda_<z>); `fail` is the caller's error-handling code, inserted
    after the post-launch error check.  `node` is unused here but kept for a
    uniform c_code_reduce_* signature.
    """
    # sio.write(... + "\n") reproduces the trailing newline that
    # `print >> sio, ...` adds elsewhere in this file, while remaining
    # parseable by both Python 2 and Python 3.
    sio.write("""
    {
        int verbose = 0;
        dim3 n_threads(
                std::min(CudaNdarray_HOST_DIMS(cnda_%(x)s)[0],
                         NUM_VECTOR_OP_THREADS_PER_BLOCK));
        dim3 n_blocks(1);
        if (verbose) printf("running kernel_reduce_sum_1_%(name)s\\n");
        int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
        kernel_reduce_sum_1_%(name)s<<<n_blocks, n_threads, n_shared>>>(
                CudaNdarray_HOST_DIMS(cnda_%(x)s)[0],
                CudaNdarray_DEV_DATA(cnda_%(x)s),
                CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0],
                CudaNdarray_DEV_DATA(cnda_%(z)s));
        CNDA_THREAD_SYNC;
        if (cudaSuccess != cudaGetLastError())
        {
            %(fail)s;
        }
    }
    """ % locals() + "\n")
def c_code_reduce_11(self, sio, node, name, x, z, fail):
    """Emit C code launching the reduce_mask == (1,1) kernel (matrix -> scalar).

    Writes to `sio` a block that launches kernel_reduce_sum_11_<name> with one
    block.  Threads along x cover dim 1; the y dimension is then grown until
    x*y reaches NUM_VECTOR_OP_THREADS_PER_BLOCK and clamped to dim 0.  `x`/`z`
    are C variable-name suffixes (cnda_<x>, cnda_<z>); `fail` is the caller's
    error-handling code; `node` is unused but kept for signature uniformity.
    """
    # write(... + "\n") mirrors the trailing newline of `print >> sio, ...`
    # used elsewhere in this file.
    sio.write("""
    {
        int verbose = 0;
        dim3 n_threads(
                std::min(CudaNdarray_HOST_DIMS(cnda_%(x)s)[1],
                         NUM_VECTOR_OP_THREADS_PER_BLOCK));
        while (n_threads.y * n_threads.x < NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.y;
        n_threads.y -= 1;
        if (n_threads.y > CudaNdarray_HOST_DIMS(cnda_%(x)s)[0])
            n_threads.y = CudaNdarray_HOST_DIMS(cnda_%(x)s)[0];
        dim3 n_blocks(1);
        if (verbose) printf("running kernel_reduce_sum_11_%(name)s\\n");
        int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
        kernel_reduce_sum_11_%(name)s<<<n_blocks, n_threads, n_shared>>>(
                CudaNdarray_HOST_DIMS(cnda_%(x)s)[0],
                CudaNdarray_HOST_DIMS(cnda_%(x)s)[1],
                CudaNdarray_DEV_DATA(cnda_%(x)s),
                CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0],
                CudaNdarray_HOST_STRIDES(cnda_%(x)s)[1],
                CudaNdarray_DEV_DATA(cnda_%(z)s));
        CNDA_THREAD_SYNC;
        if (cudaSuccess != cudaGetLastError())
        {
            %(fail)s;
        }
    }
    """ % locals() + "\n")
def c_code_reduce_10(self, sio, node, name, x, z, fail):
    """Emit C code launching the reduce_mask == (1,0) kernel (sum each column).

    Writes to `sio` a block that launches kernel_reduce_sum_10_<name> with one
    block per column (n_blocks = dim 1) and up to
    NUM_VECTOR_OP_THREADS_PER_BLOCK threads over the rows.  `x`/`z` are C
    variable-name suffixes (cnda_<x>, cnda_<z>); `fail` is the caller's
    error-handling code; `node` is unused but kept for signature uniformity.
    """
    # write(... + "\n") mirrors the trailing newline of `print >> sio, ...`
    # used elsewhere in this file.
    sio.write("""
    {
        int verbose = 0;
        dim3 n_threads(
                std::min(CudaNdarray_HOST_DIMS(cnda_%(x)s)[0],
                         NUM_VECTOR_OP_THREADS_PER_BLOCK));
        dim3 n_blocks(CudaNdarray_HOST_DIMS(cnda_%(x)s)[1]);
        if (verbose) printf("running kernel_reduce_sum_10_%(name)s\\n");
        int n_shared = sizeof(float) * n_threads.x;
        kernel_reduce_sum_10_%(name)s<<<n_blocks, n_threads, n_shared>>>(
                CudaNdarray_HOST_DIMS(cnda_%(x)s)[0],
                CudaNdarray_HOST_DIMS(cnda_%(x)s)[1],
                CudaNdarray_DEV_DATA(cnda_%(x)s),
                CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0],
                CudaNdarray_HOST_STRIDES(cnda_%(x)s)[1],
                CudaNdarray_DEV_DATA(cnda_%(z)s),
                CudaNdarray_HOST_STRIDES(cnda_%(z)s)[0]
                );
        CNDA_THREAD_SYNC;
        if (cudaSuccess != cudaGetLastError())
        {
            %(fail)s;
        }
    }
    """ % locals() + "\n")
def c_code_reduce_1011(self, sio, node, name, x, z, fail):
    """Emit C code launching the reduce_mask == (1,0,1,1) kernel.

    Sums a 4-tensor over dims 0, 2, 3, keeping dim 1: one block per kept
    index (n_blocks = dim 1).  Thread x covers dim 3; y is grown until x*y
    reaches NUM_VECTOR_OP_THREADS_PER_BLOCK and clamped to dim 2; z is grown
    the same way, clamped to 64 and to dim 0.  `x`/`z` are C variable-name
    suffixes (cnda_<x>, cnda_<z>); `fail` is the caller's error-handling
    code; `node` is unused but kept for signature uniformity.
    """
    # write(... + "\n") mirrors the trailing newline of `print >> sio, ...`
    # used elsewhere in this file.
    sio.write("""
    {
        int verbose = 0;
        dim3 n_threads(
                std::min(CudaNdarray_HOST_DIMS(cnda_%(x)s)[3],
                         NUM_VECTOR_OP_THREADS_PER_BLOCK));
        while (n_threads.y * n_threads.x < NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.y;
        n_threads.y -= 1;
        if (n_threads.y > CudaNdarray_HOST_DIMS(cnda_%(x)s)[2])
            n_threads.y = CudaNdarray_HOST_DIMS(cnda_%(x)s)[2];
        while (n_threads.x * n_threads.y * n_threads.z < NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.z;
        n_threads.z -= 1;
        if (n_threads.z > 64)
            n_threads.z = 64;
        if (n_threads.z > CudaNdarray_HOST_DIMS(cnda_%(x)s)[0])
            n_threads.z = CudaNdarray_HOST_DIMS(cnda_%(x)s)[0];
        dim3 n_blocks(CudaNdarray_HOST_DIMS(cnda_%(x)s)[1]);
        if (verbose) printf("running kernel_reduce_sum_1011_%(name)s\\n");
        if (verbose) fprint_CudaNdarray(stdout, cnda_%(x)s);
        if (verbose) fprint_CudaNdarray(stdout, cnda_%(z)s);
        int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
        kernel_reduce_sum_1011_%(name)s<<<n_blocks, n_threads, n_shared>>>(
                CudaNdarray_HOST_DIMS(cnda_%(x)s)[0],
                CudaNdarray_HOST_DIMS(cnda_%(x)s)[1],
                CudaNdarray_HOST_DIMS(cnda_%(x)s)[2],
                CudaNdarray_HOST_DIMS(cnda_%(x)s)[3],
                CudaNdarray_DEV_DATA(cnda_%(x)s),
                CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0],
                CudaNdarray_HOST_STRIDES(cnda_%(x)s)[1],
                CudaNdarray_HOST_STRIDES(cnda_%(x)s)[2],
                CudaNdarray_HOST_STRIDES(cnda_%(x)s)[3],
                CudaNdarray_DEV_DATA(cnda_%(z)s),
                CudaNdarray_HOST_STRIDES(cnda_%(z)s)[0]);
        CNDA_THREAD_SYNC;
        if (cudaSuccess != cudaGetLastError())
        {
            %(fail)s;
        }
    }
    """ % locals() + "\n")
def c_code_cache_version(self): def c_code_cache_version(self):
...@@ -475,6 +585,169 @@ class GpuSum(Op): ...@@ -475,6 +585,169 @@ class GpuSum(Op):
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
sio = StringIO.StringIO() sio = StringIO.StringIO()
# Per-apply CUDA kernel definitions; each `if` below emits the one kernel
# matching this Op's reduce_mask (independent `if`s, not elif: at most one
# can match a given instance).  The triple-quoted bodies are CUDA C emitted
# verbatim into the generated module.
# Kernel for reduce_mask == (1,): sum a vector into Z[0].
if self.reduce_mask == (1,):
#this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor
print >> sio, """
static __global__ void kernel_reduce_sum_1_%(nodename)s(
const unsigned int d0,
const float *A, const int sA0,
float * Z)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ float buf[];
float mysum = 0.0f;
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
{
float Ai = A[i0 * sA0];
mysum += Ai;
}
buf[threadNum] = mysum;
__syncthreads();
// rest of function is handled by one warp
if (threadNum < warpSize)
{
for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
{
mysum += buf[i];
}
buf[threadNum] = mysum;
if (threadNum < 16)
{
//reduce so that threadNum 0 has the sum of everything
if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
if (threadNum == 0)
{
Z[0] = buf[0];
}
}
}
}
""" %locals()
# Kernel for reduce_mask == (1,1): sum every element of a matrix into Z[0]
# (threadIdx.y strides dim 0, threadIdx.x strides dim 1).
if self.reduce_mask == (1,1):
#this kernel is ok for up to a few thousand elements, but
# it only runs on ONE multiprocessor
print >> sio, """
static __global__ void kernel_reduce_sum_11_%(nodename)s(
const int d0,
const int d1,
const float *A, const int sA0, const int sA1,
float * Z)
{
const int threadCount = blockDim.x * blockDim.y;
const int threadNum = threadIdx.y*blockDim.x + threadIdx.x;
extern __shared__ float buf[];
float mysum = 0.0f;
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y)
{
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
{
float Ai = A[i0 * sA0 + i1 * sA1];
mysum += Ai;
}
}
buf[threadNum] = mysum;
__syncthreads();
// rest of function is handled by one warp
if (threadNum < warpSize)
{
for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
{
mysum += buf[i];
}
buf[threadNum] = mysum;
if (threadNum < 16)
{
//reduce so that threadNum 0 has the sum of everything
if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
if (threadNum == 0)
{
Z[0] = buf[0];
}
}
}
}
""" %locals()
# Kernel for reduce_mask == (1,0): one block per column (blockIdx.x indexes
# dim 1); threads stride down the rows and the per-column sum lands in
# Z[blockIdx.x * sZ0].
if self.reduce_mask == (1,0):
# this kernel uses one block for each column,
# threads per block for each element per column.
#TODO: This kernel is pretty inefficient in terms of reading, because if A is
# c_contiguous (typical case) then each warp is accessing non-contigous
# memory (a segment of a column).
print >> sio, """
static __global__ void kernel_reduce_sum_10_%(nodename)s(
const int d0,
const int d1,
const float *A, const int sA0, const int sA1,
float * Z, const int sZ0)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
extern __shared__ float buf[];
float mysum = 0.0f;
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
{
float Ai = A[i0 * sA0 + blockIdx.x * sA1];
mysum += Ai;
}
buf[threadNum] = mysum;
__syncthreads();
// rest of function is handled by one warp
if (threadNum < warpSize)
{
for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
{
mysum += buf[i];
}
buf[threadNum] = mysum;
if (threadNum < 16)
{
//reduce so that threadNum 0 has the sum of everything
if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
if (threadNum == 0)
{
Z[blockIdx.x * sZ0] = buf[0];
}
}
}
}
""" %locals()
if self.reduce_mask == (1,0,1,1):
print >> sio, """ print >> sio, """
static __global__ void kernel_reduce_sum_1011_%(nodename)s( static __global__ void kernel_reduce_sum_1011_%(nodename)s(
const unsigned int d0, const unsigned int d0,
...@@ -496,9 +769,15 @@ class GpuSum(Op): ...@@ -496,9 +769,15 @@ class GpuSum(Op):
for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z) for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z)
{ {
float Ai = A[i0 * sA0 + blockIdx.x * sA1 + threadIdx.y * sA2 + threadIdx.x * sA3]; for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
{
for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
{
float Ai = A[i0 * sA0 + blockIdx.x * sA1 + i2 * sA2 + i3 * sA3];
mysum += Ai; mysum += Ai;
} }
}
}
buf[threadNum] = mysum; buf[threadNum] = mysum;
__syncthreads(); __syncthreads();
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论