Commit 9b3168ae authored by James Bergstra

special cases for GpuSum

Parent 3248be43
@@ -337,6 +337,26 @@ class GpuDimShuffle(Op):
            return (1,0)
class GpuSum(Op):
    """GpuSum is a reduction along some dimensions by summation.

    The dimensions along which to sum are specified by the `reduce_mask` that you pass to
    the constructor.  The `reduce_mask` is a tuple of booleans (actually integers 0 or 1)
    that specifies, for each input dimension, whether to reduce it (1) or not (0).

    For example:
      - reduce_mask == (1,) sums a vector to a scalar
      - reduce_mask == (1,0) computes the sum of each column in a matrix
      - reduce_mask == (0,1) computes the sum of each row in a matrix
      - reduce_mask == (1,1,1) computes the sum of all elements in a 3-tensor

    :note: a reduce_mask of all zeros is a sort of 'copy', and may be removed during
        graph optimization.
    """
    def __init__(self, reduce_mask):
        self.reduce_mask = tuple(reduce_mask)
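
As a plain-NumPy restatement of these semantics (a cross-check sketch, not part of the commit; `sum_with_mask` is a hypothetical name), summing over every axis whose mask entry is 1 reproduces each case above:

    import numpy as np

    def sum_with_mask(a, reduce_mask):
        # Sum over every axis whose mask entry is 1; keep the others.
        axes = tuple(i for i, m in enumerate(reduce_mask) if m)
        return a.sum(axis=axes) if axes else a.copy()

    x = np.arange(6, dtype='float32').reshape(2, 3)
    assert sum_with_mask(x, (1, 0)).shape == (3,)  # column sums
    assert sum_with_mask(x, (0, 1)).shape == (2,)  # row sums
    assert sum_with_mask(x, (1, 1)).shape == ()    # sum of all elements
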
@@ -420,9 +440,11 @@ class GpuSum(Op):
        # Now perform the reduction
        #
        if self.reduce_mask == (1,):
            self.c_code_reduce_1(sio, node, name, x, z, fail)
        elif self.reduce_mask == (1,1):
            self.c_code_reduce_11(sio, node, name, x, z, fail)
        elif self.reduce_mask == (1,0):
            self.c_code_reduce_10(sio, node, name, x, z, fail)
        elif self.reduce_mask == (1,0,1,1):
            self.c_code_reduce_1011(sio, node, name, x, z, fail)
        else:
@@ -430,42 +452,130 @@ class GpuSum(Op):
            assert 0
        return sio.getvalue()
    def c_code_reduce_1(self, sio, node, name, x, z, fail):
        print >> sio, """
        {
            int verbose = 0;
            dim3 n_threads(
                    std::min(CudaNdarray_HOST_DIMS(cnda_%(x)s)[0],
                        NUM_VECTOR_OP_THREADS_PER_BLOCK));
            dim3 n_blocks(1);
            if (verbose) printf("running kernel_reduce_sum_1_%(name)s\\n");
            int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
            kernel_reduce_sum_1_%(name)s<<<n_blocks, n_threads, n_shared>>>(
                    CudaNdarray_HOST_DIMS(cnda_%(x)s)[0],
                    CudaNdarray_DEV_DATA(cnda_%(x)s),
                    CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0],
                    CudaNdarray_DEV_DATA(cnda_%(z)s));
            CNDA_THREAD_SYNC;
            if (cudaSuccess != cudaGetLastError())
            {
                %(fail)s;
            }
        }
        """ %locals()
    def c_code_reduce_11(self, sio, node, name, x, z, fail):
        print >> sio, """
        {
            int verbose = 0;
            dim3 n_threads(
                    std::min(CudaNdarray_HOST_DIMS(cnda_%(x)s)[1],
                        NUM_VECTOR_OP_THREADS_PER_BLOCK));
            while (n_threads.y * n_threads.x < NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.y;
            n_threads.y -= 1;
            if (n_threads.y > CudaNdarray_HOST_DIMS(cnda_%(x)s)[0])
                n_threads.y = CudaNdarray_HOST_DIMS(cnda_%(x)s)[0];
            dim3 n_blocks(1);
            if (verbose) printf("running kernel_reduce_sum_11_%(name)s\\n");
            int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
            kernel_reduce_sum_11_%(name)s<<<n_blocks, n_threads, n_shared>>>(
                    CudaNdarray_HOST_DIMS(cnda_%(x)s)[0],
                    CudaNdarray_HOST_DIMS(cnda_%(x)s)[1],
                    CudaNdarray_DEV_DATA(cnda_%(x)s),
                    CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0],
                    CudaNdarray_HOST_STRIDES(cnda_%(x)s)[1],
                    CudaNdarray_DEV_DATA(cnda_%(z)s));
            CNDA_THREAD_SYNC;
            if (cudaSuccess != cudaGetLastError())
            {
                %(fail)s;
            }
        }
        """ %locals()
    def c_code_reduce_10(self, sio, node, name, x, z, fail):
        print >> sio, """
        {
            int verbose = 0;
            dim3 n_threads(
                    std::min(CudaNdarray_HOST_DIMS(cnda_%(x)s)[0],
                        NUM_VECTOR_OP_THREADS_PER_BLOCK));
            dim3 n_blocks(CudaNdarray_HOST_DIMS(cnda_%(x)s)[1]);
            if (verbose) printf("running kernel_reduce_sum_10_%(name)s\\n");
            int n_shared = sizeof(float) * n_threads.x;
            kernel_reduce_sum_10_%(name)s<<<n_blocks, n_threads, n_shared>>>(
                    CudaNdarray_HOST_DIMS(cnda_%(x)s)[0],
                    CudaNdarray_HOST_DIMS(cnda_%(x)s)[1],
                    CudaNdarray_DEV_DATA(cnda_%(x)s),
                    CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0],
                    CudaNdarray_HOST_STRIDES(cnda_%(x)s)[1],
                    CudaNdarray_DEV_DATA(cnda_%(z)s),
                    CudaNdarray_HOST_STRIDES(cnda_%(z)s)[0]);
            CNDA_THREAD_SYNC;
            if (cudaSuccess != cudaGetLastError())
            {
                %(fail)s;
            }
        }
        """ %locals()
    def c_code_reduce_1011(self, sio, node, name, x, z, fail):
        print >> sio, """
        {
            int verbose = 0;
            dim3 n_threads(
                    std::min(CudaNdarray_HOST_DIMS(cnda_%(x)s)[3],
                        NUM_VECTOR_OP_THREADS_PER_BLOCK));
            while (n_threads.y * n_threads.x < NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.y;
            n_threads.y -= 1;
            if (n_threads.y > CudaNdarray_HOST_DIMS(cnda_%(x)s)[2])
                n_threads.y = CudaNdarray_HOST_DIMS(cnda_%(x)s)[2];
            while (n_threads.x * n_threads.y * n_threads.z < NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.z;
            n_threads.z -= 1;
            if (n_threads.z > 64)
                n_threads.z = 64;
            if (n_threads.z > CudaNdarray_HOST_DIMS(cnda_%(x)s)[0])
                n_threads.z = CudaNdarray_HOST_DIMS(cnda_%(x)s)[0];
            dim3 n_blocks(CudaNdarray_HOST_DIMS(cnda_%(x)s)[1]);
            if (verbose) printf("running kernel_reduce_sum_1011_%(name)s\\n");
            if (verbose) fprint_CudaNdarray(stdout, cnda_%(x)s);
            if (verbose) fprint_CudaNdarray(stdout, cnda_%(z)s);
            int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
            kernel_reduce_sum_1011_%(name)s<<<n_blocks, n_threads, n_shared>>>(
                    CudaNdarray_HOST_DIMS(cnda_%(x)s)[0],
                    CudaNdarray_HOST_DIMS(cnda_%(x)s)[1],
                    CudaNdarray_HOST_DIMS(cnda_%(x)s)[2],
                    CudaNdarray_HOST_DIMS(cnda_%(x)s)[3],
                    CudaNdarray_DEV_DATA(cnda_%(x)s),
                    CudaNdarray_HOST_STRIDES(cnda_%(x)s)[0],
                    CudaNdarray_HOST_STRIDES(cnda_%(x)s)[1],
                    CudaNdarray_HOST_STRIDES(cnda_%(x)s)[2],
                    CudaNdarray_HOST_STRIDES(cnda_%(x)s)[3],
                    CudaNdarray_DEV_DATA(cnda_%(z)s),
                    CudaNdarray_HOST_STRIDES(cnda_%(z)s)[0]);
            CNDA_THREAD_SYNC;
            if (cudaSuccess != cudaGetLastError())
            {
                %(fail)s;
            }
        }
        """ %locals()
@@ -475,57 +585,226 @@ class GpuSum(Op):
    def c_support_code_apply(self, node, nodename):
        sio = StringIO.StringIO()
        if self.reduce_mask == (1,):
            # this kernel is ok for up to a few thousand elements, but
            # it only runs on ONE multiprocessor
            print >> sio, """
            static __global__ void kernel_reduce_sum_1_%(nodename)s(
                    const unsigned int d0,
                    const float *A, const int sA0,
                    float * Z)
            {
                const int threadCount = blockDim.x;
                const int threadNum = threadIdx.x;
                extern __shared__ float buf[];
                float mysum = 0.0f;

                if (warpSize != 32)
                {
                    return;  //TODO: set error code
                }

                for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
                {
                    float Ai = A[i0 * sA0];
                    mysum += Ai;
                }
                buf[threadNum] = mysum;
                __syncthreads();

                // rest of function is handled by one warp
                if (threadNum < warpSize)
                {
                    for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
                    {
                        mysum += buf[i];
                    }
                    buf[threadNum] = mysum;
                    if (threadNum < 16)
                    {
                        //reduce so that threadNum 0 has the sum of everything
                        if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
                        if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
                        if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
                        if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
                        if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
                        if (threadNum == 0)
                        {
                            Z[0] = buf[0];
                        }
                    }
                }
            }
            """ %locals()
        if self.reduce_mask == (1,1):
            # this kernel is ok for up to a few thousand elements, but
            # it only runs on ONE multiprocessor
            print >> sio, """
            static __global__ void kernel_reduce_sum_11_%(nodename)s(
                    const int d0,
                    const int d1,
                    const float *A, const int sA0, const int sA1,
                    float * Z)
            {
                const int threadCount = blockDim.x * blockDim.y;
                const int threadNum = threadIdx.y*blockDim.x + threadIdx.x;
                extern __shared__ float buf[];
                float mysum = 0.0f;

                if (warpSize != 32)
                {
                    return;  //TODO: set error code
                }

                for (int i0 = threadIdx.y; i0 < d0; i0 += blockDim.y)
                {
                    for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
                    {
                        float Ai = A[i0 * sA0 + i1 * sA1];
                        mysum += Ai;
                    }
                }
                buf[threadNum] = mysum;
                __syncthreads();

                // rest of function is handled by one warp
                if (threadNum < warpSize)
                {
                    for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
                    {
                        mysum += buf[i];
                    }
                    buf[threadNum] = mysum;
                    if (threadNum < 16)
                    {
                        //reduce so that threadNum 0 has the sum of everything
                        if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
                        if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
                        if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
                        if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
                        if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
                        if (threadNum == 0)
                        {
                            Z[0] = buf[0];
                        }
                    }
                }
            }
            """ %locals()
        if self.reduce_mask == (1,0):
            # this kernel uses one block for each column, and the threads
            # in each block sum that column's elements
            #TODO: This kernel is pretty inefficient in terms of reading, because if A is
            #      c_contiguous (typical case) then each warp is accessing non-contiguous
            #      memory (a segment of a column).
            print >> sio, """
            static __global__ void kernel_reduce_sum_10_%(nodename)s(
                    const int d0,
                    const int d1,
                    const float *A, const int sA0, const int sA1,
                    float * Z, const int sZ0)
            {
                const int threadCount = blockDim.x;
                const int threadNum = threadIdx.x;
                extern __shared__ float buf[];
                float mysum = 0.0f;

                if (warpSize != 32)
                {
                    return;  //TODO: set error code
                }

                for (int i0 = threadIdx.x; i0 < d0; i0 += blockDim.x)
                {
                    float Ai = A[i0 * sA0 + blockIdx.x * sA1];
                    mysum += Ai;
                }
                buf[threadNum] = mysum;
                __syncthreads();

                // rest of function is handled by one warp
                if (threadNum < warpSize)
                {
                    for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
                    {
                        mysum += buf[i];
                    }
                    buf[threadNum] = mysum;
                    if (threadNum < 16)
                    {
                        //reduce so that threadNum 0 has the sum of everything
                        if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
                        if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
                        if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
                        if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
                        if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
                        if (threadNum == 0)
                        {
                            Z[blockIdx.x * sZ0] = buf[0];
                        }
                    }
                }
            }
            """ %locals()
        if self.reduce_mask == (1,0,1,1):
            print >> sio, """
            static __global__ void kernel_reduce_sum_1011_%(nodename)s(
                    const unsigned int d0,
                    const unsigned int d1,
                    const unsigned int d2,
                    const unsigned int d3,
                    const float *A, const int sA0, const int sA1, const int sA2, const int sA3,
                    float * Z, const int sZ0)
            {
                const int threadCount = blockDim.x * blockDim.y * blockDim.z;
                const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
                extern __shared__ float buf[];
                float mysum = 0.0f;

                if (warpSize != 32)
                {
                    return;  //TODO: set error code
                }

                for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z)
                {
                    for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y)
                    {
                        for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x)
                        {
                            float Ai = A[i0 * sA0 + blockIdx.x * sA1 + i2 * sA2 + i3 * sA3];
                            mysum += Ai;
                        }
                    }
                }
                buf[threadNum] = mysum;
                __syncthreads();

                // rest of function is handled by one warp
                if (threadNum < warpSize)
                {
                    for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
                    {
                        mysum += buf[i];
                    }
                    buf[threadNum] = mysum;
                    if (threadNum < 16)
                    {
                        //reduce so that threadNum 0 has the sum of everything
                        if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
                        if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
                        if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
                        if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
                        if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
                        if (threadNum == 0)
                        {
                            Z[blockIdx.x*sZ0] = buf[0];
                        }
                    }
                }
            }
            """ %locals()
        return sio.getvalue()
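
For testing, a NumPy oracle restricted to the four masks special-cased above makes a convenient cross-check (a sketch, independent of this commit; `gpu_sum_reference` is a hypothetical name):

    import numpy as np

    def gpu_sum_reference(A, reduce_mask):
        # NumPy oracle for the masks special-cased above.
        assert reduce_mask in [(1,), (1, 1), (1, 0), (1, 0, 1, 1)]
        axes = tuple(i for i, m in enumerate(reduce_mask) if m)
        return A.sum(axis=axes)

    for shape, mask in [((100,), (1,)), ((8, 9), (1, 1)),
                        ((8, 9), (1, 0)), ((2, 3, 4, 5), (1, 0, 1, 1))]:
        A = np.random.rand(*shape).astype('float32')
        assert gpu_sum_reference(A, mask).shape == tuple(
            d for d, m in zip(shape, mask) if not m)
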
class GpuReshape(tensor.Reshape):