提交 b8e1f760 authored 作者: James Bergstra's avatar James Bergstra

GpuSum - fixes and improvements to case 010

上级 16659fd5
...@@ -888,11 +888,70 @@ class GpuSum(Op): ...@@ -888,11 +888,70 @@ class GpuSum(Op):
pattern = ''.join(str(i) for i in self.reduce_mask) pattern = ''.join(str(i) for i in self.reduce_mask)
print >> sio, """ print >> sio, """
{ {
int verbose = 0; //int n_summations = CudaNdarray_HOST_DIMS(%(x)s)[0] * CudaNdarray_HOST_DIMS(%(x)s)[2];
//if ((n_summations >= 15 * 32) && (CudaNdarray_HOST_DIMS(%(x)s)[2]>=16))
if (1) // if the alternative is less buggy, consider not using this branch
{
// If there are a lot of summations to do, then we can use simple parallelization -
// use each thread to do one sum.
// we might as well launch blocks of 32 threads because that's the warp size.
// we could schedule more threads if we were maxing out the gridsize below, but
// the gridsize is way more than the physical hardware and I think 32 threads
// on a huge grid is enough to fully use the hardware.
dim3 n_threads(32,1,1);
// We kindof reshape the input implicitly to something 4D:
// the shape A,B,C -> A, B, D, E
// where C <= D*E < C+32
// where E==32
int A = CudaNdarray_HOST_DIMS(%(x)s)[0];
int B = CudaNdarray_HOST_DIMS(%(x)s)[1];
int C = CudaNdarray_HOST_DIMS(%(x)s)[2];
int D = C/32;
if (32*D < C) D+= 1;
assert ((C <= 32*D) && (32*D < C+32));
// The gridsize would ideally be (A, D). But we do the following logic to make
// sure we don't ask for a grid that is too big.
dim3 n_blocks(A,D);
if (n_blocks.x > NUM_VECTOR_OP_BLOCKS) n_blocks.x = NUM_VECTOR_OP_BLOCKS;
if (n_blocks.x*n_blocks.y > NUM_VECTOR_OP_BLOCKS) n_blocks.y = NUM_VECTOR_OP_BLOCKS/n_blocks.x;
int n_shared = 0;
kernel_reduce_sum_010_AD_%(name)s<<<n_blocks, n_threads, n_shared>>>(
A,B,C,D,
CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_HOST_STRIDES(%(x)s)[2],
CudaNdarray_DEV_DATA(%(z)s),
CudaNdarray_HOST_STRIDES(%(z)s)[0],
CudaNdarray_HOST_STRIDES(%(z)s)[1]
);
CNDA_THREAD_SYNC;
cudaError_t sts = cudaGetLastError();
if (cudaSuccess != sts)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kernel_reduce_sum_010_%(name)s",
cudaGetErrorString(sts),
n_blocks.x,
n_blocks.y,
n_threads.x,
n_threads.y,
n_threads.z);
%(fail)s;
}
}
else
{
int verbose = 2;
dim3 n_threads(std::min(32,CudaNdarray_HOST_DIMS(%(x)s)[2])); dim3 n_threads(std::min(32,CudaNdarray_HOST_DIMS(%(x)s)[2]));
while(n_threads.x*(n_threads.y+1)<=512 while( (n_threads.x*(n_threads.y+1)<=NUM_VECTOR_OP_THREADS_PER_BLOCK)
&& n_threads.y<CudaNdarray_HOST_DIMS(%(x)s)[1]){ && (n_threads.y<CudaNdarray_HOST_DIMS(%(x)s)[1])){
n_threads.y++; n_threads.y++;
} }
...@@ -939,6 +998,7 @@ class GpuSum(Op): ...@@ -939,6 +998,7 @@ class GpuSum(Op):
%(fail)s; %(fail)s;
} }
} }
}
""" %locals() """ %locals()
def c_code_reduce_0101(self, sio, node, name, x, z, fail): def c_code_reduce_0101(self, sio, node, name, x, z, fail):
...@@ -1145,7 +1205,8 @@ class GpuSum(Op): ...@@ -1145,7 +1205,8 @@ class GpuSum(Op):
""" %locals() """ %locals()
def c_code_cache_version(self): def c_code_cache_version(self):
return (17,) #return ()
return (18,)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
...@@ -1295,7 +1356,6 @@ class GpuSum(Op): ...@@ -1295,7 +1356,6 @@ class GpuSum(Op):
const int threadCount = blockDim.x; const int threadCount = blockDim.x;
const int threadNum = threadIdx.x; const int threadNum = threadIdx.x;
extern __shared__ float buf[]; extern __shared__ float buf[];
float mysum = 0.0f;
if (warpSize != 32) if (warpSize != 32)
{ {
...@@ -1307,6 +1367,7 @@ class GpuSum(Op): ...@@ -1307,6 +1367,7 @@ class GpuSum(Op):
{ {
for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y) for (int i2 = blockIdx.y; i2 < d2; i2 += gridDim.y)
{ {
float mysum = 0.0f;
for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x) for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x)
{ {
mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2]; mysum += A[i0 * sA0 + i1 * sA1 + i2 * sA2];
...@@ -1318,10 +1379,51 @@ class GpuSum(Op): ...@@ -1318,10 +1379,51 @@ class GpuSum(Op):
} }
""" %locals() """ %locals()
if self.reduce_mask == (0,1,0): if self.reduce_mask == (0,1,0):
print >> sio, """
static __global__ void kernel_reduce_sum_010_AD_%(nodename)s(
const int A,
const int B,
const int C,
const int D,
//const int E, // THIS is 32
const float *X, const int sX0, const int sX1, const int sX2,
float * Z, const int sZ0, const int sZ1)
{
const int threadCount = blockDim.x;
const int threadNum = threadIdx.x;
float mysum = 0.0f;
if (warpSize != 32)
{
return; //TODO: set error code
}
for (int a = blockIdx.x; a < A; a += gridDim.x)
{
for (int i2_D = blockIdx.y; i2_D < D; i2_D += gridDim.y)
{
int c = i2_D * 32 + threadIdx.x;
if (c < C)
{
mysum = 0;
for (int b = 0; b < B; ++b)
{
mysum += X[a * sX0 + b * sX1 + c * sX2];
}
Z[a * sZ0 + c * sZ1] = mysum;
}
}
}
}
""" %locals()
if self.reduce_mask == (0,1,0):
#
# This kernel is optimized when the inner most dimensions have the smallest stride. # This kernel is optimized when the inner most dimensions have the smallest stride.
# this kernel uses one block for multiple column(up to 32TODO), # this kernel uses one block for multiple column(up to 32TODO),
# threads per block for each element per column. # threads per block for each element per column.
#thread.x = dim 2 contiguous #thread.x = dim 2 contiguous
#thread.y = dim 1 #thread.y = dim 1
#block.x = dim 0 #block.x = dim 0
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论