提交 5e69ec44 authored 作者: Arnaud Bergeron's avatar Arnaud Bergeron

Limit the total size of blocks to 512 and the size of the grids to 65535.

This should help older GPUs run at all and newer GPUs fit more blocks on one SM. With this change the code is cc 2.0+ compatible. But it will only be fast on cc 3.0+ cards (due to atomicAdd).
上级 b4b6a31e
...@@ -85,19 +85,22 @@ __global__ void _sgemvBH_N_a1_b1_small(const float *A[], int lda, ...@@ -85,19 +85,22 @@ __global__ void _sgemvBH_N_a1_b1_small(const float *A[], int lda,
const float *x[], int incx, const float *x[], int incx,
float *y[], int incy, float *y[], int incy,
int b, int m, int n) { int b, int m, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x; for (int p = blockIdx.y * blockDim.y + threadIdx.y; p < b;
int p = blockIdx.y * blockDim.y + threadIdx.y; p += gridDim.y * blockDim.y) {
if (i >= m || p >= b) return; for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < m;
float yi = 0.0f; i += gridDim.x * blockDim.x) {
const float *Ap = A[p] + i; float yi = 0.0f;
const float *xp = x[p]; const float *Ap = A[p] + i;
# pragma unroll 32 const float *xp = x[p];
for (int j = 0; j < n; j++) { #pragma unroll 32
yi += Ap[0] * xp[0]; for (int j = 0; j < n; j++) {
Ap += lda; yi += Ap[0] * xp[0];
xp += incx; Ap += lda;
xp += incx;
}
atomicAdd(&y[p][i*incy], yi);
}
} }
atomicAdd(&y[p][i*incy], yi);
} }
__global__ void _sgemvBH_T_a1_b1_small(const float *A[], int lda, __global__ void _sgemvBH_T_a1_b1_small(const float *A[], int lda,
...@@ -130,13 +133,20 @@ static cublasStatus_t SgemvBatched(cublasHandle_t handle, ...@@ -130,13 +133,20 @@ static cublasStatus_t SgemvBatched(cublasHandle_t handle,
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
cublasPointerMode_t mode; cublasPointerMode_t mode;
cudaError_t err; cudaError_t err;
if (block.x > 32) { if (m < 512) {
grid.x = (block.x + 31)/32;
block.x = 32; block.x = 32;
if (batchCount > 16)
block.y = 16;
else
block.y = batchCount;
} else {
block.x = 512;
block.y = 1;
} }
if (block.y > 32) { grid.x = (m + block.x - 1) / block.x;
grid.y = (block.y + 31)/32; grid.y = (batchCount + block.y - 1) / block.y;
block.y = 32; if (grid.x * grid.y > 65535) {
grid.y = (65535 / grid.x);
} }
cublasGetPointerMode(handle, &mode); cublasGetPointerMode(handle, &mode);
if (mode != CUBLAS_POINTER_MODE_HOST) if (mode != CUBLAS_POINTER_MODE_HOST)
...@@ -255,12 +265,12 @@ if (CudaNdarray_CopyFromCudaNdarray(%(out)s, %(o)s)) { ...@@ -255,12 +265,12 @@ if (CudaNdarray_CopyFromCudaNdarray(%(out)s, %(o)s)) {
grid.z = CudaNdarray_HOST_DIMS(%(o)s)[0]; // batch size grid.z = CudaNdarray_HOST_DIMS(%(o)s)[0]; // batch size
int n = block.x*block.y*grid.z; int n = block.x*block.y*grid.z;
if (block.x > 32) { if (block.x > 32) {
grid.x = (block.x + 31)/32; grid.x = (block.x + 31) / 32;
block.x = 32; block.x = 32;
} }
if (block.y > 32) { if (block.x * block.y > 512) {
grid.y = (block.y + 31)/32; grid.y = (block.y + 15) / 16;
block.y = 32; block.y = 16;
} }
SparseBlockGemv_fill_lists<<<grid, block>>>( SparseBlockGemv_fill_lists<<<grid, block>>>(
n, n,
...@@ -307,7 +317,7 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1], ...@@ -307,7 +317,7 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
W=W, fail=sub['fail'], name=nodename) W=W, fail=sub['fail'], name=nodename)
def c_code_cache_version(self): def c_code_cache_version(self):
return (8,) return (9,)
def grad(self, inputs, grads): def grad(self, inputs, grads):
o, W, h, inputIdx, outputIdx = inputs o, W, h, inputIdx, outputIdx = inputs
...@@ -401,10 +411,11 @@ __global__ void _sgerBH_gen_small(const float *x[], int incx, ...@@ -401,10 +411,11 @@ __global__ void _sgerBH_gen_small(const float *x[], int incx,
int b, int m, int n) { int b, int m, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x; int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y; int j = blockIdx.y * blockDim.y + threadIdx.y;
int p = blockIdx.z;
if (i > m || j > n) return; if (i > m || j > n) return;
atomicAdd(&A[p][j * lda + i], for (int p = blockIdx.z; p < b; p += gridDim.z) {
alpha * x[p][i * incx] * y[p][j * incy]); atomicAdd(&A[p][j * lda + i],
alpha * x[p][i * incx] * y[p][j * incy]);
}
} }
static cublasStatus_t SgerBatched(cublasHandle_t handle, int m, int n, static cublasStatus_t SgerBatched(cublasHandle_t handle, int m, int n,
...@@ -417,13 +428,29 @@ static cublasStatus_t SgerBatched(cublasHandle_t handle, int m, int n, ...@@ -417,13 +428,29 @@ static cublasStatus_t SgerBatched(cublasHandle_t handle, int m, int n,
dim3 grid(1, 1, batchCount); dim3 grid(1, 1, batchCount);
cublasPointerMode_t mode; cublasPointerMode_t mode;
cudaError_t err; cudaError_t err;
if (block.x > 32) { if (incx == 1) {
grid.x = (block.x + 31)/32; if (block.x > 32) {
block.x = 32; grid.x = (block.x + 31)/32;
block.x = 32;
}
if (block.x * block.y > 512) {
grid.y = (block.y + 15) / 16;
block.y = 16;
}
} else {
if (block.y > 32) {
grid.y = (block.y + 31)/32;
block.y = 32;
}
if (block.x * block.y > 512) {
grid.x = (block.x + 15) / 16;
block.x = 16;
}
} }
if (block.y > 32) { if (grid.x * grid.y * grid.z > 65535) {
grid.y = (block.y + 31)/32; // If grid.x * grid.y is bigger than 65535 you deserve the error
block.y = 32; // you'll get later because that is way too big for this op.
grid.z = (65535 / (grid.x * grid.y));
} }
cublasGetPointerMode(handle, &mode); cublasGetPointerMode(handle, &mode);
if (mode == CUBLAS_POINTER_MODE_HOST) { if (mode == CUBLAS_POINTER_MODE_HOST) {
...@@ -533,12 +560,12 @@ if (SparseBlockOuter_copy(%(yIdx)s, %(name)s_yIdx) == -1) ...@@ -533,12 +560,12 @@ if (SparseBlockOuter_copy(%(yIdx)s, %(name)s_yIdx) == -1)
grid.z = CudaNdarray_HOST_DIMS(%(x)s)[0]; grid.z = CudaNdarray_HOST_DIMS(%(x)s)[0];
int n = block.x * block.y * grid.z; int n = block.x * block.y * grid.z;
if (block.x > 32) { if (block.x > 32) {
grid.x = (block.x + 31)/32; grid.x = (block.x + 31) / 32;
block.x = 32; block.x = 32;
} }
if (block.y > 32) { if (block.x * block.y > 512) {
grid.y = (block.y + 31)/32; grid.y = (block.y + 15) / 16;
block.y = 32; block.y = 16;
} }
SparseBlockOuter_fill_lists<<<grid, block>>>( SparseBlockOuter_fill_lists<<<grid, block>>>(
n, n,
...@@ -576,7 +603,7 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1], ...@@ -576,7 +603,7 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
alpha=alpha, fail=sub['fail']) alpha=alpha, fail=sub['fail'])
def c_code_cache_version(self): def c_code_cache_version(self):
return (6,) return (7,)
sparse_block_outer_ss = SparseBlockOuterSS(False) sparse_block_outer_ss = SparseBlockOuterSS(False)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论