Commit 496cb1c7 authored by Arnaud Bergeron

Fix the stupid scheduling for better performance (should be much faster).

Also address some other issues that came up in code review.
Parent 5e9c7bce
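Note on the scheduling change: the previous launch configurations below put the whole problem size into the grid and ran a single thread per block (<<<block, 1>>>); the new code builds real thread blocks, capping the x and y dimensions at 32 threads each and spilling the remainder into the grid, with the batch mapped to grid.z and an in-kernel bounds check on the newly added m/n arguments. A minimal Python sketch of that launch-geometry computation (hypothetical helper name, for illustration only; the actual logic lives in the CUDA snippets in the diff):

def launch_geometry(nx, ny, nbatch, cap=32):
    # block starts as the full problem size; the batch now maps to grid.z
    block = [nx, ny, 1]
    grid = [1, 1, nbatch]
    for d in (0, 1):
        if block[d] > cap:
            # ceil-divide the oversized dimension into grid blocks of `cap` threads
            grid[d] = (block[d] + cap - 1) // cap
            block[d] = cap
    return tuple(block), tuple(grid)

For example, launch_geometry(128, 40, 16) yields block (32, 32, 1) and grid (4, 2, 16); threads that land past the true extent return early inside the kernels.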
 import numpy
 import theano
 from theano import Apply, tensor, scalar, Constant
-from theano.tensor import DimShuffle
+from theano.tensor import DimShuffle, discrete_dtypes
 from theano.gradient import grad_undefined, grad_not_implemented
@@ -14,6 +14,14 @@ if cuda_available:
 GpuDimShuffle)

 class SparseBlockGemvSS(GpuOp):
+    """
+    This op computes the dot product of specified pieces of vectors
+    and matrices, returning pieces of vectors.
+
+    This should not be directly called since the interface is subject
+    to change without notice. Use the sparse_block_dot_SS() function
+    for a stable interface.
+    """
     def __init__(self, inplace=False):
         self.inplace = inplace
         if self.inplace:
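To make the docstring above concrete, here is a rough pure-NumPy sketch of the computation (the helper name and the exact argument shapes are assumptions for illustration; they are not part of this module):

import numpy

def sparse_block_gemv_reference(o, W, h, inputIdx, outputIdx):
    # o: (batch, outWin, outSize) initial output pieces (e.g. biases)
    # W: (inBlocks, outBlocks, inSize, outSize) full weight tensor
    # h: (batch, inWin, inSize) selected input pieces
    # inputIdx / outputIdx: (batch, inWin) / (batch, outWin) block indices
    out = o.copy()
    for b in range(h.shape[0]):
        for j in range(outputIdx.shape[1]):
            for i in range(inputIdx.shape[1]):
                # each selected input piece is multiplied by the matching
                # block of W and accumulated into the selected output piece
                out[b, j, :] += numpy.dot(h[b, i, :],
                                          W[inputIdx[b, i], outputIdx[b, j]])
    return out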
@@ -38,8 +46,8 @@ class SparseBlockGemvSS(GpuOp):
         assert inputIdx.ndim == 2
         assert outputIdx.ndim == 2
-        assert 'int' in inputIdx.type.dtype
-        assert 'int' in outputIdx.type.dtype
+        assert inputIdx.type.dtype in discrete_dtypes
+        assert outputIdx.type.dtype in discrete_dtypes
         return Apply(self, [o, W, h, inputIdx, outputIdx],
 [o.type()])
@@ -75,7 +83,7 @@ const npy_intp *oIdx, int oI_str_0
 __global__ void
 SparseBlockGemv_reduce(
-int red_dim,
+int red_dim, int m, int n,
 float *outB, int i_str_0, int i_str_1, int i_str_2, int i_str_3,
 float *out, int o_str_0, int o_str_1, int o_str_2
 ) {
@@ -83,6 +91,7 @@ float *out, int o_str_0, int o_str_1, int o_str_2
 int j = threadIdx.y + blockDim.y * blockIdx.y;
 int b = threadIdx.z + blockDim.z * blockIdx.z;
 float s = 0.0;
+if (i > m || j > n) return;
 float *oB = &outB[b * i_str_0 + i * i_str_2 + j * i_str_3];
 for (int k = 0; k < red_dim; k++) {
 s += oB[k * i_str_1];
@@ -94,8 +103,8 @@ float *out, int o_str_0, int o_str_1, int o_str_2
 cudaError_t err;
 PyArrayObject *aa = (PyArrayObject *)PyArray_Cast(a, NPY_INTP);
 if (aa == NULL) { return -1; }
-err = cudaMemcpy(b, PyArray_DATA(aa), PyArray_NBYTES(aa),
+err = cudaMemcpyAsync(b, PyArray_DATA(aa), PyArray_NBYTES(aa),
 cudaMemcpyHostToDevice);
 Py_DECREF(aa);
 if (err != cudaSuccess) {
 PyErr_SetString(PyExc_RuntimeError, "Cannot copy index data to GPU");
@@ -186,13 +195,22 @@ if (CudaNdarray_CopyFromCudaNdarray(%(out)s, %(o)s)) {
 if (SparseBlockGemv_copy(%(outputIdx)s, %(name)s_oIdx) == -1)
 { %(fail)s }
 { /* Prepare lists for the batch */
-// NOT batch-ready
 dim3 block;
+dim3 grid;
 block.x = CudaNdarray_HOST_DIMS(%(h)s)[1];
 block.y = CudaNdarray_HOST_DIMS(%(o)s)[1];
-block.z = CudaNdarray_HOST_DIMS(%(o)s)[0]; // batch size
-SparseBlockGemv_fill_lists<<<block, 1>>>(
-block.x*block.y*block.z,
+grid.z = CudaNdarray_HOST_DIMS(%(o)s)[0]; // batch size
+int n = block.x*block.y*grid.z;
+if (block.x > 32) {
+grid.x = (block.x + 31)/32;
+block.x = 32;
+}
+if (block.y > 32) {
+grid.y = (block.y + 31)/32;
+block.y = 32;
+}
+SparseBlockGemv_fill_lists<<<grid, block>>>(
+n,
 %(name)s_inp_list,
 %(name)s_out_list,
 %(name)s_W_list,
@@ -236,11 +254,21 @@ CudaNdarray_HOST_DIMS(%(o)s)[2],
 }
 { /* Perform final reduction and add biases */
 dim3 block;
+dim3 grid;
 block.x = CudaNdarray_HOST_DIMS(%(o)s)[1];
 block.y = CudaNdarray_HOST_DIMS(%(o)s)[2];
-block.z = CudaNdarray_HOST_DIMS(%(o)s)[0];
-SparseBlockGemv_reduce<<<block, 1>>>(
+grid.z = CudaNdarray_HOST_DIMS(%(o)s)[0];
+if (block.x > 32) {
+grid.x = (block.x + 31)/32;
+block.x = 32;
+}
+if (block.y > 32) {
+grid.y = (block.y + 31)/32;
+block.y = 32;
+}
+SparseBlockGemv_reduce<<<grid, block>>>(
 CudaNdarray_HOST_DIMS(%(h)s)[1],
+CudaNdarray_HOST_DIMS(%(o)s)[1], CudaNdarray_HOST_DIMS(%(o)s)[2],
 %(name)s_outB,
 CudaNdarray_HOST_DIMS(%(h)s)[1] *
 CudaNdarray_HOST_DIMS(%(o)s)[1] *
@@ -259,7 +287,7 @@ CudaNdarray_HOST_STRIDES(%(out)s)[2]);
 W=W, fail=sub['fail'], name=nodename)

     def c_code_cache_version(self):
-        return (6,)
+        return (7,)

     def grad(self, inputs, grads):
         o, W, h, inputIdx, outputIdx = inputs
@@ -284,6 +312,14 @@ sparse_block_gemv_ss_inplace = SparseBlockGemvSS(True)


 class SparseBlockOuterSS(GpuOp):
+    """
+    This computes the outer product of two sets of pieces of vectors
+    updating a full matrix with the results.
+
+    This op should not be called directly since its interface is
+    subject to change without notice. It is involved in the gradient
+    of SparseBlockGemvSS.
+    """
     def __init__(self, inplace=False):
         self.inplace = inplace
         if self.inplace:
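Similarly, a rough pure-NumPy sketch of the update this op performs (again, the helper name and argument shapes are assumptions for illustration): each selected pair of pieces adds a scaled outer product into the corresponding block of the full matrix, which is the weight-gradient counterpart of SparseBlockGemvSS.

import numpy

def sparse_block_outer_reference(out, x, y, xIdx, yIdx, alpha=1.0):
    # out: (xBlocks, yBlocks, xSize, ySize) full matrix being updated
    # x: (batch, xWin, xSize), y: (batch, yWin, ySize) selected pieces
    # xIdx / yIdx: (batch, xWin) / (batch, yWin) block indices
    out = out.copy()
    for b in range(x.shape[0]):
        for i in range(xIdx.shape[1]):
            for j in range(yIdx.shape[1]):
                # rank-1 update of the addressed block
                out[xIdx[b, i], yIdx[b, j]] += alpha * numpy.outer(x[b, i, :],
                                                                   y[b, j, :])
    return out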
@@ -342,9 +378,10 @@ __global__ void _sgerBH_gen_small(const float *x[], int incx,
 const float *y[], int incy,
 float alpha,
 float *A[], int lda,
-int b) {
+int b, int m, int n) {
 int i = blockIdx.x * blockDim.x + threadIdx.x;
 int j = blockIdx.y * blockDim.y + threadIdx.y;
+if (i > m || j > n) return;
 for (int p = blockIdx.z * blockDim.z + threadIdx.z;
 p < b;
 p += blockDim.z * gridDim.z) {
@@ -363,10 +400,18 @@ static cublasStatus_t SgerBatched(cublasHandle_t handle, int m, int n,
 dim3 grid(1, 1, batchCount);
 cublasPointerMode_t mode;
 cudaError_t err;
+if (block.x > 32) {
+grid.x = (block.x + 31)/32;
+block.x = 32;
+}
+if (block.y > 32) {
+grid.y = (block.y + 31)/32;
+block.y = 32;
+}
 cublasGetPointerMode(handle, &mode);
 if (mode == CUBLAS_POINTER_MODE_HOST) {
 _sgerBH_gen_small<<<grid, block>>>(x, incx, y, incy, *alpha, A, lda,
-batchCount);
+batchCount, m, n);
 } else {
 return CUBLAS_STATUS_NOT_SUPPORTED;
 }
@@ -380,8 +425,8 @@ static int SparseBlockOuter_copy(PyArrayObject *a, npy_intp *b) {
 cudaError_t err;
 PyArrayObject *aa = (PyArrayObject *)PyArray_Cast(a, NPY_INTP);
 if (aa == NULL) { return -1; }
-err = cudaMemcpy(b, PyArray_DATA(aa), PyArray_NBYTES(aa),
+err = cudaMemcpyAsync(b, PyArray_DATA(aa), PyArray_NBYTES(aa),
 cudaMemcpyHostToDevice);
 Py_DECREF(aa);
 if (err != cudaSuccess) {
 PyErr_SetString(PyExc_RuntimeError, "Cannot copy index data to GPU");
@@ -465,11 +510,21 @@ if (SparseBlockOuter_copy(%(yIdx)s, %(name)s_yIdx) == -1)
 { %(fail)s }
 {
 dim3 block;
+dim3 grid;
 block.x = CudaNdarray_HOST_DIMS(%(x)s)[1];
 block.y = CudaNdarray_HOST_DIMS(%(y)s)[1];
-block.z = CudaNdarray_HOST_DIMS(%(x)s)[0];
-SparseBlockOuter_fill_lists<<<block, 1>>>(
-block.x * block.y * block.z,
+grid.z = CudaNdarray_HOST_DIMS(%(x)s)[0];
+int n = block.x * block.y * grid.z;
+if (block.x > 32) {
+grid.x = (block.x + 31)/32;
+block.x = 32;
+}
+if (block.y > 32) {
+grid.y = (block.y + 31)/32;
+block.y = 32;
+}
+SparseBlockOuter_fill_lists<<<grid, block>>>(
+n,
 %(name)s_x_list,
 %(name)s_y_list,
 %(name)s_out_list,
@@ -497,14 +552,14 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
 CudaNdarray_HOST_DIMS(%(x)s)[1] *
 CudaNdarray_HOST_DIMS(%(y)s)[1]);
 if (err != CUBLAS_STATUS_SUCCESS) {
-PyErr_SetString(PyExc_RuntimeError, "SgemmBatched failed");
+PyErr_SetString(PyExc_RuntimeError, "SgerBatched failed");
 %(fail)s
 }
 }""" % dict(x=x, y=y, out=out, xIdx=xIdx, yIdx=yIdx, name=name,
 alpha=alpha, fail=sub['fail'])

     def c_code_cache_version(self):
-        return (4,)
+        return (5,)


 sparse_block_outer_ss = SparseBlockOuterSS(False)