Commit 5e9c7bce authored by Arnaud Bergeron

Make a custom ger kernel that uses atomicAdd to do the addition

Remove the beta parameter since it's always 1 anyway.
Parent 52cd5ee4
...@@ -298,16 +298,14 @@ class SparseBlockOuterSS(GpuOp): ...@@ -298,16 +298,14 @@ class SparseBlockOuterSS(GpuOp):
def __str__(self): def __str__(self):
return "SparseBlockOuterSS%s" % ("{inplace}" if self.inplace else "") return "SparseBlockOuterSS%s" % ("{inplace}" if self.inplace else "")
def make_node(self, o, x, y, xIdx, yIdx, alpha=None, beta=None): def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
one = tensor.constant(numpy.asarray(1.0, dtype='float32')) one = tensor.constant(numpy.asarray(1.0, dtype='float32'))
o = basic_ops.as_cuda_ndarray_variable(o) o = basic_ops.as_cuda_ndarray_variable(o)
x = basic_ops.as_cuda_ndarray_variable(x) x = basic_ops.as_cuda_ndarray_variable(x)
y = basic_ops.as_cuda_ndarray_variable(y) y = basic_ops.as_cuda_ndarray_variable(y)
if alpha is None: if alpha is None:
alpha = one alpha = one
if beta is None: return Apply(self, [o, x, y, xIdx, yIdx, alpha],
beta = one
return Apply(self, [o, x, y, xIdx, yIdx, alpha, beta],
[o.type()]) [o.type()])
def infer_shape(self, node, input_shapes): def infer_shape(self, node, input_shapes):
...@@ -339,6 +337,45 @@ const npy_intp *yIdx, int yI_str_0 ...@@ -339,6 +337,45 @@ const npy_intp *yIdx, int yI_str_0
yIdx[b * yI_str_0 + j] * o_str_1]; yIdx[b * yI_str_0 + j] * o_str_1];
} }
/* Batched rank-1 update kernel, tuned for the small sizes (< 512) we
 * normally get.  For each batch element p it accumulates
 * alpha * x[p] * y[p]^T into A[p], using atomicAdd so that concurrent
 * updates to the same output entry (e.g. repeated block indices handing
 * the same A[p] pointer to several batch slots) combine correctly. */
__global__ void _sgerBH_gen_small(const float *x[], int incx,
                                  const float *y[], int incy,
                                  float alpha,
                                  float *A[], int lda,
                                  int b) {
  /* One thread per (row, col) entry of the m x n output block. */
  const int row = blockIdx.x * blockDim.x + threadIdx.x;
  const int col = blockIdx.y * blockDim.y + threadIdx.y;
  /* Walk the batch along z in strides so any z-extent covers all b slots. */
  const int z_start = blockIdx.z * blockDim.z + threadIdx.z;
  const int z_step = blockDim.z * gridDim.z;
  for (int p = z_start; p < b; p += z_step) {
    atomicAdd(&A[p][col * lda + row],
              alpha * x[p][row * incx] * y[p][col * incy]);
  }
}
/* Launch _sgerBH_gen_small to perform a batched SGER (A[p] += alpha *
 * x[p] * y[p]^T for p in [0, batchCount)).  Mirrors the cublas*Batched
 * calling convention so it can slot in where cublasSgemmBatched was used.
 * Only CUBLAS_POINTER_MODE_HOST is supported, since alpha is dereferenced
 * on the host before launch. */
static cublasStatus_t SgerBatched(cublasHandle_t handle, int m, int n,
                                  const float *alpha,
                                  const float *x[], int incx,
                                  const float *y[], int incy,
                                  float *A[], int lda,
                                  int batchCount) {
  cublasPointerMode_t ptr_mode;
  cublasGetPointerMode(handle, &ptr_mode);
  /* Guard clause: device-side alpha would require a different launch path. */
  if (ptr_mode != CUBLAS_POINTER_MODE_HOST)
    return CUBLAS_STATUS_NOT_SUPPORTED;
  /* One thread per output entry; batch spread along z.  This assumes the
   * usual small sizes (m * n within the per-block thread limit); an
   * oversized block shows up as a launch error below. */
  dim3 threads(m, n, 1);
  dim3 blocks(1, 1, batchCount);
  _sgerBH_gen_small<<<blocks, threads>>>(x, incx, y, incy, *alpha,
                                         A, lda, batchCount);
  if (cudaGetLastError() != cudaSuccess)
    return CUBLAS_STATUS_EXECUTION_FAILED;
  return CUBLAS_STATUS_SUCCESS;
}
static int SparseBlockOuter_copy(PyArrayObject *a, npy_intp *b) { static int SparseBlockOuter_copy(PyArrayObject *a, npy_intp *b) {
cudaError_t err; cudaError_t err;
PyArrayObject *aa = (PyArrayObject *)PyArray_Cast(a, NPY_INTP); PyArrayObject *aa = (PyArrayObject *)PyArray_Cast(a, NPY_INTP);
...@@ -394,7 +431,7 @@ static int %(n)s_prep(int b, int i, int j) { ...@@ -394,7 +431,7 @@ static int %(n)s_prep(int b, int i, int j) {
""" % dict(n=name) """ % dict(n=name)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
o, x, y, xIdx, yIdx, alpha, beta = inputs o, x, y, xIdx, yIdx, alpha = inputs
out = outputs[0] out = outputs[0]
if self.inplace: if self.inplace:
res = """ res = """
...@@ -445,16 +482,16 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1], ...@@ -445,16 +482,16 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
} }
{ {
cublasStatus_t err; cublasStatus_t err;
int str_y = CudaNdarray_HOST_STRIDES(%(y)s)[1]; int str_y = CudaNdarray_HOST_STRIDES(%(y)s)[2];
if (str_y == 0) str_y = 1; if (str_y == 0) str_y = 1;
int str_x = CudaNdarray_HOST_STRIDES(%(x)s)[1]; int str_x = CudaNdarray_HOST_STRIDES(%(x)s)[2];
if (str_x == 0) str_x = 1; if (str_x == 0) str_x = 1;
int str_out = CudaNdarray_HOST_STRIDES(%(out)s)[2]; int str_out = CudaNdarray_HOST_STRIDES(%(out)s)[2];
if (str_out == 0) str_out = 1; if (str_out == 0) str_out = 1;
err = cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_T, err = SgerBatched(handle,
CudaNdarray_HOST_DIMS(%(y)s)[2], CudaNdarray_HOST_DIMS(%(x)s)[2], 1, CudaNdarray_HOST_DIMS(%(y)s)[2], CudaNdarray_HOST_DIMS(%(x)s)[2],
(float *)PyArray_GETPTR1(%(alpha)s, 0), %(name)s_y_list, str_y, (float *)PyArray_GETPTR1(%(alpha)s, 0), %(name)s_y_list, str_y,
%(name)s_x_list, str_x, (float *)PyArray_GETPTR1(%(beta)s, 0), %(name)s_x_list, str_x,
%(name)s_out_list, str_out, %(name)s_out_list, str_out,
CudaNdarray_HOST_DIMS(%(x)s)[0] * CudaNdarray_HOST_DIMS(%(x)s)[0] *
CudaNdarray_HOST_DIMS(%(x)s)[1] * CudaNdarray_HOST_DIMS(%(x)s)[1] *
...@@ -464,10 +501,10 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1], ...@@ -464,10 +501,10 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
%(fail)s %(fail)s
} }
}""" % dict(x=x, y=y, out=out, xIdx=xIdx, yIdx=yIdx, name=name, }""" % dict(x=x, y=y, out=out, xIdx=xIdx, yIdx=yIdx, name=name,
alpha=alpha, beta=beta, fail=sub['fail']) alpha=alpha, fail=sub['fail'])
def c_code_cache_version(self): def c_code_cache_version(self):
return (3,) return (4,)
sparse_block_outer_ss = SparseBlockOuterSS(False) sparse_block_outer_ss = SparseBlockOuterSS(False)
...@@ -537,8 +574,7 @@ GpuElemwise{mul}(lr, SparseBlockOuterSS) -> SparseBlockOuterSS(..., alpha=lr) ...@@ -537,8 +574,7 @@ GpuElemwise{mul}(lr, SparseBlockOuterSS) -> SparseBlockOuterSS(..., alpha=lr)
if lr is None or ger is None: if lr is None or ger is None:
return None return None
alpha = lr * ger.inputs[5] alpha = lr * ger.inputs[5]
return [sparse_block_outer_ss(*(ger.inputs[:5] + return [sparse_block_outer_ss(*(ger.inputs[:5] + [alpha]))]
[alpha, ger.inputs[6]]))]
@opt.register_opt() @opt.register_opt()
@opt.local_optimizer([GpuElemwise]) @opt.local_optimizer([GpuElemwise])
...@@ -554,7 +590,7 @@ GpuElemwise{mul}(lr, SparseBlockOuterSS) -> SparseBlockOuterSS(..., alpha=lr) ...@@ -554,7 +590,7 @@ GpuElemwise{mul}(lr, SparseBlockOuterSS) -> SparseBlockOuterSS(..., alpha=lr)
if ger is None: if ger is None:
return None return None
return [sparse_block_outer_ss(*([W] + ger.inputs[1:5] + return [sparse_block_outer_ss(*([W] + ger.inputs[1:5] +
[-ger.inputs[5], ger.inputs[6]]))] [-ger.inputs[5]]))]
def sparse_block_dot_SS(W, h, inputIdx, b, outputIdx): def sparse_block_dot_SS(W, h, inputIdx, b, outputIdx):
......
Markdown is supported
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to comment