提交 ed25d565 作者: Alexander Matyasko

Add offset for triu and tril kernels

上级 7b09a215
#section kernels #section kernels
#kernel tril_kernel : size, size, *: #kernel tril_kernel : size, size, size, *:
KERNEL void tril_kernel(const ga_size nthreads, const ga_size ncols, KERNEL void tril_kernel(const ga_size nthreads, const ga_size ncols,
GLOBAL_MEM DTYPE_INPUT_0 *a) { const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
a = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)a) + a_off);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; index < nthreads; for (ga_size index = GID_0 * LDIM_0 + LID_0; index < nthreads;
index += LDIM_0 * GDIM_0) { index += LDIM_0 * GDIM_0) {
...@@ -15,10 +16,11 @@ KERNEL void tril_kernel(const ga_size nthreads, const ga_size ncols, ...@@ -15,10 +16,11 @@ KERNEL void tril_kernel(const ga_size nthreads, const ga_size ncols,
} }
} }
#kernel triu_kernel : size, size, *: #kernel triu_kernel : size, size, size, *:
KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols, KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols,
GLOBAL_MEM DTYPE_INPUT_0 *a) { const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
a = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)a) + a_off);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; index < nthreads; for (ga_size index = GID_0 * LDIM_0 + LID_0; index < nthreads;
index += LDIM_0 * GDIM_0) { index += LDIM_0 * GDIM_0) {
...@@ -108,14 +110,14 @@ int APPLY_SPECIFIC(magma_cholesky)(PyGpuArrayObject *A, PyGpuArrayObject **L, ...@@ -108,14 +110,14 @@ int APPLY_SPECIFIC(magma_cholesky)(PyGpuArrayObject *A, PyGpuArrayObject **L,
} }
#ifdef LOWER #ifdef LOWER
res = tril_kernel_scall(1, &n2, 0, n2, N, (*L)->ga.data); res = tril_kernel_scall(1, &n2, 0, n2, N, (*L)->ga.offset, (*L)->ga.data);
if (res != GA_NO_ERROR) { if (res != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuMagmaCholesky: tril_kernel %s.", PyErr_Format(PyExc_RuntimeError, "GpuMagmaCholesky: tril_kernel %s.",
GpuKernel_error(&k_tril_kernel, res)); GpuKernel_error(&k_tril_kernel, res));
goto fail; goto fail;
} }
#else #else
res = triu_kernel_scall(1, &n2, 0, n2, N, (*L)->ga.data); res = triu_kernel_scall(1, &n2, 0, n2, N, (*L)->ga.offset, (*L)->ga.data);
if (res != GA_NO_ERROR) { if (res != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuMagmaCholesky: triu_kernel %s.", PyErr_Format(PyExc_RuntimeError, "GpuMagmaCholesky: triu_kernel %s.",
GpuKernel_error(&k_triu_kernel, res)); GpuKernel_error(&k_triu_kernel, res));
......
#section kernels #section kernels
#kernel triu_kernel : size, size, *: #kernel triu_kernel : size, size, size, *:
KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols, KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols,
GLOBAL_MEM DTYPE_INPUT_0 *a) { const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
a = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)a) + a_off);
// grid stride looping // grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; index < nthreads; for (ga_size index = GID_0 * LDIM_0 + LID_0; index < nthreads;
index += LDIM_0 * GDIM_0) { index += LDIM_0 * GDIM_0) {
...@@ -103,7 +104,7 @@ int APPLY_SPECIFIC(magma_qr)(PyGpuArrayObject *A_, ...@@ -103,7 +104,7 @@ int APPLY_SPECIFIC(magma_qr)(PyGpuArrayObject *A_,
goto fail; goto fail;
} }
n2 = K * N; n2 = K * N;
res = triu_kernel_scall(1, &n2, 0, n2, N, (*R)->ga.data); res = triu_kernel_scall(1, &n2, 0, n2, N, (*R)->ga.offset, (*R)->ga.data);
if (res != GA_NO_ERROR) { if (res != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuMagmaQR: triu_kernel %s.", PyErr_Format(PyExc_RuntimeError, "GpuMagmaQR: triu_kernel %s.",
GpuKernel_error(&k_triu_kernel, res)); GpuKernel_error(&k_triu_kernel, res));
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
请 注册 或者 登录 后发表评论