提交 e8b2f369 authored 作者: Alexander Matyasko's avatar Alexander Matyasko

Add gpu magma qr decomposition

上级 1f9cc65c
......@@ -581,3 +581,58 @@ class GpuMagmaCholesky(CGpuKernelBase):
def infer_shape(self, node, shapes):
return [shapes[0]]
class GpuMagmaQR(CGpuKernelBase):
"""Computes the qr decomposition of a matrix :math:`A` using magma
library.
Parameters
----------
complete : If `False`, returns only r.
"""
params_type = gpu_context_type
__props__ = ('complete',)
def __init__(self, complete=True):
self.complete = complete
COp.__init__(self, ['magma_qr.c'], 'APPLY_SPECIFIC(magma_qr)')
def c_headers(self):
return ['gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/ext_cuda.h',
'gpuarray_helper.h', 'magma.h']
def c_header_dirs(self):
dirs = [os.path.dirname(__file__), pygpu.get_include()]
if config.magma.include_path:
dirs.append(config.magma.include_path)
return dirs
def c_libraries(self):
return ['magma']
def c_lib_dirs(self):
if config.magma.library_path:
return [config.magma.library_path]
return []
def make_node(self, A):
ctx_name = infer_context_name(A)
A = as_gpuarray_variable(A, ctx_name)
A = gpu_contiguous(A)
if A.ndim != 2:
raise LinAlgError("Matrix rank error")
if self.complete:
return theano.Apply(self, [A], [A.type(), A.type()])
else:
return theano.Apply(self, [A], [A.type()])
def get_params(self, node):
return node.inputs[0].type.context
def get_op_params(self):
params = []
if self.complete:
params.append(('COMPLETE', '1'))
return params
#section kernels
#kernel triu_kernel : size, size, *:
KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols,
GLOBAL_MEM DTYPE_INPUT_0 *a) {
// grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; index < nthreads;
index += LDIM_0 * GDIM_0) {
const ga_size ix = index / ncols;
const ga_size iy = index % ncols;
if (ix > iy) {
a[index] = 0.0;
}
}
}
#section init_code
setup_ext_cuda();
#section support_code
static PyGpuArrayObject *pygpu_narrow(PyGpuArrayObject *src, size_t dim,
size_t size) {
PyGpuArrayObject *src_view = pygpu_view(src, Py_None);
src_view->ga.dimensions[dim] = size;
return pygpu_copy(src_view, GA_C_ORDER);
}
#section support_code_struct
int APPLY_SPECIFIC(magma_qr)(PyGpuArrayObject *A_,
#ifdef COMPLETE
PyGpuArrayObject **Q,
#endif
PyGpuArrayObject **R,
PyGpuContextObject *c) {
PyGpuArrayObject *A = NULL;
magma_int_t M, N, K, nb, ldwork;
size_t n2;
float *tau_data = NULL;
gpudata *work_data = NULL;
int res = -1, info;
A = A_;
if (A->ga.typecode != GA_FLOAT) {
PyErr_SetString(PyExc_TypeError,
"GpuMagmaQR: Unsupported data type");
return -1;
}
if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
PyErr_SetString(PyExc_ValueError,
"GpuMagmaQR: requires data to be C-contiguous");
return -1;
}
if (PyGpuArray_NDIM(A) != 2) {
PyErr_SetString(PyExc_ValueError, "GpuMagmaQR: matrix rank error");
return -1;
}
A = pygpu_copy(A_, GA_F_ORDER);
if (A == NULL) {
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaQR: failed to change to column-major order");
return -1;
}
// This is early to match the exit() in the fail label.
cuda_enter(c->ctx);
magma_init();
// magma matrix qr
M = PyGpuArray_DIM(A, 0);
N = PyGpuArray_DIM(A, 1);
K = std::min(M, N);
if (MAGMA_SUCCESS != magma_smalloc_pinned(&tau_data, N * N)) {
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaQR: failed to allocate working memory");
goto fail;
}
nb = magma_get_sgeqrf_nb(M, N);
ldwork = (2 * K + magma_roundup(N, 32)) * nb;
work_data = gpudata_alloc(c->ctx, ldwork * sizeof(float), NULL, 0, NULL);
if (work_data == NULL) {
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaQR: failed to allocate working memory");
goto fail;
}
// compute R
magma_sgeqrf2_gpu(M, N, (float *)PyGpuArray_DEV_DATA(A), M, tau_data, &info);
if (info != 0) {
PyErr_Format(
PyExc_RuntimeError,
"GpuMagmaQR: magma_sgeqrf2_gpu argument %d has an illegal value", -info);
goto fail;
}
*R = pygpu_narrow(A, 0, K);
if (*R == NULL) {
PyErr_SetString(PyExc_RuntimeError, "GpuMagmaQR: failed to narrow array");
goto fail;
}
n2 = K * N;
res = triu_kernel_scall(1, &n2, 0, n2, N, (*R)->ga.data);
if (res != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuMagmaQR: triu_kernel %s.",
GpuKernel_error(&k_triu_kernel, res));
goto fail;
}
#ifdef COMPLETE
// compute Q
Py_XDECREF(A);
A = pygpu_copy(A_, GA_F_ORDER);
if (A == NULL) {
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaQR: failed to change to column-major order");
return -1;
}
magma_sgeqrf_gpu(M, N, (float *)PyGpuArray_DEV_DATA(A), M, tau_data,
*(float **)work_data, &info);
if (info != 0) {
PyErr_Format(
PyExc_RuntimeError,
"GpuMagmaQR: magma_sgeqrf_gpu argument %d has an illegal value", -info);
goto fail;
}
magma_sorgqr_gpu(M, K, K, (float *)PyGpuArray_DEV_DATA(A), M, tau_data,
*(float **)work_data, nb, &info);
if (info != 0) {
PyErr_Format(
PyExc_RuntimeError,
"GpuMagmaQR: magma_sorgqr_gpu argument %d has an illegal value", -info);
goto fail;
}
*Q = pygpu_narrow(A, 1, K);
if (*Q == NULL) {
PyErr_SetString(PyExc_RuntimeError, "GpuMagmaQR: failed to narrow array");
goto fail;
}
#endif
res = 0;
fail:
if (tau_data != NULL)
magma_free_pinned(tau_data);
if (work_data != NULL)
gpudata_release(work_data);
magma_finalize();
cuda_exit(c->ctx);
return res;
}
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论