Commit 519706fa authored by: Alexander Matyasko

Add magma svd operation

Parent 5ac884f6
......@@ -348,6 +348,72 @@ def gpu_cholesky(A, lower=True):
return GpuCholesky(lower)(A)
class GpuMagmaSVD(COp):
    """Computes the SVD of a matrix :math:`A` using the magma library.

    The op produces three outputs: ``U``, the vector of singular values
    ``S``, and ``VT``.  The C implementation lives in ``magma_svd.c``.
    """

    __props__ = ('full_matrices', 'compute_uv')
    params_type = gpu_context_type

    def __init__(self, full_matrices=True, compute_uv=True):
        self.full_matrices = full_matrices
        self.compute_uv = compute_uv
        COp.__init__(self, ['magma_svd.c'], 'APPLY_SPECIFIC(magma_svd)')

    def c_headers(self):
        # Headers required by the C implementation (pygpu + magma).
        return ['gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/ext_cuda.h',
                'gpuarray_helper.h', 'magma.h']

    def c_header_dirs(self):
        return [os.path.dirname(__file__), pygpu.get_include()]

    def c_libraries(self):
        return ['magma']

    def make_node(self, A):
        # Only 2-D inputs are supported.
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        # Outputs: U (matrix), S (1-D vector of singular values), VT (matrix).
        s_type = GpuArrayType(A.dtype, broadcastable=[False],
                              context_name=ctx_name)
        return theano.Apply(self, [A], [A.type(), s_type(), A.type()])

    def get_params(self, node):
        return node.inputs[0].type.context

    def get_op_params(self):
        # Exposed to the C code as the COMPUTE_UV / FULL_MATRICES defines.
        return [('COMPUTE_UV', int(self.compute_uv)),
                ('FULL_MATRICES', int(self.full_matrices))]
def gpu_svd(a, full_matrices=1, compute_uv=1):
    """
    This function performs the SVD on GPU.

    Parameters
    ----------
    a : matrix
        The matrix to decompose.
    full_matrices : bool, optional
        If True (default), u and v have the shapes (M, M) and (N, N),
        respectively.
        Otherwise, the shapes are (M, K) and (K, N), respectively,
        where K = min(M, N).
    compute_uv : bool, optional
        Whether or not to compute u and v in addition to s.
        True by default.

    Returns
    -------
    U, S, VT : matrices
        The left singular vectors, the singular values and the (transposed)
        right singular vectors, as produced by :class:`GpuMagmaSVD`.
    """
    return GpuMagmaSVD(full_matrices, compute_uv)(a)
class GpuMagmaMatrixInverse(COp):
"""Computes the inverse of a matrix :math:`A` using magma library.
"""
......
#section init_code

setup_ext_cuda();

#section support_code_struct

/* Compute the singular value decomposition of the single-precision matrix
 * `A` on the GPU using MAGMA's CPU-interface magma_sgesvd.
 *
 * Outputs (allocated here): *U, *S (vector of singular values), *VT.
 * Returns 0 on success, non-zero with a Python exception set on failure.
 *
 * COMPUTE_UV and FULL_MATRICES are compile-time defines supplied by
 * GpuMagmaSVD.get_op_params().
 *
 * NOTE(review): `A` is required to be C-contiguous (row-major) while
 * magma_sgesvd expects column-major data, and the U/VT outputs are created
 * in Fortran order to avoid a transpose -- confirm the intended layout
 * convention against the callers/tests.
 */
int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A, PyGpuArrayObject **U,
                              PyGpuArrayObject **S, PyGpuArrayObject **VT,
                              PyGpuContextObject *c) {
  magma_int_t M, N, K, ldu, ldv, M_U, N_VT, info;
  magma_vec_t jobu, jobv;
  size_t s_dims[1], u_dims[2], vt_dims[2];
  float *a_data = NULL, *s_data = NULL, *u_data = NULL, *vt_data = NULL,
        *work = NULL;
  float dummy[1];
  int res = -1, lwork;

  if (A->ga.typecode != GA_FLOAT) {
    /* Fixed: message used to say "GpuMagmaMatrixInverse" (copy-paste). */
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaSVD: Unsupported data type");
    return -1;
  }

  // This is early to match the exit() in the fail label.
  cuda_enter(c->ctx);
  magma_init();

  if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaSVD: requires data to be C-contiguous");
    /* Fixed: was `return 1`, which skipped magma_finalize()/cuda_exit(). */
    goto fail;
  }
  if (PyGpuArray_NDIM(A) != 2) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaSVD: matrix rank error");
    goto fail;
  }

  // magma matrix svd
  M = PyGpuArray_DIM(A, 0);
  N = PyGpuArray_DIM(A, 1);
  K = M < N ? M : N;

  /* magma_smalloc_pinned allocates page-locked *host* memory, so the
   * device<->staging copies below use DeviceToHost / HostToDevice
   * (previously DeviceToDevice, which mismatches the pointer kinds). */
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&a_data, M * N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  cudaMemcpy(a_data, PyGpuArray_DEV_DATA(A), M * N * sizeof(float),
             cudaMemcpyDeviceToHost);

  if (MAGMA_SUCCESS != magma_smalloc_pinned(&s_data, K)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }

  if (COMPUTE_UV) {
    if (FULL_MATRICES) {
      jobu = MagmaAllVec;
      jobv = MagmaAllVec;
    }
    else {
      jobu = MagmaSomeVec;
      jobv = MagmaSomeVec;
    }
    M_U  = (jobu == MagmaAllVec ? M : K);  // columns of U
    N_VT = (jobv == MagmaAllVec ? N : K);  // rows of VT
    ldu = M;
    ldv = N_VT;
    if (MAGMA_SUCCESS != magma_smalloc_pinned(&u_data, ldu * M_U)) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaSVD: failed to allocate memory");
      goto fail;
    }
    if (MAGMA_SUCCESS != magma_smalloc_pinned(&vt_data, ldv * N)) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaSVD: failed to allocate memory");
      goto fail;
    }
  }
  else {
    jobu = MagmaNoVec;
    jobv = MagmaNoVec;
    /* Fixed: M_U/N_VT were left uninitialized on this path although they
     * are used below to size the (unfilled) U/VT outputs. */
    M_U  = M;
    N_VT = N;
    ldu = M;
    ldv = N;
  }

  // query for workspace size
  magma_sgesvd(jobu, jobv, M, N, NULL, M, NULL, NULL, ldu, NULL, ldv,
               dummy, -1, &info);
  lwork = (magma_int_t) MAGMA_S_REAL(dummy[0]);
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&work, lwork)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate working memory");
    goto fail;
  }

  // compute svd
  magma_sgesvd(jobu, jobv, M, N, a_data, M, s_data,
               u_data, ldu, vt_data, ldv, work, lwork, &info);
  if (info > 0) {
    /* Fixed: messages named magma_sgesvd_gpu, but magma_sgesvd is called. */
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaSVD: magma_sgesvd %d superdiagonals failed to converge",
        info);
    goto fail;
  } else if (info < 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaSVD: magma_sgesvd argument %d has an illegal value", -info);
    goto fail;
  }

  s_dims[0] = K;
  if (theano_prep_output(S, 1, s_dims, A->ga.typecode, GA_C_ORDER, c) != 0){
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  cudaMemcpy(PyGpuArray_DEV_DATA(*S), s_data, K * sizeof(float),
             cudaMemcpyHostToDevice);

  /* Fixed: u_dims[1] was `ldu` (== M), which mis-sized U as (M, M) and
   * over-read the M*K staging buffer when FULL_MATRICES is 0. */
  u_dims[0] = M; u_dims[1] = M_U;
  // choose fortran order to avoid transpose
  if (theano_prep_output(U, 2, u_dims, A->ga.typecode, GA_F_ORDER, c) != 0){
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  if (COMPUTE_UV) {
    /* Guarded: u_data is NULL when the vectors were not computed. */
    cudaMemcpy(PyGpuArray_DEV_DATA(*U), u_data, M * M_U * sizeof(float),
               cudaMemcpyHostToDevice);
  }

  vt_dims[0] = N_VT; vt_dims[1] = N;
  // choose fortran order to avoid transpose
  if (theano_prep_output(VT, 2, vt_dims, A->ga.typecode, GA_F_ORDER, c) != 0){
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  if (COMPUTE_UV) {
    cudaMemcpy(PyGpuArray_DEV_DATA(*VT), vt_data, N_VT * N * sizeof(float),
               cudaMemcpyHostToDevice);
  }

  res = 0;
fail:
  if (a_data != NULL)
    magma_free_pinned(a_data);
  if (s_data != NULL)
    magma_free_pinned(s_data);
  if (u_data != NULL)
    magma_free_pinned(u_data);
  if (vt_data != NULL)
    magma_free_pinned(vt_data);
  if (work != NULL)
    magma_free_pinned(work);
  magma_finalize();
  cuda_exit(c->ctx);
  return res;
}
......@@ -72,7 +72,7 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
from .reduction import GpuMaxAndArgmax
from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky,
cusolver_available, GpuMagmaMatrixInverse)
cusolver_available, GpuMagmaMatrixInverse, GpuMagmaSVD)
_logger = logging.getLogger("theano.gpuarray.opt")
......@@ -2013,6 +2013,15 @@ def local_gpu_matrix_inverse(op, context_name, inputs, outputs):
return GpuMagmaMatrixInverse()
@register_opt('magma', 'fast_compile')
@op_lifter([nlinalg.SVD])
@register_opt2([theano.tensor.nlinalg.SVD], 'magma', 'fast_compile')
def local_gpu_svd(op, context_name, inputs, outputs):
    # Lift a host nlinalg.SVD node to the magma-backed GPU op, preserving
    # the op's full_matrices/compute_uv configuration.
    # Fixed: removed a leftover `import pudb; pudb.set_trace()` debugger
    # breakpoint that would halt every compilation using this optimizer.
    return GpuMagmaSVD(full_matrices=op.full_matrices,
                       compute_uv=op.compute_uv)
# Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled.
optdb.register('gpua_scanOp_make_inplace',
......
......@@ -5,13 +5,13 @@ import numpy as np
import theano
from theano.tests import unittest_tools as utt
from .config import mode_with_gpu
from .config import mode_with_gpu, mode_without_gpu
from numpy.linalg.linalg import LinAlgError
# Skip tests if cuda_ndarray is not available.
from theano.gpuarray.linalg import (cusolver_available, gpu_solve, GpuCholesky,
gpu_matrix_inverse)
gpu_matrix_inverse, gpu_svd)
class TestCusolver(unittest.TestCase):
......@@ -201,8 +201,30 @@ class TestMagma(unittest.TestCase):
def test_gpu_matrix_inverse(self):
    """Check that the magma GPU inverse satisfies A^-1 @ A ~= I."""
    A = theano.tensor.fmatrix("A")
    # Fixed: the function was compiled twice (a dead duplicate assignment
    # to `fn` without the 'magma' optimizations preceded this one).
    fn = theano.function([A], gpu_matrix_inverse(A),
                         mode=mode_with_gpu.including('magma'))
    N = 1000
    A_val = np.random.rand(N, N).astype(np.float32)
    A_val_inv = fn(A_val)
    utt.assert_allclose(np.dot(A_val_inv, A_val), np.eye(N), atol=1e-3)
def test_gpu_svd(self):
    """Exercise gpu_svd in full, reduced and values-only modes."""
    A = theano.tensor.fmatrix("A")
    N, M = 50, 100
    A_val = np.random.rand(M, N).astype(np.float32)

    # full_matrices=True: U is (M, M) and VT is (N, N), both orthogonal.
    f = theano.function([A], gpu_svd(A),
                        mode=mode_with_gpu.including('magma'))
    U, S, VT = f(A_val)
    utt.assert_allclose(np.dot(U.T, U), np.eye(M))
    utt.assert_allclose(np.dot(VT.T, VT), np.eye(N))
    # Reconstruct A = U @ diag(S) @ VT.
    S_m = np.zeros_like(A_val)
    np.fill_diagonal(S_m, S)
    utt.assert_allclose(np.dot(np.dot(U, S_m), VT), A_val)

    # full_matrices=False: U is (M, K) and VT is (K, N), K = min(M, N).
    f = theano.function([A], gpu_svd(A, full_matrices=False),
                        mode=mode_with_gpu.including('magma'))
    U, _, VT = f(A_val)
    utt.assert_allclose(np.dot(U.T, U), np.eye(N))
    utt.assert_allclose(np.dot(VT.T, VT), np.eye(N))

    # compute_uv=False: compare singular values against the CPU op.
    f = theano.function([A],
                        theano.tensor.nlinalg.svd(A, compute_uv=False),
                        mode=mode_without_gpu)
    f2 = theano.function([A], gpu_svd(A, compute_uv=False),
                         mode=mode_with_gpu.including('magma'))
    # Fixed: the CPU graph has a single output (the s vector), so
    # `f(A_val)[1]` selected one scalar singular value instead of the
    # whole array; the GPU op always returns (U, S, VT), hence [1] there.
    utt.assert_allclose(f(A_val), f2(A_val)[1], 1)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论