提交 e76d05d6 authored 作者: abergeron's avatar abergeron 提交者: GitHub

Merge pull request #5914 from aam-at/magma_ops

Magma gpu QR/Cholesky/Eigh
差异被折叠。
#section kernels

#kernel tril_kernel : size, size, size, *:
/* Zero the strictly upper-triangular part of a row-major N-column matrix
 * stored at byte offset `a_off` inside buffer `a`, keeping the lower
 * triangle (row >= col) intact.  One thread per element, grid-stride loop
 * so any launch configuration covers all `nthreads` elements.
 * FIX: row/column indices are ga_size (as in the sibling triu_kernel),
 * not `unsigned int`, which silently truncated for very large matrices. */
KERNEL void tril_kernel(const ga_size nthreads, const ga_size ncols,
                        const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
  a = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)a) + a_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0; index < nthreads;
       index += LDIM_0 * GDIM_0) {
    const ga_size ix = index / ncols;  // row
    const ga_size iy = index % ncols;  // column
    if (ix < iy) {
      a[index] = 0.0;
    }
  }
}
#kernel triu_kernel : size, size, size, *:
/* Zero the strictly lower-triangular part of a row-major N-column matrix
 * stored at byte offset `a_off` inside buffer `a`, keeping the upper
 * triangle (row <= col) intact.  Grid-stride loop over all elements.
 * FIX: row/column indices are ga_size (matching the other copy of this
 * kernel in this changeset), not `unsigned int`, which silently truncated
 * for very large matrices. */
KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols,
                        const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
  a = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)a) + a_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0; index < nthreads;
       index += LDIM_0 * GDIM_0) {
    const ga_size ix = index / ncols;  // row
    const ga_size iy = index % ncols;  // column
    if (ix > iy) {
      a[index] = 0.0;
    }
  }
}
#section init_code
setup_ext_cuda();
#section support_code_struct
/* Cholesky factorization of a square float32 matrix via MAGMA's
 * magma_spotrf_gpu.
 *
 * A:      input matrix (must be GA_FLOAT, 2-D, square, C-contiguous).
 * L:      output; either aliases A (params->inplace) or is a fresh copy.
 * params: provides the gpu context and the `lower`/`inplace` flags.
 *
 * Returns 0 on success, non-zero on failure (with a Python exception set).
 */
int APPLY_SPECIFIC(magma_cholesky)(PyGpuArrayObject *A, PyGpuArrayObject **L,
                                   PARAMS_TYPE *params) {
  const size_t *dims;
  size_t N, n2;
  magma_uplo_t ul;
  int res = -1, info;
  if (A->ga.typecode != GA_FLOAT) {
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaCholesky: unsupported data type");
    return -1;
  }
  // This is early to match the exit() in the fail label.
  cuda_enter(params->context->ctx);
  if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaCholesky: requires data to be C-contiguous");
    goto fail;
  }
  if (PyGpuArray_NDIM(A) != 2) {
    PyErr_SetString(PyExc_ValueError, "GpuMagmaCholesky: matrix rank error");
    goto fail;
  }
  dims = PyGpuArray_DIMS(A);
  if (dims[0] != dims[1]) {
    PyErr_SetString(PyExc_ValueError, "GpuMagmaCholesky: matrix is not square");
    goto fail;
  }
  if (params->inplace) {
    // Output aliases the input; take a new reference for the output slot.
    Py_XDECREF(*L);
    *L = A;
    Py_INCREF(*L);
  } else {
    // theano_try_copy reuses *L's storage when possible, else allocates.
    *L = theano_try_copy(*L, A);
    if (*L == NULL) {
      PyErr_SetString(
          PyExc_RuntimeError,
          "GpuMagmaCholesky: failed to allocate memory for the output");
      goto fail;
    }
  }
  // magma matrix cholesky
  N = dims[0];
  n2 = N * N;  // total element count, used by the triangular-zeroing kernel
  // Magma requires column-major order for the matrix A. Instead of changing
  // matrix order which requires copying data, we can compute cholesky
  // decomposition where we change parameters lower to upper and upper to
  // lower.
  if (params->lower) {
    ul = MagmaUpper;
  }
  else {
    ul = MagmaLower;
  }
  magma_spotrf_gpu(ul, N, (float *)PyGpuArray_DEV_DATA(*L), N, &info);
  if (info > 0) {
    // info > 0: the matrix is not positive definite at leading minor `info`.
    PyErr_Format(PyExc_RuntimeError, "GpuMagmaCholesky: the leading minor of "
                 "order %d is not positive definite",
                 info);
    goto fail;
  } else if (info < 0) {
    // info < 0: argument -info to magma_spotrf_gpu was invalid.
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaCholesky: magma_spotrf_gpu argument %d has an illegal value",
        -info);
    goto fail;
  }
  // MAGMA only writes the requested triangle; zero out the other half so
  // the returned array is a proper (lower or upper) triangular matrix.
  if (params->lower) {
    res = tril_kernel_scall(1, &n2, 0, n2, N, (*L)->ga.offset, (*L)->ga.data);
    if (res != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "GpuMagmaCholesky: tril_kernel %s.",
                   GpuKernel_error(&k_tril_kernel, res));
      goto fail;
    }
  } else {
    res = triu_kernel_scall(1, &n2, 0, n2, N, (*L)->ga.offset, (*L)->ga.data);
    if (res != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "GpuMagmaCholesky: triu_kernel %s.",
                   GpuKernel_error(&k_triu_kernel, res));
      goto fail;
    }
  }
  res = 0;
fail:
  cuda_exit(params->context->ctx);
  return res;
}
#section init_code
setup_ext_cuda();
#section support_code_struct
/* Eigen-decomposition of a square symmetric float32 matrix via MAGMA's
 * magma_ssyevd_gpu.
 *
 * A_:     input matrix (GA_FLOAT, 2-D, square, C-contiguous).
 * D:      output eigenvalues (1-D of length N, ascending per ssyevd).
 * V:      output eigenvectors; only written when params->compute_v
 *         (may be NULL otherwise).
 * params: gpu context plus `lower` / `compute_v` flags.
 *
 * Returns 0 on success, non-zero on failure with a Python exception set.
 *
 * FIX: the error path after pygpu_copy() now uses `goto fail` instead of
 * `return -1`, so cuda_exit() is always paired with the cuda_enter() above.
 */
int APPLY_SPECIFIC(magma_eigh)(PyGpuArrayObject *A_,
                               PyGpuArrayObject **D,
                               PyGpuArrayObject **V, // may be NULL
                               PARAMS_TYPE *params) {
  PyGpuArrayObject *A = NULL;   // owned column-major working copy
  magma_int_t N, liwork, *iwork_data = NULL;
  size_t d_dims[1];
  magma_uplo_t uplo;
  magma_vec_t jobz;
  float *w_data = NULL, *wA_data = NULL, *work_data = NULL, lwork;
  int res = -1, info;
  if (A_->ga.typecode != GA_FLOAT) {
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaEigh: Unsupported data type");
    return -1;
  }
  // This is early to match the exit() in the fail label.
  cuda_enter(params->context->ctx);
  if (!GpuArray_IS_C_CONTIGUOUS(&A_->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaEigh: requires data to be C-contiguous");
    goto fail;
  }
  if (PyGpuArray_NDIM(A_) != 2) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaEigh: matrix rank error");
    goto fail;
  }
  if (PyGpuArray_DIM(A_, 0) != PyGpuArray_DIM(A_, 1)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaEigh: matrix is not square");
    goto fail;
  }
  // MAGMA expects column-major data; work on a Fortran-ordered copy.
  A = pygpu_copy(A_, GA_F_ORDER);
  if (A == NULL) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to change to column-major order");
    goto fail;  // was `return -1`: skipped cuda_exit()
  }
  // magma matrix eigen decomposition of a symmetric matrix
  N = PyGpuArray_DIM(A, 0);
  if (params->lower) {
    uplo = MagmaLower;
  } else {
    uplo = MagmaUpper;
  }
  if (params->compute_v) {
    jobz = MagmaVec;
  } else {
    jobz = MagmaNoVec;
  }
  // w_data: eigenvalues; wA_data: N*N workspace. Both pinned host memory.
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&w_data, N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&wA_data, N * N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }
  // Workspace-size query: lwork = -1 / liwork = -1 makes ssyevd return the
  // optimal sizes in `lwork` and `liwork` without computing anything.
  magma_ssyevd_gpu(jobz, uplo, N, NULL, N, NULL, NULL, N, &lwork,
                   -1, &liwork, -1, &info);
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&work_data, (size_t)lwork)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }
  if (MAGMA_SUCCESS != magma_imalloc_cpu(&iwork_data, liwork)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }
  magma_ssyevd_gpu(jobz, uplo, N, (float *)PyGpuArray_DEV_DATA(A), N, w_data,
                   wA_data, N, work_data, (size_t)lwork, iwork_data, liwork,
                   &info);
  if (info > 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaEigh: %d off-diagonal elements of an didn't converge to zero",
        info);
    goto fail;
  } else if (info < 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaEigh: magma_ssyevd_gpu argument %d has an illegal value", -info);
    goto fail;
  }
  d_dims[0] = N;
  if (theano_prep_output(D, 1, d_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate memory for the output");
    goto fail;
  }
  // w_data is pinned host memory; NOTE(review): the copy kind is
  // DeviceToDevice as in the original — this relies on unified addressing
  // treating pinned memory as device-accessible. Confirm on target setups.
  cudaMemcpy(PyGpuArray_DEV_DATA(*D), w_data, N * sizeof(float),
             cudaMemcpyDeviceToDevice);
  if (params->compute_v) {
    // Eigenvectors are the (column-major) content of A after ssyevd.
    *V = theano_try_copy(*V, A);
    if (*V == NULL) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaEigh: failed to allocate memory for the output");
      goto fail;
    }
  }
  res = 0;
fail:
  if (w_data != NULL)
    magma_free_pinned(w_data);
  if (wA_data != NULL)
    magma_free_pinned(wA_data);
  if (work_data != NULL)
    magma_free_pinned(work_data);
  if (iwork_data != NULL)
    magma_free_cpu(iwork_data);
  Py_XDECREF(A);
  cuda_exit(params->context->ctx);
  return res;
}
...@@ -5,7 +5,7 @@ setup_ext_cuda(); ...@@ -5,7 +5,7 @@ setup_ext_cuda();
#section support_code_struct #section support_code_struct
int APPLY_SPECIFIC(magma_inv)(PyGpuArrayObject *A, PyGpuArrayObject **A_inv, int APPLY_SPECIFIC(magma_inv)(PyGpuArrayObject *A, PyGpuArrayObject **A_inv,
PARAMS_TYPE* params) { PARAMS_TYPE *params) {
const size_t *dims; const size_t *dims;
magma_int_t N, ldwork, info; magma_int_t N, ldwork, info;
magma_int_t *piv = NULL; magma_int_t *piv = NULL;
...@@ -20,12 +20,11 @@ int APPLY_SPECIFIC(magma_inv)(PyGpuArrayObject *A, PyGpuArrayObject **A_inv, ...@@ -20,12 +20,11 @@ int APPLY_SPECIFIC(magma_inv)(PyGpuArrayObject *A, PyGpuArrayObject **A_inv,
// This is early to match the exit() in the fail label. // This is early to match the exit() in the fail label.
cuda_enter(params->context->ctx); cuda_enter(params->context->ctx);
magma_init();
if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) { if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"GpuMagmaMatrixInverse: requires data to be C-contiguous"); "GpuMagmaMatrixInverse: requires data to be C-contiguous");
return 1; goto fail;
} }
if (PyGpuArray_NDIM(A) != 2) { if (PyGpuArray_NDIM(A) != 2) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
...@@ -93,7 +92,6 @@ fail: ...@@ -93,7 +92,6 @@ fail:
magma_free(piv); magma_free(piv);
if (dwork != NULL) if (dwork != NULL)
gpudata_release(dwork); gpudata_release(dwork);
magma_finalize();
cuda_exit(params->context->ctx); cuda_exit(params->context->ctx);
return res; return res;
} }
#section kernels

#kernel triu_kernel : size, size, size, *:
/* Zero the strictly lower-triangular part (row > col) of a row-major
 * N-column matrix stored at byte offset `a_off` inside buffer `a`.
 * One thread per element; the grid-stride loop makes the kernel correct
 * for any launch configuration covering fewer than `nthreads` threads. */
KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols,
                        const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
  // Rebase the pointer by the array's byte offset within its buffer.
  a = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)a) + a_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0; index < nthreads;
       index += LDIM_0 * GDIM_0) {
    const ga_size ix = index / ncols;  // row
    const ga_size iy = index % ncols;  // column
    if (ix > iy) {
      a[index] = 0.0;
    }
  }
}
#section init_code
setup_ext_cuda();
#section support_code
/* Return a C-ordered copy of `src` with dimension `dim` narrowed to `size`
 * (size must be <= the current extent; only the leading `size` entries
 * along `dim` are kept).  Returns a new reference, or NULL with a Python
 * exception set on failure.
 *
 * FIX: the temporary view returned by pygpu_view() was never released
 * (reference leak) and was dereferenced without a NULL check. */
static PyGpuArrayObject *pygpu_narrow(PyGpuArrayObject *src, size_t dim,
                                      size_t size) {
  PyGpuArrayObject *res;
  PyGpuArrayObject *src_view = pygpu_view(src, Py_None);
  if (src_view == NULL)
    return NULL;
  // Shrinking a dimension of a view only changes metadata; fix the
  // contiguity flags afterwards so the copy below sees a valid array.
  src_view->ga.dimensions[dim] = size;
  GpuArray_fix_flags(&src_view->ga);
  res = pygpu_copy(src_view, GA_C_ORDER);
  Py_DECREF(src_view);
  return res;
}
#section support_code_struct
/* QR decomposition of an M x N float32 matrix via MAGMA.
 *
 * A_:     input matrix (GA_FLOAT, 2-D, C-contiguous).
 * R:      output upper-triangular factor, K x N with K = min(M, N).
 * Q:      output orthonormal factor, M x K; only written when
 *         params->complete (may be NULL otherwise).
 *
 * Returns 0 on success, non-zero on failure with a Python exception set.
 *
 * FIXES vs. the previous version:
 *  - A is NULL until pygpu_copy() succeeds.  Previously `A = A_` meant an
 *    early `goto fail` called Py_XDECREF on a *borrowed* reference to the
 *    caller's input, corrupting its refcount.
 *  - The pygpu_copy() failure inside the `complete` branch now does
 *    `goto fail` instead of `return -1`, which leaked tau_data/work_data
 *    and skipped cuda_exit().
 */
int APPLY_SPECIFIC(magma_qr)(PyGpuArrayObject *A_,
                             PyGpuArrayObject **R,
                             PyGpuArrayObject **Q, // may be NULL
                             PARAMS_TYPE* params) {
  PyGpuArrayObject *A = NULL;  // owned column-major working copy only
  magma_int_t M, N, K, nb, ldwork;
  size_t n2;
  float *tau_data = NULL;
  gpudata *work_data = NULL;
  int res = -1, info;
  if (A_->ga.typecode != GA_FLOAT) {
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaQR: Unsupported data type");
    return -1;
  }
  // This is early to match the exit() in the fail label.
  cuda_enter(params->context->ctx);
  if (!GpuArray_IS_C_CONTIGUOUS(&A_->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaQR: requires data to be C-contiguous");
    goto fail;
  }
  if (PyGpuArray_NDIM(A_) != 2) {
    PyErr_SetString(PyExc_ValueError, "GpuMagmaQR: matrix rank error");
    goto fail;
  }
  // MAGMA expects column-major data; work on a Fortran-ordered copy.
  A = pygpu_copy(A_, GA_F_ORDER);
  if (A == NULL) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaQR: failed to change to column-major order");
    goto fail;
  }
  // magma matrix qr
  M = PyGpuArray_DIM(A, 0);
  N = PyGpuArray_DIM(A, 1);
  K = M < N ? M : N;
  // tau holds the K reflector scalars; N*N matches the original
  // allocation (over-sized but harmless).
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&tau_data, N * N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaQR: failed to allocate working memory");
    goto fail;
  }
  nb = magma_get_sgeqrf_nb(M, N);
  ldwork = (2 * K + magma_roundup(N, 32)) * nb;
  work_data = gpudata_alloc(params->context->ctx, ldwork * sizeof(float), NULL, 0, NULL);
  if (work_data == NULL) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaQR: failed to allocate working memory");
    goto fail;
  }
  // compute R
  magma_sgeqrf2_gpu(M, N, (float *)PyGpuArray_DEV_DATA(A), M, tau_data, &info);
  if (info != 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaQR: magma_sgeqrf2_gpu argument %d has an illegal value", -info);
    goto fail;
  }
  // Keep only the first K rows: R is K x N.
  *R = pygpu_narrow(A, 0, K);
  if (*R == NULL) {
    PyErr_SetString(PyExc_RuntimeError, "GpuMagmaQR: failed to narrow array");
    goto fail;
  }
  // Zero the sub-diagonal entries that geqrf left as reflector data.
  n2 = K * N;
  res = triu_kernel_scall(1, &n2, 0, n2, N, (*R)->ga.offset, (*R)->ga.data);
  if (res != GA_NO_ERROR) {
    PyErr_Format(PyExc_RuntimeError, "GpuMagmaQR: triu_kernel %s.",
                 GpuKernel_error(&k_triu_kernel, res));
    goto fail;
  }
  if (params->complete) {
    // compute Q: redo the factorization on a fresh copy, then build Q
    // explicitly with sorgqr.
    Py_XDECREF(A);
    A = pygpu_copy(A_, GA_F_ORDER);
    if (A == NULL) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaQR: failed to change to column-major order");
      goto fail;  // was `return -1`: leaked workspace, skipped cuda_exit()
    }
    // NOTE(review): `*(float **)work_data` assumes the device pointer is
    // the first member of gpudata, as in the original code — confirm
    // against the libgpuarray backend in use.
    magma_sgeqrf_gpu(M, N, (float *)PyGpuArray_DEV_DATA(A), M, tau_data,
                     *(float **)work_data, &info);
    if (info != 0) {
      PyErr_Format(
          PyExc_RuntimeError,
          "GpuMagmaQR: magma_sgeqrf_gpu argument %d has an illegal value", -info);
      goto fail;
    }
    magma_sorgqr_gpu(M, K, K, (float *)PyGpuArray_DEV_DATA(A), M, tau_data,
                     *(float **)work_data, nb, &info);
    if (info != 0) {
      PyErr_Format(
          PyExc_RuntimeError,
          "GpuMagmaQR: magma_sorgqr_gpu argument %d has an illegal value", -info);
      goto fail;
    }
    // Keep only the first K columns: Q is M x K.
    *Q = pygpu_narrow(A, 1, K);
    if (*Q == NULL) {
      PyErr_SetString(PyExc_RuntimeError, "GpuMagmaQR: failed to narrow array");
      goto fail;
    }
  }
  res = 0;
fail:
  if (tau_data != NULL)
    magma_free_pinned(tau_data);
  if (work_data != NULL)
    gpudata_release(work_data);
  Py_XDECREF(A);
  cuda_exit(params->context->ctx);
  return res;
}
...@@ -8,7 +8,7 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A, ...@@ -8,7 +8,7 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
PyGpuArrayObject **S, PyGpuArrayObject **S,
PyGpuArrayObject **U, // may be NULL PyGpuArrayObject **U, // may be NULL
PyGpuArrayObject **VT, // may be NULL PyGpuArrayObject **VT, // may be NULL
PARAMS_TYPE* params) { PARAMS_TYPE *params) {
bool compute_uv = (U != NULL); bool compute_uv = (U != NULL);
magma_int_t *iwork = NULL, iunused[1]; magma_int_t *iwork = NULL, iunused[1];
magma_int_t M, N, K, ldu, ldv, M_U, N_VT, info; magma_int_t M, N, K, ldu, ldv, M_U, N_VT, info;
...@@ -21,22 +21,21 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A, ...@@ -21,22 +21,21 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
if (A->ga.typecode != GA_FLOAT) { if (A->ga.typecode != GA_FLOAT) {
PyErr_SetString(PyExc_TypeError, PyErr_SetString(PyExc_TypeError,
"GpuMagmaMatrixInverse: Unsupported data type"); "GpuMagmaSVD: Unsupported data type");
return -1; return -1;
} }
// This is early to match the exit() in the fail label. // This is early to match the exit() in the fail label.
cuda_enter(params->context->ctx); cuda_enter(params->context->ctx);
magma_init();
if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) { if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"GpuMagmaMatrixInverse: requires data to be C-contiguous"); "GpuMagmaSVD: requires data to be C-contiguous");
return 1; goto fail;
} }
if (PyGpuArray_NDIM(A) != 2) { if (PyGpuArray_NDIM(A) != 2) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"GpuMagmaMatrixInverse: matrix rank error"); "GpuMagmaSVD: matrix rank error");
goto fail; goto fail;
} }
...@@ -44,7 +43,7 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A, ...@@ -44,7 +43,7 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
// reverse dimensions because MAGMA expects column-major matrices: // reverse dimensions because MAGMA expects column-major matrices:
M = PyGpuArray_DIM(A, 1); M = PyGpuArray_DIM(A, 1);
N = PyGpuArray_DIM(A, 0); N = PyGpuArray_DIM(A, 0);
K = std::min(M, N); K = M < N ? M : N;
if (MAGMA_SUCCESS != magma_smalloc_pinned(&a_data, M * N)) { if (MAGMA_SUCCESS != magma_smalloc_pinned(&a_data, M * N)) {
PyErr_SetString(PyExc_RuntimeError, PyErr_SetString(PyExc_RuntimeError,
...@@ -166,7 +165,6 @@ fail: ...@@ -166,7 +165,6 @@ fail:
magma_free_pinned(work); magma_free_pinned(work);
if (iwork != NULL) if (iwork != NULL)
magma_free_cpu(iwork); magma_free_cpu(iwork);
magma_finalize();
cuda_exit(params->context->ctx); cuda_exit(params->context->ctx);
return res; return res;
} }
...@@ -74,7 +74,8 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor, ...@@ -74,7 +74,8 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
from .reduction import GpuMaxAndArgmax from .reduction import GpuMaxAndArgmax
from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky, from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky,
cusolver_available, GpuMagmaMatrixInverse, gpu_svd) cusolver_available, GpuMagmaMatrixInverse, gpu_svd,
GpuMagmaCholesky, gpu_qr, GpuMagmaEigh)
_logger = logging.getLogger("theano.gpuarray.opt") _logger = logging.getLogger("theano.gpuarray.opt")
...@@ -2112,9 +2113,6 @@ def local_inplace_gpu_solve(node): ...@@ -2112,9 +2113,6 @@ def local_inplace_gpu_solve(node):
# Cholesky decomposition # Cholesky decomposition
@register_opt('fast_compile')
@op_lifter([slinalg.Cholesky])
@register_opt2([theano.tensor.slinalg.Cholesky], 'fast_compile')
def local_gpu_cholesky(op, context_name, inputs, outputs): def local_gpu_cholesky(op, context_name, inputs, outputs):
if not cusolver_available: if not cusolver_available:
return return
...@@ -2125,19 +2123,99 @@ def local_gpu_cholesky(op, context_name, inputs, outputs): ...@@ -2125,19 +2123,99 @@ def local_gpu_cholesky(op, context_name, inputs, outputs):
return op(inputs[0].astype('float32')).astype('float16') return op(inputs[0].astype('float32')).astype('float16')
return op return op
# Ordered group databases so several lifters for the same CPU op (e.g.
# cusolver vs. magma Cholesky) can be registered with explicit priority.
matrix_ops_db = LocalGroupDB()
matrix_ops_db2 = LocalGroupDB(local_opt=theano.gof.opt.GraphToGPULocalOptGroup)
matrix_ops_db2.__name__ = "matrix_ops_db2"
# For Cholesky decomposition, magma 2.2 is slower than cusolver 8 (tested for
# matrices of size 1000). Thus, cusolver is prioritized during graph
# optimizations. To explicitly use magma, you should disable cusolver using
# `optimizer_excluding=cusolver` in Theano config.
lifter = op_lifter([slinalg.Cholesky])(local_gpu_cholesky)
matrix_ops_db.register("local_gpu_cholesky", lifter,
                       'gpuarray', 'fast_compile', 'fast_run', 'cusolver',
                       position=0)
matrix_ops_db2.register("local_gpu_cholesky",
                        local_optimizer([slinalg.Cholesky])(local_gpu_cholesky),
                        'gpuarray', 'fast_compile', 'fast_run', 'cusolver',
                        position=0)
register_opt('fast_compile', name='matrix_ops_db')(matrix_ops_db)
# NOTE(review): the tracking list below is [slinalg.Solve] although the db
# holds Cholesky lifters — confirm this is intentional and not a typo for
# [slinalg.Cholesky].
register_opt2([slinalg.Solve], 'fast_compile', name='matrix_ops_db2')(matrix_ops_db2)
@register_inplace() @register_inplace()
@local_optimizer([GpuCholesky], inplace=True) @local_optimizer([GpuCholesky], inplace=True)
def local_inplace_cholesky(node): def local_inplace_gpu_cholesky(node):
if isinstance(node.op, GpuCholesky) and not node.op.inplace: if isinstance(node.op, GpuCholesky) and not node.op.inplace:
return [GpuCholesky(lower=node.op.lower, inplace=True)(*node.inputs)] return [node.op.clone_inplace()(*node.inputs)]
def local_gpu_magma_cholesky(op, context_name, inputs, outputs):
    # Lift a CPU Cholesky op to GpuMagmaCholesky when magma is enabled.
    # float16 inputs are computed in float32 and cast back.
    # (Indentation restored; it was lost in the pasted source.)
    if not config.magma.enabled:
        return
    if inputs[0].dtype not in ['float16', 'float32']:
        return
    op = GpuMagmaCholesky(lower=op.lower, inplace=op.destructive)
    if inputs[0].dtype == 'float16':
        return op(inputs[0].astype('float32')).astype('float16')
    return op
# Register the magma Cholesky lifter after the cusolver one (position=1),
# so cusolver wins unless it is excluded (see comment on matrix_ops_db).
lifter = op_lifter([slinalg.Cholesky])(local_gpu_magma_cholesky)
matrix_ops_db.register("local_gpu_magma_cholesky", lifter,
                       'gpuarray', 'fast_compile', 'fast_run', 'magma',
                       position=1)
matrix_ops_db2.register("local_gpu_magma_cholesky",
                        local_optimizer([slinalg.Cholesky])(local_gpu_magma_cholesky),
                        'gpuarray', 'fast_compile', 'fast_run', 'magma',
                        position=1)
@register_inplace()
@local_optimizer([GpuMagmaCholesky], inplace=True)
def local_inplace_gpu_magma_cholesky(node):
    # Replace a non-destructive GpuMagmaCholesky with its in-place variant.
    # (Indentation restored; it was lost in the pasted source.)
    if isinstance(node.op, GpuMagmaCholesky) and not node.op.inplace:
        return [node.op.clone_inplace()(*node.inputs)]
# QR decomposition
@register_opt('magma', 'fast_compile')
@op_lifter([nlinalg.QRFull])
@register_opt2([theano.tensor.nlinalg.QRFull], 'magma', 'fast_compile')
def local_gpu_magma_qr(op, context_name, inputs, outputs):
    # Lift nlinalg.QRFull (reduced mode only) to the magma GPU QR.
    # float16 inputs are computed in float32 and cast back.
    # (Indentation restored; it was lost in the pasted source.)
    if not config.magma.enabled or op.mode != 'reduced':
        return
    if inputs[0].dtype not in ['float16', 'float32']:
        return
    x = inputs[0]
    if inputs[0].dtype == 'float16':
        x = inputs[0].astype('float32')
    out = gpu_qr(x, complete=True)
    if inputs[0].dtype == 'float16':
        return [o.astype('float16') for o in out]
    return out
@register_opt('magma', 'fast_compile')
@op_lifter([nlinalg.QRIncomplete])
@register_opt2([theano.tensor.nlinalg.QRIncomplete], 'magma', 'fast_compile')
def local_gpu_magma_qr_incomplete(op, context_name, inputs, outputs):
    # Lift nlinalg.QRIncomplete (R only) to the magma GPU QR.
    # float16 inputs are computed in float32 and cast back.
    # (Indentation restored; it was lost in the pasted source.)
    if not config.magma.enabled:
        return
    if inputs[0].dtype not in ['float16', 'float32']:
        return
    x = inputs[0]
    if inputs[0].dtype == 'float16':
        x = inputs[0].astype('float32')
    out = gpu_qr(x, complete=False)
    if inputs[0].dtype == 'float16':
        return [out.astype('float16')]
    return out
# Matrix inverse
@register_opt('magma', 'fast_compile') @register_opt('magma', 'fast_compile')
@op_lifter([nlinalg.MatrixInverse]) @op_lifter([nlinalg.MatrixInverse])
@register_opt2([theano.tensor.nlinalg.MatrixInverse], 'magma', 'fast_compile') @register_opt2([theano.tensor.nlinalg.MatrixInverse], 'magma', 'fast_compile')
def local_gpu_matrix_inverse(op, context_name, inputs, outputs): def local_gpu_magma_matrix_inverse(op, context_name, inputs, outputs):
if not config.magma.enabled: if not config.magma.enabled:
return return
if inputs[0].dtype not in ['float16', 'float32']: if inputs[0].dtype not in ['float16', 'float32']:
...@@ -2150,16 +2228,31 @@ def local_gpu_matrix_inverse(op, context_name, inputs, outputs): ...@@ -2150,16 +2228,31 @@ def local_gpu_matrix_inverse(op, context_name, inputs, outputs):
@register_inplace() @register_inplace()
@local_optimizer([GpuMagmaMatrixInverse]) @local_optimizer([GpuMagmaMatrixInverse])
def local_inplace_matrix_inverse_inplace(node): def local_inplace_gpu_magma_matrix_inverse(node):
if isinstance(node.op, GpuMagmaMatrixInverse): if isinstance(node.op, GpuMagmaMatrixInverse) and not node.op.inplace:
if not node.op.inplace: return [node.op.clone_inplace()(*node.inputs)]
return [node.op.clone_inplace()(*node.inputs)]
# Eigen decomposition of a symmetric matrix
@register_opt('magma', 'fast_compile')
@op_lifter([nlinalg.Eigh])
@register_opt2([theano.tensor.nlinalg.Eigh], 'magma', 'fast_compile')
def local_gpu_magma_eigh(op, context_name, inputs, outputs):
    # Lift nlinalg.Eigh to GpuMagmaEigh (always computing eigenvectors,
    # to match the CPU op's two outputs).  float16 is computed in float32.
    # (Indentation restored; it was lost in the pasted source.)
    if not config.magma.enabled:
        return
    if inputs[0].dtype not in ['float16', 'float32']:
        return
    op = GpuMagmaEigh(UPLO=op.UPLO, compute_v=True)
    if inputs[0].dtype == 'float16':
        return op(inputs[0].astype('float32')).astype('float16')
    return op
# Singular Value Decomposition
@register_opt('magma', 'fast_compile') @register_opt('magma', 'fast_compile')
@op_lifter([nlinalg.SVD]) @op_lifter([nlinalg.SVD])
@register_opt2([theano.tensor.nlinalg.SVD], 'magma', 'fast_compile') @register_opt2([theano.tensor.nlinalg.SVD], 'magma', 'fast_compile')
def local_gpu_svd(op, context_name, inputs, outputs): def local_gpu_magma_svd(op, context_name, inputs, outputs):
if not config.magma.enabled: if not config.magma.enabled:
return return
if inputs[0].dtype not in ['float16', 'float32']: if inputs[0].dtype not in ['float16', 'float32']:
......
...@@ -7,10 +7,14 @@ from numpy.linalg.linalg import LinAlgError ...@@ -7,10 +7,14 @@ from numpy.linalg.linalg import LinAlgError
import theano import theano
from theano import config from theano import config
from theano.gpuarray.linalg import (GpuCholesky, GpuMagmaMatrixInverse, from theano.gpuarray.linalg import (GpuCholesky, GpuMagmaCholesky,
GpuMagmaEigh, GpuMagmaMatrixInverse,
GpuMagmaQR, GpuMagmaSVD,
cusolver_available, gpu_matrix_inverse, cusolver_available, gpu_matrix_inverse,
gpu_solve, gpu_svd) gpu_solve, gpu_svd, gpu_qr)
from theano.tensor.nlinalg import matrix_inverse from theano.tensor.nlinalg import (SVD, MatrixInverse, QRFull,
QRIncomplete, eigh, matrix_inverse, qr)
from theano.tensor.slinalg import Cholesky, cholesky
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from .. import gpuarray_shared_constructor from .. import gpuarray_shared_constructor
...@@ -132,7 +136,8 @@ class TestGpuCholesky(unittest.TestCase): ...@@ -132,7 +136,8 @@ class TestGpuCholesky(unittest.TestCase):
A = theano.tensor.matrix("A", dtype="float32") A = theano.tensor.matrix("A", dtype="float32")
cholesky_op = GpuCholesky(lower=lower, inplace=inplace) cholesky_op = GpuCholesky(lower=lower, inplace=inplace)
chol_A = cholesky_op(A) chol_A = cholesky_op(A)
return theano.function([A], chol_A, accept_inplace=inplace, mode=mode_with_gpu) return theano.function([A], chol_A, accept_inplace=inplace,
mode=mode_with_gpu)
def compare_gpu_cholesky_to_np(self, A_val, lower=True, inplace=False): def compare_gpu_cholesky_to_np(self, A_val, lower=True, inplace=False):
# Helper function to compare op output to np.cholesky output. # Helper function to compare op output to np.cholesky output.
...@@ -144,6 +149,12 @@ class TestGpuCholesky(unittest.TestCase): ...@@ -144,6 +149,12 @@ class TestGpuCholesky(unittest.TestCase):
chol_A_res = np.array(res) chol_A_res = np.array(res)
utt.assert_allclose(chol_A_res, chol_A_val) utt.assert_allclose(chol_A_res, chol_A_val)
def test_gpu_cholesky_opt(self):
    # The graph optimizer should lift slinalg.cholesky to GpuCholesky.
    # (Indentation restored; it was lost in the pasted source.)
    A = theano.tensor.matrix("A", dtype="float32")
    fn = theano.function([A], cholesky(A), mode=mode_with_gpu)
    assert any([isinstance(node.op, GpuCholesky)
                for node in fn.maker.fgraph.toposort()])
def test_invalid_input_fail_non_square(self): def test_invalid_input_fail_non_square(self):
# Invalid Cholesky input test with non-square matrix as input. # Invalid Cholesky input test with non-square matrix as input.
A_val = np.random.normal(size=(3, 2)).astype("float32") A_val = np.random.normal(size=(3, 2)).astype("float32")
...@@ -207,6 +218,20 @@ class TestMagma(unittest.TestCase): ...@@ -207,6 +218,20 @@ class TestMagma(unittest.TestCase):
if not config.magma.enabled: if not config.magma.enabled:
self.skipTest('Magma is not enabled, skipping test') self.skipTest('Magma is not enabled, skipping test')
def test_magma_opt_float16(self):
    # Every magma-backed op should accept float16 input (computed in
    # float32 internally) and still be lifted to its GPU counterpart.
    # (Indentation restored; it was lost in the pasted source.)
    ops_to_gpu = [(MatrixInverse(), GpuMagmaMatrixInverse),
                  (SVD(), GpuMagmaSVD),
                  (QRFull(mode='reduced'), GpuMagmaQR),
                  (QRIncomplete(mode='r'), GpuMagmaQR),
                  # TODO: add support for float16 to Eigh numpy
                  # (Eigh(), GpuMagmaEigh),
                  (Cholesky(), GpuMagmaCholesky)]
    for op, gpu_op in ops_to_gpu:
        A = theano.tensor.matrix("A", dtype="float16")
        fn = theano.function([A], op(A), mode=mode_with_gpu.excluding('cusolver'))
        assert any([isinstance(node.op, gpu_op)
                    for node in fn.maker.fgraph.toposort()])
def test_gpu_matrix_inverse(self): def test_gpu_matrix_inverse(self):
A = theano.tensor.fmatrix("A") A = theano.tensor.fmatrix("A")
...@@ -216,7 +241,7 @@ class TestMagma(unittest.TestCase): ...@@ -216,7 +241,7 @@ class TestMagma(unittest.TestCase):
# Copied from theano.tensor.tests.test_basic.rand. # Copied from theano.tensor.tests.test_basic.rand.
A_val = test_rng.rand(N, N).astype('float32') * 2 - 1 A_val = test_rng.rand(N, N).astype('float32') * 2 - 1
A_val_inv = fn(A_val) A_val_inv = fn(A_val)
utt.assert_allclose(np.eye(N), np.dot(A_val_inv, A_val), atol=5e-3) utt.assert_allclose(np.eye(N), np.dot(A_val_inv, A_val), atol=1e-2)
def test_gpu_matrix_inverse_inplace(self): def test_gpu_matrix_inverse_inplace(self):
N = 1000 N = 1000
...@@ -296,3 +321,132 @@ class TestMagma(unittest.TestCase): ...@@ -296,3 +321,132 @@ class TestMagma(unittest.TestCase):
A_val = rand(100, 50).astype('float32') A_val = rand(100, 50).astype('float32')
utt.assert_allclose(f_cpu(A_val), f_gpu(A_val)) utt.assert_allclose(f_cpu(A_val), f_gpu(A_val))
def run_gpu_cholesky(self, A_val, lower=True):
    # Build and run a GpuMagmaCholesky graph on A_val; cusolver is
    # excluded so the magma op is the one exercised.
    # (Indentation restored; it was lost in the pasted source.)
    A = theano.tensor.fmatrix("A")
    f = theano.function([A], GpuMagmaCholesky(lower=lower)(A),
                        mode=mode_with_gpu.excluding('cusolver'))
    return f(A_val)
def rand_symmetric(self, N):
    # Build an N x N symmetric positive-definite float32 matrix.
    # (Indentation restored; it was lost in the pasted source.)
    A = rand(N, N).astype('float32')
    # ensure that eigenvalues are not too small which sometimes results in
    # magma cholesky failure due to gpu limited numerical precision
    D, W = np.linalg.eigh(A)
    D[D < 1] = 1
    V_m = np.zeros_like(A)
    np.fill_diagonal(V_m, D)
    # W is orthogonal, so W.T @ diag(D) @ W is symmetric with the clipped
    # eigenvalues D (all >= 1), hence positive definite.
    return np.dot(np.dot(W.T, V_m), W)
def check_cholesky(self, N, lower=True, rtol=None, atol=None):
    # Verify L @ L.T reconstructs A for the GPU Cholesky factor.
    # (Indentation restored; it was lost in the pasted source.)
    A = self.rand_symmetric(N)
    L = self.run_gpu_cholesky(A, lower=lower)
    if not lower:
        # Upper factor U satisfies U.T @ U = A, i.e. (U.T)(U.T).T = A.
        L = L.T
    utt.assert_allclose(np.dot(L, L.T), A, rtol=rtol, atol=atol)
def test_gpu_cholesky(self):
    # Both triangle conventions on a 1000x1000 SPD matrix.
    # (Indentation restored; it was lost in the pasted source.)
    self.check_cholesky(1000, atol=1e-3)
    self.check_cholesky(1000, lower=False, atol=1e-3)
def test_gpu_cholesky_opt(self):
    # With cusolver excluded, cholesky should be lifted to the magma op.
    # (Indentation restored; it was lost in the pasted source.)
    A = theano.tensor.matrix("A", dtype="float32")
    fn = theano.function([A], cholesky(A), mode=mode_with_gpu.excluding('cusolver'))
    assert any([isinstance(node.op, GpuMagmaCholesky)
                for node in fn.maker.fgraph.toposort()])
def test_gpu_cholesky_inplace(self):
    # The inplace op should overwrite the shared input with its factor.
    # (Indentation restored; it was lost in the pasted source.)
    A = self.rand_symmetric(1000)
    A_gpu = gpuarray_shared_constructor(A)
    A_copy = A_gpu.get_value()
    fn = theano.function([], GpuMagmaCholesky(inplace=True)(A_gpu),
                         mode=mode_with_gpu, accept_inplace=True)
    fn()
    L = A_gpu.get_value()
    utt.assert_allclose(np.dot(L, L.T), A_copy, atol=1e-3)
def test_gpu_cholesky_inplace_opt(self):
    # The inplace-substitution optimizer should rewrite the op to inplace.
    # (Indentation restored; it was lost in the pasted source.)
    A = theano.tensor.fmatrix("A")
    fn = theano.function([A], GpuMagmaCholesky()(A), mode=mode_with_gpu)
    assert any([
        node.op.inplace
        for node in fn.maker.fgraph.toposort() if
        isinstance(node.op, GpuMagmaCholesky)
    ])
def run_gpu_qr(self, A_val, complete=True):
    # Run gpu_qr on A_val; returns (Q, R) if complete else R only.
    # (Indentation restored; it was lost in the pasted source.)
    A = theano.tensor.fmatrix("A")
    fn = theano.function([A], gpu_qr(A, complete=complete),
                         mode=mode_with_gpu)
    return fn(A_val)
def check_gpu_qr(self, M, N, complete=True, rtol=None, atol=None):
    # Compare the GPU QR against numpy's reduced-mode QR.
    # (Indentation restored; it was lost in the pasted source.)
    A = rand(M, N).astype('float32')
    if complete:
        Q_gpu, R_gpu = self.run_gpu_qr(A, complete=complete)
    else:
        R_gpu = self.run_gpu_qr(A, complete=complete)
    Q_np, R_np = np.linalg.qr(A, mode='reduced')
    utt.assert_allclose(R_np, R_gpu, rtol=rtol, atol=atol)
    if complete:
        utt.assert_allclose(Q_np, Q_gpu, rtol=rtol, atol=atol)
def test_gpu_qr(self):
    # Tall and wide matrices, with and without the Q factor.
    # (Indentation restored; it was lost in the pasted source.)
    self.check_gpu_qr(1000, 500, atol=1e-3)
    self.check_gpu_qr(1000, 500, complete=False, atol=1e-3)
    self.check_gpu_qr(500, 1000, atol=1e-3)
    self.check_gpu_qr(500, 1000, complete=False, atol=1e-3)
def test_gpu_qr_opt(self):
    # nlinalg.qr (full mode) should be lifted to GpuMagmaQR(complete=True).
    # (Indentation restored; it was lost in the pasted source.)
    A = theano.tensor.fmatrix("A")
    fn = theano.function([A], qr(A), mode=mode_with_gpu)
    assert any([
        isinstance(node.op, GpuMagmaQR) and node.op.complete
        for node in fn.maker.fgraph.toposort()
    ])
def test_gpu_qr_incomplete_opt(self):
    # qr(..., mode='r') should be lifted to GpuMagmaQR(complete=False).
    # (Indentation restored; it was lost in the pasted source.)
    A = theano.tensor.fmatrix("A")
    fn = theano.function([A], qr(A, mode='r'), mode=mode_with_gpu)
    assert any([
        isinstance(node.op, GpuMagmaQR) and not node.op.complete
        for node in fn.maker.fgraph.toposort()
    ])
def run_gpu_eigh(self, A_val, UPLO='L', compute_v=True):
    # Run GpuMagmaEigh on A_val; returns (d, v) or d only.
    # (Indentation restored; it was lost in the pasted source.)
    A = theano.tensor.fmatrix("A")
    fn = theano.function([A], GpuMagmaEigh(UPLO=UPLO, compute_v=compute_v)(A),
                         mode=mode_with_gpu)
    return fn(A_val)
def check_gpu_eigh(self, N, UPLO='L', compute_v=True, rtol=None, atol=None):
    # Compare GPU eigh with numpy: eigenvalues, orthonormality of the
    # eigenvectors, and reconstruction A = V diag(d) V.T.
    # (Indentation restored; it was lost in the pasted source.)
    A = rand(N, N).astype('float32')
    A = np.dot(A.T, A)  # make A symmetric (and PSD)
    d_np, v_np = np.linalg.eigh(A, UPLO=UPLO)
    if compute_v:
        d_gpu, v_gpu = self.run_gpu_eigh(A, UPLO=UPLO, compute_v=compute_v)
    else:
        d_gpu = self.run_gpu_eigh(A, UPLO=UPLO, compute_v=False)
    utt.assert_allclose(d_np, d_gpu, rtol=rtol, atol=atol)
    if compute_v:
        utt.assert_allclose(
            np.eye(N), np.dot(v_gpu, v_gpu.T), rtol=rtol, atol=atol)
        D_m = np.zeros_like(A)
        np.fill_diagonal(D_m, d_gpu)
        utt.assert_allclose(
            A, np.dot(np.dot(v_gpu, D_m), v_gpu.T), rtol=rtol, atol=atol)
def test_gpu_eigh(self):
    # Both triangles, with and without eigenvectors.
    # (Indentation restored; it was lost in the pasted source.)
    self.check_gpu_eigh(1000, UPLO='L', atol=1e-3)
    self.check_gpu_eigh(1000, UPLO='U', atol=1e-3)
    self.check_gpu_eigh(1000, UPLO='L', compute_v=False, atol=1e-3)
    self.check_gpu_eigh(1000, UPLO='U', compute_v=False, atol=1e-3)
def test_gpu_eigh_opt(self):
    # nlinalg.eigh should be lifted to GpuMagmaEigh.
    # (Indentation restored; it was lost in the pasted source.)
    A = theano.tensor.fmatrix("A")
    fn = theano.function([A], eigh(A), mode=mode_with_gpu)
    assert any([
        isinstance(node.op, GpuMagmaEigh)
        for node in fn.maker.fgraph.toposort()
    ])
...@@ -539,7 +539,7 @@ class QRIncomplete(Op): ...@@ -539,7 +539,7 @@ class QRIncomplete(Op):
Incomplete QR Decomposition. Incomplete QR Decomposition.
Computes the QR decomposition of a matrix. Computes the QR decomposition of a matrix.
Factor the matrix a as qr and return a single matrix. Factor the matrix a as qr and return a single matrix R.
""" """
...@@ -552,15 +552,14 @@ class QRIncomplete(Op): ...@@ -552,15 +552,14 @@ class QRIncomplete(Op):
def make_node(self, x): def make_node(self, x):
x = as_tensor_variable(x) x = as_tensor_variable(x)
assert x.ndim == 2, "The input of qr function should be a matrix." assert x.ndim == 2, "The input of qr function should be a matrix."
q = theano.tensor.matrix(dtype=x.dtype) r = theano.tensor.matrix(dtype=x.dtype)
return Apply(self, [x], [q]) return Apply(self, [x], [r])
def perform(self, node, inputs, outputs): def perform(self, node, inputs, outputs):
(x,) = inputs (x,) = inputs
(q,) = outputs (r,) = outputs
assert x.ndim == 2, "The input of qr function should be a matrix." assert x.ndim == 2, "The input of qr function should be a matrix."
q[0] = self._numop(x, r[0] = self._numop(x, self.mode)
self.mode)
def qr(a, mode="reduced"): def qr(a, mode="reduced"):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论