提交 e76d05d6 authored 作者: abergeron's avatar abergeron 提交者: GitHub

Merge pull request #5914 from aam-at/magma_ops

Magma gpu QR/Cholesky/Eigh
差异被折叠。
#section kernels
#kernel tril_kernel : size, size, size, *:
// Zero the strictly upper-triangular part of a row-major (C-contiguous)
// matrix `a` with `ncols` columns and `nthreads` total elements, keeping
// the lower triangle and the diagonal.  `a_off` is the byte offset of the
// data inside the buffer.
KERNEL void tril_kernel(const ga_size nthreads, const ga_size ncols,
                        const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
  a = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)a) + a_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0; index < nthreads;
       index += LDIM_0 * GDIM_0) {
    // ga_size (not unsigned int) so the row/column indices cannot truncate
    // when the matrix has more than 2^32 elements.
    const ga_size ix = index / ncols;  // row
    const ga_size iy = index % ncols;  // column
    if (ix < iy) {
      a[index] = 0.0;
    }
  }
}
#kernel triu_kernel : size, size, size, *:
// Zero the strictly lower-triangular part of a row-major (C-contiguous)
// matrix `a` with `ncols` columns and `nthreads` total elements, keeping
// the upper triangle and the diagonal.  `a_off` is the byte offset of the
// data inside the buffer.
KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols,
                        const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
  a = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)a) + a_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0; index < nthreads;
       index += LDIM_0 * GDIM_0) {
    // ga_size (not unsigned int) so the row/column indices cannot truncate
    // when the matrix has more than 2^32 elements.
    const ga_size ix = index / ncols;  // row
    const ga_size iy = index % ncols;  // column
    if (ix > iy) {
      a[index] = 0.0;
    }
  }
}
#section init_code
setup_ext_cuda();
#section support_code_struct
int APPLY_SPECIFIC(magma_cholesky)(PyGpuArrayObject *A, PyGpuArrayObject **L,
                                   PARAMS_TYPE *params) {
  /* Cholesky factorization of a square, C-contiguous float32 matrix on the
     GPU via MAGMA's spotrf.  The factor is written into *L (which aliases A
     when params->inplace).  Returns 0 on success, -1 on failure with a
     Python exception set. */
  const size_t *dims;
  size_t N, n2;
  magma_uplo_t ul;
  int res = -1, info;
  if (A->ga.typecode != GA_FLOAT) {
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaCholesky: unsupported data type");
    return -1;
  }
  // This is early to match the exit() in the fail label.
  cuda_enter(params->context->ctx);
  if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaCholesky: requires data to be C-contiguous");
    goto fail;
  }
  if (PyGpuArray_NDIM(A) != 2) {
    PyErr_SetString(PyExc_ValueError, "GpuMagmaCholesky: matrix rank error");
    goto fail;
  }
  dims = PyGpuArray_DIMS(A);
  if (dims[0] != dims[1]) {
    PyErr_SetString(PyExc_ValueError, "GpuMagmaCholesky: matrix is not square");
    goto fail;
  }
  if (params->inplace) {
    // Output aliases the input; take our own reference on it.
    Py_XDECREF(*L);
    *L = A;
    Py_INCREF(*L);
  } else {
    *L = theano_try_copy(*L, A);
    if (*L == NULL) {
      PyErr_SetString(
          PyExc_RuntimeError,
          "GpuMagmaCholesky: failed to allocate memory for the output");
      goto fail;
    }
  }
  // magma matrix cholesky
  N = dims[0];
  n2 = N * N;
  // Magma requires column-major order for the matrix A. Instead of changing
  // matrix order which requires copying data, we can compute cholesky
  // decomposition where we change parameters lower to upper and upper to
  // lower.
  if (params->lower) {
    ul = MagmaUpper;
  }
  else {
    ul = MagmaLower;
  }
  magma_spotrf_gpu(ul, N, (float *)PyGpuArray_DEV_DATA(*L), N, &info);
  if (info > 0) {
    PyErr_Format(PyExc_RuntimeError, "GpuMagmaCholesky: the leading minor of "
                 "order %d is not positive definite",
                 info);
    goto fail;
  } else if (info < 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaCholesky: magma_spotrf_gpu argument %d has an illegal value",
        -info);
    goto fail;
  }
  // spotrf leaves the opposite triangle untouched; zero it so the output
  // is a proper triangular factor.
  if (params->lower) {
    res = tril_kernel_scall(1, &n2, 0, n2, N, (*L)->ga.offset, (*L)->ga.data);
    if (res != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "GpuMagmaCholesky: tril_kernel %s.",
                   GpuKernel_error(&k_tril_kernel, res));
      // Normalize: this function returns -1 on any failure, not a GA_* code.
      res = -1;
      goto fail;
    }
  } else {
    res = triu_kernel_scall(1, &n2, 0, n2, N, (*L)->ga.offset, (*L)->ga.data);
    if (res != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "GpuMagmaCholesky: triu_kernel %s.",
                   GpuKernel_error(&k_triu_kernel, res));
      // Normalize: this function returns -1 on any failure, not a GA_* code.
      res = -1;
      goto fail;
    }
  }
  res = 0;
fail:
  cuda_exit(params->context->ctx);
  return res;
}
#section init_code
setup_ext_cuda();
#section support_code_struct
int APPLY_SPECIFIC(magma_eigh)(PyGpuArrayObject *A_,
                               PyGpuArrayObject **D,
                               PyGpuArrayObject **V, // may be NULL
                               PARAMS_TYPE *params) {
  /* Eigendecomposition of a square, C-contiguous float32 symmetric matrix
     via MAGMA's ssyevd.  Eigenvalues are written to *D; when
     params->compute_v, the eigenvectors are written to *V.  Returns 0 on
     success, -1 on failure with a Python exception set. */
  PyGpuArrayObject *A = NULL;
  magma_int_t N, liwork, *iwork_data = NULL;
  size_t d_dims[1];
  magma_uplo_t uplo;
  magma_vec_t jobz;
  // `lwork` is a float because the workspace-size query writes the optimal
  // size into the first element of the (float) work array.
  float *w_data = NULL, *wA_data = NULL, *work_data = NULL, lwork;
  int res = -1, info;
  if (A_->ga.typecode != GA_FLOAT) {
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaEigh: Unsupported data type");
    return -1;
  }
  // This is early to match the exit() in the fail label.
  cuda_enter(params->context->ctx);
  if (!GpuArray_IS_C_CONTIGUOUS(&A_->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaEigh: requires data to be C-contiguous");
    goto fail;
  }
  if (PyGpuArray_NDIM(A_) != 2) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaEigh: matrix rank error");
    goto fail;
  }
  if (PyGpuArray_DIM(A_, 0) != PyGpuArray_DIM(A_, 1)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaEigh: matrix is not square");
    goto fail;
  }
  // MAGMA expects column-major data, so work on an F-ordered copy.
  A = pygpu_copy(A_, GA_F_ORDER);
  if (A == NULL) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to change to column-major order");
    // Was `return -1`, which skipped cuda_exit() and leaked the context.
    goto fail;
  }
  // magma matrix eigen decomposition of a symmetric matrix
  N = PyGpuArray_DIM(A, 0);
  if (params->lower) {
    uplo = MagmaLower;
  } else {
    uplo = MagmaUpper;
  }
  if (params->compute_v) {
    jobz = MagmaVec;
  } else {
    jobz = MagmaNoVec;
  }
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&w_data, N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&wA_data, N * N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }
  // Workspace-size query: lwork == -1 / liwork == -1 makes ssyevd report
  // the optimal sizes in `lwork` and `liwork` without computing anything.
  magma_ssyevd_gpu(jobz, uplo, N, NULL, N, NULL, NULL, N, &lwork,
                   -1, &liwork, -1, &info);
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&work_data, (size_t)lwork)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }
  if (MAGMA_SUCCESS != magma_imalloc_cpu(&iwork_data, liwork)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }
  magma_ssyevd_gpu(jobz, uplo, N, (float *)PyGpuArray_DEV_DATA(A), N, w_data,
                   wA_data, N, work_data, (size_t)lwork, iwork_data, liwork,
                   &info);
  if (info > 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaEigh: %d off-diagonal elements of an intermediate "
        "tridiagonal form did not converge to zero",
        info);
    goto fail;
  } else if (info < 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaEigh: magma_ssyevd_gpu argument %d has an illegal value", -info);
    goto fail;
  }
  d_dims[0] = N;
  if (theano_prep_output(D, 1, d_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate memory for the output");
    goto fail;
  }
  // w_data is pinned *host* memory.  NOTE(review): the DeviceToDevice kind
  // relies on unified addressing treating the pinned buffer as device
  // accessible -- confirm, or use cudaMemcpyDefault.
  cudaMemcpy(PyGpuArray_DEV_DATA(*D), w_data, N * sizeof(float),
             cudaMemcpyDeviceToDevice);
  if (params->compute_v) {
    // With jobz == MagmaVec, ssyevd has overwritten A with the eigenvectors.
    *V = theano_try_copy(*V, A);
    if (*V == NULL) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaEigh: failed to allocate memory for the output");
      goto fail;
    }
  }
  res = 0;
fail:
  if (w_data != NULL)
    magma_free_pinned(w_data);
  if (wA_data != NULL)
    magma_free_pinned(wA_data);
  if (work_data != NULL)
    magma_free_pinned(work_data);
  if (iwork_data != NULL)
    magma_free_cpu(iwork_data);
  Py_XDECREF(A);
  cuda_exit(params->context->ctx);
  return res;
}
......@@ -5,7 +5,7 @@ setup_ext_cuda();
#section support_code_struct
int APPLY_SPECIFIC(magma_inv)(PyGpuArrayObject *A, PyGpuArrayObject **A_inv,
PARAMS_TYPE* params) {
PARAMS_TYPE *params) {
const size_t *dims;
magma_int_t N, ldwork, info;
magma_int_t *piv = NULL;
......@@ -20,12 +20,11 @@ int APPLY_SPECIFIC(magma_inv)(PyGpuArrayObject *A, PyGpuArrayObject **A_inv,
// This is early to match the exit() in the fail label.
cuda_enter(params->context->ctx);
magma_init();
if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
PyErr_SetString(PyExc_ValueError,
"GpuMagmaMatrixInverse: requires data to be C-contiguous");
return 1;
goto fail;
}
if (PyGpuArray_NDIM(A) != 2) {
PyErr_SetString(PyExc_ValueError,
......@@ -93,7 +92,6 @@ fail:
magma_free(piv);
if (dwork != NULL)
gpudata_release(dwork);
magma_finalize();
cuda_exit(params->context->ctx);
return res;
}
#section kernels
#kernel triu_kernel : size, size, size, *:
// Zero the strictly lower-triangular part of a row-major (C-contiguous)
// matrix `a` with `ncols` columns and `nthreads` total elements, keeping
// the upper triangle and the diagonal.  `a_off` is the byte offset of the
// data inside the buffer.
KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols,
const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
a = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)a) + a_off);
// grid stride looping
for (ga_size index = GID_0 * LDIM_0 + LID_0; index < nthreads;
index += LDIM_0 * GDIM_0) {
const ga_size ix = index / ncols;  // row
const ga_size iy = index % ncols;  // column
if (ix > iy) {
a[index] = 0.0;
}
}
}
#section init_code
setup_ext_cuda();
#section support_code
// Return a C-ordered copy of `src` with dimension `dim` shrunk to `size`.
// The copy goes through a temporary view, so `src` itself is untouched.
// Returns a new reference, or NULL (with a Python exception set) on error.
static PyGpuArrayObject *pygpu_narrow(PyGpuArrayObject *src, size_t dim,
                                      size_t size) {
  PyGpuArrayObject *src_view;
  PyGpuArrayObject *res;
  src_view = pygpu_view(src, Py_None);
  if (src_view == NULL)
    return NULL;  // was dereferenced unconditionally before
  src_view->ga.dimensions[dim] = size;
  GpuArray_fix_flags(&src_view->ga);
  res = pygpu_copy(src_view, GA_C_ORDER);
  Py_DECREF(src_view);  // the view reference was previously leaked
  return res;
}
#section support_code_struct
int APPLY_SPECIFIC(magma_qr)(PyGpuArrayObject *A_,
PyGpuArrayObject **R,
PyGpuArrayObject **Q, // may be NULL
PARAMS_TYPE* params) {
PyGpuArrayObject *A = NULL;
magma_int_t M, N, K, nb, ldwork;
size_t n2;
float *tau_data = NULL;
gpudata *work_data = NULL;
int res = -1, info;
A = A_;
if (A->ga.typecode != GA_FLOAT) {
PyErr_SetString(PyExc_TypeError,
"GpuMagmaQR: Unsupported data type");
return -1;
}
// This is early to match the exit() in the fail label.
cuda_enter(params->context->ctx);
if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
PyErr_SetString(PyExc_ValueError,
"GpuMagmaQR: requires data to be C-contiguous");
goto fail;
}
if (PyGpuArray_NDIM(A) != 2) {
PyErr_SetString(PyExc_ValueError, "GpuMagmaQR: matrix rank error");
goto fail;
}
A = pygpu_copy(A_, GA_F_ORDER);
if (A == NULL) {
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaQR: failed to change to column-major order");
goto fail;
}
// magma matrix qr
M = PyGpuArray_DIM(A, 0);
N = PyGpuArray_DIM(A, 1);
K = M < N ? M : N;
if (MAGMA_SUCCESS != magma_smalloc_pinned(&tau_data, N * N)) {
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaQR: failed to allocate working memory");
goto fail;
}
nb = magma_get_sgeqrf_nb(M, N);
ldwork = (2 * K + magma_roundup(N, 32)) * nb;
work_data = gpudata_alloc(params->context->ctx, ldwork * sizeof(float), NULL, 0, NULL);
if (work_data == NULL) {
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaQR: failed to allocate working memory");
goto fail;
}
// compute R
magma_sgeqrf2_gpu(M, N, (float *)PyGpuArray_DEV_DATA(A), M, tau_data, &info);
if (info != 0) {
PyErr_Format(
PyExc_RuntimeError,
"GpuMagmaQR: magma_sgeqrf2_gpu argument %d has an illegal value", -info);
goto fail;
}
*R = pygpu_narrow(A, 0, K);
if (*R == NULL) {
PyErr_SetString(PyExc_RuntimeError, "GpuMagmaQR: failed to narrow array");
goto fail;
}
n2 = K * N;
res = triu_kernel_scall(1, &n2, 0, n2, N, (*R)->ga.offset, (*R)->ga.data);
if (res != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError, "GpuMagmaQR: triu_kernel %s.",
GpuKernel_error(&k_triu_kernel, res));
goto fail;
}
if (params->complete) {
// compute Q
Py_XDECREF(A);
A = pygpu_copy(A_, GA_F_ORDER);
if (A == NULL) {
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaQR: failed to change to column-major order");
return -1;
}
magma_sgeqrf_gpu(M, N, (float *)PyGpuArray_DEV_DATA(A), M, tau_data,
*(float **)work_data, &info);
if (info != 0) {
PyErr_Format(
PyExc_RuntimeError,
"GpuMagmaQR: magma_sgeqrf_gpu argument %d has an illegal value", -info);
goto fail;
}
magma_sorgqr_gpu(M, K, K, (float *)PyGpuArray_DEV_DATA(A), M, tau_data,
*(float **)work_data, nb, &info);
if (info != 0) {
PyErr_Format(
PyExc_RuntimeError,
"GpuMagmaQR: magma_sorgqr_gpu argument %d has an illegal value", -info);
goto fail;
}
*Q = pygpu_narrow(A, 1, K);
if (*Q == NULL) {
PyErr_SetString(PyExc_RuntimeError, "GpuMagmaQR: failed to narrow array");
goto fail;
}
}
res = 0;
fail:
if (tau_data != NULL)
magma_free_pinned(tau_data);
if (work_data != NULL)
gpudata_release(work_data);
Py_XDECREF(A);
cuda_exit(params->context->ctx);
return res;
}
......@@ -8,7 +8,7 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
PyGpuArrayObject **S,
PyGpuArrayObject **U, // may be NULL
PyGpuArrayObject **VT, // may be NULL
PARAMS_TYPE* params) {
PARAMS_TYPE *params) {
bool compute_uv = (U != NULL);
magma_int_t *iwork = NULL, iunused[1];
magma_int_t M, N, K, ldu, ldv, M_U, N_VT, info;
......@@ -21,22 +21,21 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
if (A->ga.typecode != GA_FLOAT) {
PyErr_SetString(PyExc_TypeError,
"GpuMagmaMatrixInverse: Unsupported data type");
"GpuMagmaSVD: Unsupported data type");
return -1;
}
// This is early to match the exit() in the fail label.
cuda_enter(params->context->ctx);
magma_init();
if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
PyErr_SetString(PyExc_ValueError,
"GpuMagmaMatrixInverse: requires data to be C-contiguous");
return 1;
"GpuMagmaSVD: requires data to be C-contiguous");
goto fail;
}
if (PyGpuArray_NDIM(A) != 2) {
PyErr_SetString(PyExc_ValueError,
"GpuMagmaMatrixInverse: matrix rank error");
"GpuMagmaSVD: matrix rank error");
goto fail;
}
......@@ -44,7 +43,7 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
// reverse dimensions because MAGMA expects column-major matrices:
M = PyGpuArray_DIM(A, 1);
N = PyGpuArray_DIM(A, 0);
K = std::min(M, N);
K = M < N ? M : N;
if (MAGMA_SUCCESS != magma_smalloc_pinned(&a_data, M * N)) {
PyErr_SetString(PyExc_RuntimeError,
......@@ -166,7 +165,6 @@ fail:
magma_free_pinned(work);
if (iwork != NULL)
magma_free_cpu(iwork);
magma_finalize();
cuda_exit(params->context->ctx);
return res;
}
......@@ -74,7 +74,8 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
from .reduction import GpuMaxAndArgmax
from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky,
cusolver_available, GpuMagmaMatrixInverse, gpu_svd)
cusolver_available, GpuMagmaMatrixInverse, gpu_svd,
GpuMagmaCholesky, gpu_qr, GpuMagmaEigh)
_logger = logging.getLogger("theano.gpuarray.opt")
......@@ -2112,9 +2113,6 @@ def local_inplace_gpu_solve(node):
# Cholesky decomposition
@register_opt('fast_compile')
@op_lifter([slinalg.Cholesky])
@register_opt2([theano.tensor.slinalg.Cholesky], 'fast_compile')
def local_gpu_cholesky(op, context_name, inputs, outputs):
if not cusolver_available:
return
......@@ -2125,19 +2123,99 @@ def local_gpu_cholesky(op, context_name, inputs, outputs):
return op(inputs[0].astype('float32')).astype('float16')
return op
# Local optimizer groups for matrix ops: entries are tried in `position`
# order, so lower positions take priority.
matrix_ops_db = LocalGroupDB()
matrix_ops_db2 = LocalGroupDB(local_opt=theano.gof.opt.GraphToGPULocalOptGroup)
matrix_ops_db2.__name__ = "matrix_ops_db2"
# For Cholesky decomposition, magma 2.2 is slower than cusolver 8 (tested for
# matrices of size 1000). Thus, cusolver is prioritized during graph
# optimizations. To explicitly use magma, you should disable cusolver using
# `optimizer_excluding=cusolver` in Theano config.
lifter = op_lifter([slinalg.Cholesky])(local_gpu_cholesky)
matrix_ops_db.register("local_gpu_cholesky", lifter,
                       'gpuarray', 'fast_compile', 'fast_run', 'cusolver',
                       position=0)
matrix_ops_db2.register("local_gpu_cholesky",
                        local_optimizer([slinalg.Cholesky])(local_gpu_cholesky),
                        'gpuarray', 'fast_compile', 'fast_run', 'cusolver',
                        position=0)
register_opt('fast_compile', name='matrix_ops_db')(matrix_ops_db)
# NOTE(review): the tracker here is slinalg.Solve while this group registers
# Cholesky lifters -- verify this is intended and not a copy-paste slip.
register_opt2([slinalg.Solve], 'fast_compile', name='matrix_ops_db2')(matrix_ops_db2)
@register_inplace()
@local_optimizer([GpuCholesky], inplace=True)
def local_inplace_cholesky(node):
def local_inplace_gpu_cholesky(node):
if isinstance(node.op, GpuCholesky) and not node.op.inplace:
return [GpuCholesky(lower=node.op.lower, inplace=True)(*node.inputs)]
return [node.op.clone_inplace()(*node.inputs)]
def local_gpu_magma_cholesky(op, context_name, inputs, outputs):
    # Lift a CPU Cholesky op to GpuMagmaCholesky when magma is enabled.
    if not config.magma.enabled:
        return
    if inputs[0].dtype not in ['float16', 'float32']:
        return
    op = GpuMagmaCholesky(lower=op.lower, inplace=op.destructive)
    if inputs[0].dtype == 'float16':
        # Magma only handles float32 here: compute in float32, cast back.
        return op(inputs[0].astype('float32')).astype('float16')
    return op
# Register the magma Cholesky lifter at position=1, i.e. after the cusolver
# one (position=0), so cusolver wins when both backends are available.
lifter = op_lifter([slinalg.Cholesky])(local_gpu_magma_cholesky)
matrix_ops_db.register("local_gpu_magma_cholesky", lifter,
                       'gpuarray', 'fast_compile', 'fast_run', 'magma',
                       position=1)
matrix_ops_db2.register("local_gpu_magma_cholesky",
                        local_optimizer([slinalg.Cholesky])(local_gpu_magma_cholesky),
                        'gpuarray', 'fast_compile', 'fast_run', 'magma',
                        position=1)
@register_inplace()
@local_optimizer([GpuMagmaCholesky], inplace=True)
def local_inplace_gpu_magma_cholesky(node):
    # Replace a non-inplace GpuMagmaCholesky with its inplace variant.
    if isinstance(node.op, GpuMagmaCholesky) and not node.op.inplace:
        return [node.op.clone_inplace()(*node.inputs)]
# QR decomposition
@register_opt('magma', 'fast_compile')
@op_lifter([nlinalg.QRFull])
@register_opt2([theano.tensor.nlinalg.QRFull], 'magma', 'fast_compile')
def local_gpu_magma_qr(op, context_name, inputs, outputs):
    # Lift a full QR op to the magma GPU implementation.  Only the
    # 'reduced' mode is supported.
    if not config.magma.enabled or op.mode != 'reduced':
        return
    if inputs[0].dtype not in ['float16', 'float32']:
        return
    x = inputs[0]
    if inputs[0].dtype == 'float16':
        # Magma only handles float32 here: compute in float32, cast back.
        x = inputs[0].astype('float32')
    out = gpu_qr(x, complete=True)
    if inputs[0].dtype == 'float16':
        return [o.astype('float16') for o in out]
    return out
@register_opt('magma', 'fast_compile')
@op_lifter([nlinalg.QRIncomplete])
@register_opt2([theano.tensor.nlinalg.QRIncomplete], 'magma', 'fast_compile')
def local_gpu_magma_qr_incomplete(op, context_name, inputs, outputs):
    # Lift an incomplete QR (single matrix output) to the magma GPU op.
    if not config.magma.enabled:
        return
    if inputs[0].dtype not in ['float16', 'float32']:
        return
    x = inputs[0]
    if inputs[0].dtype == 'float16':
        # Magma only handles float32 here: compute in float32, cast back.
        x = inputs[0].astype('float32')
    out = gpu_qr(x, complete=False)
    if inputs[0].dtype == 'float16':
        return [out.astype('float16')]
    return out
# Matrix inverse
@register_opt('magma', 'fast_compile')
@op_lifter([nlinalg.MatrixInverse])
@register_opt2([theano.tensor.nlinalg.MatrixInverse], 'magma', 'fast_compile')
def local_gpu_matrix_inverse(op, context_name, inputs, outputs):
def local_gpu_magma_matrix_inverse(op, context_name, inputs, outputs):
if not config.magma.enabled:
return
if inputs[0].dtype not in ['float16', 'float32']:
......@@ -2150,16 +2228,31 @@ def local_gpu_matrix_inverse(op, context_name, inputs, outputs):
@register_inplace()
@local_optimizer([GpuMagmaMatrixInverse])
def local_inplace_matrix_inverse_inplace(node):
if isinstance(node.op, GpuMagmaMatrixInverse):
if not node.op.inplace:
return [node.op.clone_inplace()(*node.inputs)]
def local_inplace_gpu_magma_matrix_inverse(node):
if isinstance(node.op, GpuMagmaMatrixInverse) and not node.op.inplace:
return [node.op.clone_inplace()(*node.inputs)]
# Eigen decomposition of a symmetric matrix
@register_opt('magma', 'fast_compile')
@op_lifter([nlinalg.Eigh])
@register_opt2([theano.tensor.nlinalg.Eigh], 'magma', 'fast_compile')
def local_gpu_magma_eigh(op, context_name, inputs, outputs):
    # Lift a CPU Eigh op to GpuMagmaEigh when magma is enabled.
    if not config.magma.enabled:
        return
    if inputs[0].dtype not in ['float16', 'float32']:
        return
    # compute_v=True since the CPU Eigh op presumably produces both
    # eigenvalues and eigenvectors -- TODO confirm against nlinalg.Eigh.
    op = GpuMagmaEigh(UPLO=op.UPLO, compute_v=True)
    if inputs[0].dtype == 'float16':
        # Magma only handles float32 here: compute in float32, cast back.
        return op(inputs[0].astype('float32')).astype('float16')
    return op
# Singular Value Decomposition
@register_opt('magma', 'fast_compile')
@op_lifter([nlinalg.SVD])
@register_opt2([theano.tensor.nlinalg.SVD], 'magma', 'fast_compile')
def local_gpu_svd(op, context_name, inputs, outputs):
def local_gpu_magma_svd(op, context_name, inputs, outputs):
if not config.magma.enabled:
return
if inputs[0].dtype not in ['float16', 'float32']:
......
......@@ -7,10 +7,14 @@ from numpy.linalg.linalg import LinAlgError
import theano
from theano import config
from theano.gpuarray.linalg import (GpuCholesky, GpuMagmaMatrixInverse,
from theano.gpuarray.linalg import (GpuCholesky, GpuMagmaCholesky,
GpuMagmaEigh, GpuMagmaMatrixInverse,
GpuMagmaQR, GpuMagmaSVD,
cusolver_available, gpu_matrix_inverse,
gpu_solve, gpu_svd)
from theano.tensor.nlinalg import matrix_inverse
gpu_solve, gpu_svd, gpu_qr)
from theano.tensor.nlinalg import (SVD, MatrixInverse, QRFull,
QRIncomplete, eigh, matrix_inverse, qr)
from theano.tensor.slinalg import Cholesky, cholesky
from theano.tests import unittest_tools as utt
from .. import gpuarray_shared_constructor
......@@ -132,7 +136,8 @@ class TestGpuCholesky(unittest.TestCase):
A = theano.tensor.matrix("A", dtype="float32")
cholesky_op = GpuCholesky(lower=lower, inplace=inplace)
chol_A = cholesky_op(A)
return theano.function([A], chol_A, accept_inplace=inplace, mode=mode_with_gpu)
return theano.function([A], chol_A, accept_inplace=inplace,
mode=mode_with_gpu)
def compare_gpu_cholesky_to_np(self, A_val, lower=True, inplace=False):
# Helper function to compare op output to np.cholesky output.
......@@ -144,6 +149,12 @@ class TestGpuCholesky(unittest.TestCase):
chol_A_res = np.array(res)
utt.assert_allclose(chol_A_res, chol_A_val)
def test_gpu_cholesky_opt(self):
    # The graph optimizer should lift a CPU cholesky to GpuCholesky.
    A = theano.tensor.matrix("A", dtype="float32")
    fn = theano.function([A], cholesky(A), mode=mode_with_gpu)
    assert any([isinstance(node.op, GpuCholesky)
                for node in fn.maker.fgraph.toposort()])
def test_invalid_input_fail_non_square(self):
# Invalid Cholesky input test with non-square matrix as input.
A_val = np.random.normal(size=(3, 2)).astype("float32")
......@@ -207,6 +218,20 @@ class TestMagma(unittest.TestCase):
if not config.magma.enabled:
self.skipTest('Magma is not enabled, skipping test')
def test_magma_opt_float16(self):
    # Each CPU linalg op applied to a float16 input should be lifted to
    # its magma GPU counterpart (cusolver excluded so magma is chosen).
    ops_to_gpu = [(MatrixInverse(), GpuMagmaMatrixInverse),
                  (SVD(), GpuMagmaSVD),
                  (QRFull(mode='reduced'), GpuMagmaQR),
                  (QRIncomplete(mode='r'), GpuMagmaQR),
                  # TODO: add support for float16 to Eigh numpy
                  # (Eigh(), GpuMagmaEigh),
                  (Cholesky(), GpuMagmaCholesky)]
    for op, gpu_op in ops_to_gpu:
        A = theano.tensor.matrix("A", dtype="float16")
        fn = theano.function([A], op(A), mode=mode_with_gpu.excluding('cusolver'))
        assert any([isinstance(node.op, gpu_op)
                    for node in fn.maker.fgraph.toposort()])
def test_gpu_matrix_inverse(self):
A = theano.tensor.fmatrix("A")
......@@ -216,7 +241,7 @@ class TestMagma(unittest.TestCase):
# Copied from theano.tensor.tests.test_basic.rand.
A_val = test_rng.rand(N, N).astype('float32') * 2 - 1
A_val_inv = fn(A_val)
utt.assert_allclose(np.eye(N), np.dot(A_val_inv, A_val), atol=5e-3)
utt.assert_allclose(np.eye(N), np.dot(A_val_inv, A_val), atol=1e-2)
def test_gpu_matrix_inverse_inplace(self):
N = 1000
......@@ -296,3 +321,132 @@ class TestMagma(unittest.TestCase):
A_val = rand(100, 50).astype('float32')
utt.assert_allclose(f_cpu(A_val), f_gpu(A_val))
def run_gpu_cholesky(self, A_val, lower=True):
    # Build and run a GpuMagmaCholesky graph on A_val (cusolver excluded
    # so the magma op is the one under test).
    A = theano.tensor.fmatrix("A")
    f = theano.function([A], GpuMagmaCholesky(lower=lower)(A),
                        mode=mode_with_gpu.excluding('cusolver'))
    return f(A_val)
def rand_symmetric(self, N):
    # Random symmetric positive-definite float32 matrix of size N x N,
    # with all eigenvalues >= 1.
    A = rand(N, N).astype('float32')
    # ensure that eigenvalues are not too small which sometimes results in
    # magma cholesky failure due to gpu limited numerical precision
    D, W = np.linalg.eigh(A)
    D[D < 1] = 1
    V_m = np.zeros_like(A)
    np.fill_diagonal(V_m, D)
    # eigh reads only one triangle of A, so W is orthogonal and
    # W.T @ diag(D) @ W is symmetric positive definite.
    return np.dot(np.dot(W.T, V_m), W)
def check_cholesky(self, N, lower=True, rtol=None, atol=None):
    # Factorize a random SPD matrix on the GPU and verify L @ L.T == A.
    A = self.rand_symmetric(N)
    L = self.run_gpu_cholesky(A, lower=lower)
    if not lower:
        # Upper factor U satisfies U.T @ U == A; transpose to reuse the check.
        L = L.T
    utt.assert_allclose(np.dot(L, L.T), A, rtol=rtol, atol=atol)
def test_gpu_cholesky(self):
    # Both lower and upper factorizations on a 1000x1000 matrix.
    self.check_cholesky(1000, atol=1e-3)
    self.check_cholesky(1000, lower=False, atol=1e-3)
def test_gpu_cholesky_opt(self):
    # With cusolver excluded, the optimizer must pick GpuMagmaCholesky.
    A = theano.tensor.matrix("A", dtype="float32")
    fn = theano.function([A], cholesky(A), mode=mode_with_gpu.excluding('cusolver'))
    assert any([isinstance(node.op, GpuMagmaCholesky)
                for node in fn.maker.fgraph.toposort()])
def test_gpu_cholesky_inplace(self):
    # The inplace op must overwrite the shared input with its factor.
    A = self.rand_symmetric(1000)
    A_gpu = gpuarray_shared_constructor(A)
    A_copy = A_gpu.get_value()  # keep the original for the reconstruction check
    fn = theano.function([], GpuMagmaCholesky(inplace=True)(A_gpu),
                         mode=mode_with_gpu, accept_inplace=True)
    fn()
    L = A_gpu.get_value()
    utt.assert_allclose(np.dot(L, L.T), A_copy, atol=1e-3)
def test_gpu_cholesky_inplace_opt(self):
    # The inplace optimizer should upgrade a non-inplace GpuMagmaCholesky.
    A = theano.tensor.fmatrix("A")
    fn = theano.function([A], GpuMagmaCholesky()(A), mode=mode_with_gpu)
    assert any([
        node.op.inplace
        for node in fn.maker.fgraph.toposort() if
        isinstance(node.op, GpuMagmaCholesky)
    ])
def run_gpu_qr(self, A_val, complete=True):
    # Build and run a gpu_qr graph on A_val.
    A = theano.tensor.fmatrix("A")
    fn = theano.function([A], gpu_qr(A, complete=complete),
                         mode=mode_with_gpu)
    return fn(A_val)
def check_gpu_qr(self, M, N, complete=True, rtol=None, atol=None):
    # Compare the GPU QR of a random M x N matrix against np.linalg.qr.
    A = rand(M, N).astype('float32')
    if complete:
        Q_gpu, R_gpu = self.run_gpu_qr(A, complete=complete)
    else:
        # Incomplete mode returns only R.
        R_gpu = self.run_gpu_qr(A, complete=complete)
    Q_np, R_np = np.linalg.qr(A, mode='reduced')
    utt.assert_allclose(R_np, R_gpu, rtol=rtol, atol=atol)
    if complete:
        utt.assert_allclose(Q_np, Q_gpu, rtol=rtol, atol=atol)
def test_gpu_qr(self):
    # Tall and wide matrices, complete and R-only variants.
    self.check_gpu_qr(1000, 500, atol=1e-3)
    self.check_gpu_qr(1000, 500, complete=False, atol=1e-3)
    self.check_gpu_qr(500, 1000, atol=1e-3)
    self.check_gpu_qr(500, 1000, complete=False, atol=1e-3)
def test_gpu_qr_opt(self):
    # qr() in default ('reduced') mode should be lifted to a complete
    # GpuMagmaQR node.
    A = theano.tensor.fmatrix("A")
    fn = theano.function([A], qr(A), mode=mode_with_gpu)
    assert any([
        isinstance(node.op, GpuMagmaQR) and node.op.complete
        for node in fn.maker.fgraph.toposort()
    ])
def test_gpu_qr_incomplete_opt(self):
    # qr(..., mode='r') should be lifted to an incomplete GpuMagmaQR node.
    A = theano.tensor.fmatrix("A")
    fn = theano.function([A], qr(A, mode='r'), mode=mode_with_gpu)
    assert any([
        isinstance(node.op, GpuMagmaQR) and not node.op.complete
        for node in fn.maker.fgraph.toposort()
    ])
def run_gpu_eigh(self, A_val, UPLO='L', compute_v=True):
    # Build and run a GpuMagmaEigh graph on A_val.
    A = theano.tensor.fmatrix("A")
    fn = theano.function([A], GpuMagmaEigh(UPLO=UPLO, compute_v=compute_v)(A),
                         mode=mode_with_gpu)
    return fn(A_val)
def check_gpu_eigh(self, N, UPLO='L', compute_v=True, rtol=None, atol=None):
    # Compare the GPU eigendecomposition of a random symmetric matrix
    # against np.linalg.eigh.
    A = rand(N, N).astype('float32')
    A = np.dot(A.T, A)  # symmetric positive semi-definite
    d_np, v_np = np.linalg.eigh(A, UPLO=UPLO)
    if compute_v:
        d_gpu, v_gpu = self.run_gpu_eigh(A, UPLO=UPLO, compute_v=compute_v)
    else:
        d_gpu = self.run_gpu_eigh(A, UPLO=UPLO, compute_v=False)
    utt.assert_allclose(d_np, d_gpu, rtol=rtol, atol=atol)
    if compute_v:
        # Eigenvectors are only defined up to sign, so check orthonormality
        # and the reconstruction A == V diag(d) V.T instead of comparing
        # v_gpu with v_np element-wise.
        utt.assert_allclose(
            np.eye(N), np.dot(v_gpu, v_gpu.T), rtol=rtol, atol=atol)
        D_m = np.zeros_like(A)
        np.fill_diagonal(D_m, d_gpu)
        utt.assert_allclose(
            A, np.dot(np.dot(v_gpu, D_m), v_gpu.T), rtol=rtol, atol=atol)
def test_gpu_eigh(self):
    # Lower/upper triangle, with and without eigenvectors.
    self.check_gpu_eigh(1000, UPLO='L', atol=1e-3)
    self.check_gpu_eigh(1000, UPLO='U', atol=1e-3)
    self.check_gpu_eigh(1000, UPLO='L', compute_v=False, atol=1e-3)
    self.check_gpu_eigh(1000, UPLO='U', compute_v=False, atol=1e-3)
def test_gpu_eigh_opt(self):
    # eigh() should be lifted to a GpuMagmaEigh node.
    A = theano.tensor.fmatrix("A")
    fn = theano.function([A], eigh(A), mode=mode_with_gpu)
    assert any([
        isinstance(node.op, GpuMagmaEigh)
        for node in fn.maker.fgraph.toposort()
    ])
......@@ -539,7 +539,7 @@ class QRIncomplete(Op):
Incomplete QR Decomposition.
Computes the QR decomposition of a matrix.
Factor the matrix a as qr and return a single matrix.
Factor the matrix a as qr and return a single matrix R.
"""
......@@ -552,15 +552,14 @@ class QRIncomplete(Op):
def make_node(self, x):
x = as_tensor_variable(x)
assert x.ndim == 2, "The input of qr function should be a matrix."
q = theano.tensor.matrix(dtype=x.dtype)
return Apply(self, [x], [q])
r = theano.tensor.matrix(dtype=x.dtype)
return Apply(self, [x], [r])
def perform(self, node, inputs, outputs):
(x,) = inputs
(q,) = outputs
(r,) = outputs
assert x.ndim == 2, "The input of qr function should be a matrix."
q[0] = self._numop(x,
self.mode)
r[0] = self._numop(x, self.mode)
def qr(a, mode="reduced"):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论