提交 e76d05d6 authored 作者: abergeron's avatar abergeron 提交者: GitHub

Merge pull request #5914 from aam-at/magma_ops

Magma gpu QR/Cholesky/Eigh
...@@ -13,7 +13,8 @@ from theano.scalar import bool as bool_t ...@@ -13,7 +13,8 @@ from theano.scalar import bool as bool_t
from theano.gof import COp, ParamsType from theano.gof import COp, ParamsType
from theano.gpuarray import GpuArrayType from theano.gpuarray import GpuArrayType
from .basic_ops import as_gpuarray_variable, gpu_contiguous, infer_context_name from .basic_ops import (CGpuKernelBase, as_gpuarray_variable, gpu_contiguous,
infer_context_name)
from .type import gpu_context_type from .type import gpu_context_type
try: try:
...@@ -257,6 +258,9 @@ class GpuCholesky(Op): ...@@ -257,6 +258,9 @@ class GpuCholesky(Op):
self.destroy_map = {0: [0]} self.destroy_map = {0: [0]}
super(GpuCholesky, self).__init__() super(GpuCholesky, self).__init__()
def clone_inplace(self):
    """Return an equivalent op that is allowed to overwrite its input."""
    return type(self)(lower=self.lower, inplace=True)
def make_node(self, inp): def make_node(self, inp):
if not cusolver_available: if not cusolver_available:
raise RuntimeError('CUSOLVER is not available and ' raise RuntimeError('CUSOLVER is not available and '
...@@ -349,7 +353,39 @@ def gpu_cholesky(A, lower=True): ...@@ -349,7 +353,39 @@ def gpu_cholesky(A, lower=True):
return GpuCholesky(lower)(A) return GpuCholesky(lower)(A)
class GpuMagmaSVD(COp): # TODO: add support for float64
class GpuMagmaBase(COp):
    """Base class for MAGMA-backed ops.

    Centralizes the compiler configuration shared by every MAGMA op
    (headers, header search paths, libraries, library search paths) and
    makes sure the MAGMA runtime is initialized once per GPU context
    before a node is compiled.
    """

    def c_headers(self):
        return ['gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/ext_cuda.h',
                'gpuarray_helper.h', 'magma.h']

    def c_header_dirs(self):
        # This module's own directory provides gpuarray_helper.h.
        result = [os.path.dirname(__file__), pygpu.get_include()]
        if config.magma.include_path:
            result.append(config.magma.include_path)
        return result

    def c_libraries(self):
        return ['magma']

    def c_lib_dirs(self):
        return [config.magma.library_path] if config.magma.library_path else []

    def prepare_node(self, node, storage_map, compute_map, impl):
        from skcuda.magma import magma_init
        ctx = node.inputs[0].type.context
        # magma_init() must run exactly once per context; tag the context
        # object so repeated compilations skip re-initialization.
        if not getattr(ctx, 'is_magma_initialized', False):
            with ctx:
                magma_init()
            ctx.is_magma_initialized = True
class GpuMagmaSVD(GpuMagmaBase):
"""Computes the svd of a matrix :math:`A` using magma library. """Computes the svd of a matrix :math:`A` using magma library.
.. warning:: .. warning::
...@@ -370,30 +406,14 @@ class GpuMagmaSVD(COp): ...@@ -370,30 +406,14 @@ class GpuMagmaSVD(COp):
self.compute_uv = compute_uv self.compute_uv = compute_uv
COp.__init__(self, ['magma_svd.c'], 'APPLY_SPECIFIC(magma_svd)') COp.__init__(self, ['magma_svd.c'], 'APPLY_SPECIFIC(magma_svd)')
def c_headers(self):
return ['gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/ext_cuda.h',
'gpuarray_helper.h', 'magma.h']
def c_header_dirs(self):
dirs = [os.path.dirname(__file__), pygpu.get_include()]
if config.magma.include_path:
dirs.append(config.magma.include_path)
return dirs
def c_libraries(self):
return ['magma']
def c_lib_dirs(self):
if config.magma.library_path:
return [config.magma.library_path]
return []
def make_node(self, A): def make_node(self, A):
ctx_name = infer_context_name(A) ctx_name = infer_context_name(A)
A = as_gpuarray_variable(A, ctx_name) A = as_gpuarray_variable(A, ctx_name)
A = gpu_contiguous(A)
if A.ndim != 2: if A.ndim != 2:
raise LinAlgError("Matrix rank error") raise LinAlgError("Matrix rank error")
assert A.dtype == 'float32' if A.dtype != 'float32':
raise TypeError("only `float32` is supported for now")
if self.compute_uv: if self.compute_uv:
return theano.Apply(self, [A], return theano.Apply(self, [A],
# return S, U, VT # return S, U, VT
...@@ -408,6 +428,7 @@ class GpuMagmaSVD(COp): ...@@ -408,6 +428,7 @@ class GpuMagmaSVD(COp):
context_name=ctx_name)()]) context_name=ctx_name)()])
def prepare_node(self, node, storage_map, compute_map, impl): def prepare_node(self, node, storage_map, compute_map, impl):
super(GpuMagmaSVD, self).prepare_node(node, storage_map, compute_map, impl)
# Check node to prevent eventual errors with old pickled nodes. # Check node to prevent eventual errors with old pickled nodes.
if self.compute_uv: if self.compute_uv:
A, B, C = node.outputs A, B, C = node.outputs
...@@ -459,7 +480,7 @@ def gpu_svd(a, full_matrices=1, compute_uv=1): ...@@ -459,7 +480,7 @@ def gpu_svd(a, full_matrices=1, compute_uv=1):
return out return out
class GpuMagmaMatrixInverse(COp): class GpuMagmaMatrixInverse(GpuMagmaBase):
"""Computes the inverse of a matrix :math:`A` using magma library. """Computes the inverse of a matrix :math:`A` using magma library.
""" """
__props__ = ('inplace', ) __props__ = ('inplace', )
...@@ -472,34 +493,18 @@ class GpuMagmaMatrixInverse(COp): ...@@ -472,34 +493,18 @@ class GpuMagmaMatrixInverse(COp):
if self.inplace: if self.inplace:
self.destroy_map = {0: [0]} self.destroy_map = {0: [0]}
def c_headers(self):
return ['gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/ext_cuda.h',
'gpuarray_helper.h', 'magma.h']
def c_header_dirs(self):
dirs = [os.path.dirname(__file__), pygpu.get_include()]
if config.magma.include_path:
dirs.append(config.magma.include_path)
return dirs
def c_libraries(self):
return ['magma']
def c_lib_dirs(self):
if config.magma.library_path:
return [config.magma.library_path]
return []
def clone_inplace(self): def clone_inplace(self):
return self.__class__(inplace=True) return self.__class__(inplace=True)
def make_node(self, x): def make_node(self, A):
ctx_name = infer_context_name(x) ctx_name = infer_context_name(A)
x = as_gpuarray_variable(x, ctx_name) A = as_gpuarray_variable(A, ctx_name)
assert x.dtype == 'float32' A = gpu_contiguous(A)
if x.ndim != 2: if A.ndim != 2:
raise LinAlgError("Matrix rank error") raise LinAlgError("Matrix rank error")
return theano.Apply(self, [x], [x.type()]) if A.dtype != 'float32':
raise TypeError("only `float32` is supported for now")
return theano.Apply(self, [A], [A.type()])
def get_params(self, node): def get_params(self, node):
return self.params_type.get_params(self, context=node.inputs[0].type.context) return self.params_type.get_params(self, context=node.inputs[0].type.context)
...@@ -518,3 +523,153 @@ def gpu_matrix_inverse(a): ...@@ -518,3 +523,153 @@ def gpu_matrix_inverse(a):
""" """
return GpuMagmaMatrixInverse()(a) return GpuMagmaMatrixInverse()(a)
class GpuMagmaCholesky(GpuMagmaBase, CGpuKernelBase):
    """Cholesky factorization of a symmetric positive-definite matrix
    :math:`A`, computed on the GPU with the MAGMA library.
    """
    __props__ = ('lower', 'inplace')
    check_input = False
    params_type = ParamsType(lower=bool_t, inplace=bool_t, context=gpu_context_type)

    def __init__(self, lower=True, inplace=False):
        self.lower = lower
        COp.__init__(self, ['magma_cholesky.c'], 'APPLY_SPECIFIC(magma_cholesky)')
        self.inplace = inplace
        if self.inplace:
            # Output 0 overwrites input 0.
            self.destroy_map = {0: [0]}

    def clone_inplace(self):
        """Return an equivalent op that overwrites its input."""
        return type(self)(lower=self.lower, inplace=True)

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = gpu_contiguous(as_gpuarray_variable(A, ctx_name))
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != 'float32':
            raise TypeError("only `float32` is supported for now")
        return theano.Apply(self, [A], [A.type()])

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)

    def infer_shape(self, node, shapes):
        # The factor has the same shape as the input matrix.
        return [shapes[0]]
class GpuMagmaQR(GpuMagmaBase, CGpuKernelBase):
    """QR decomposition of a matrix :math:`A`, computed on the GPU with
    the MAGMA library.

    Parameters
    ----------
    complete : If `False`, returns only r.

    .. warning::

        Because of implementation constraints, this Op returns outputs
        in order ``R, Q``. Use :func:`theano.gpuarray.linalg.gpu_qr`
        to get them in expected order ``Q, R``.
    """
    __props__ = ('complete', )
    _cop_num_inputs = 1
    _cop_num_outputs = 2
    check_input = False
    params_type = ParamsType(complete=bool_t, context=gpu_context_type)

    def __init__(self, complete=True):
        self.complete = complete
        COp.__init__(self, ['magma_qr.c'], 'APPLY_SPECIFIC(magma_qr)')

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = gpu_contiguous(as_gpuarray_variable(A, ctx_name))
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != 'float32':
            raise TypeError("only `float32` is supported for now")
        # Outputs come back R-first (see class docstring).
        out_types = [A.type(), A.type()] if self.complete else [A.type()]
        return theano.Apply(self, [A], out_types)

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)
def gpu_qr(a, complete=True):
    """
    This function performs the QR on GPU.

    Parameters
    ----------
    complete : bool, optional
        If `False`, returns only r.

    Returns
    -------
    Q, R : matrices
    """
    result = GpuMagmaQR(complete)(a)
    if not complete:
        # Single output: R only.
        return result
    # The op yields (R, Q); swap into the conventional (Q, R) order.
    r, q = result
    return [q, r]
class GpuMagmaEigh(GpuMagmaBase):
    """Eigen decomposition of a symmetric matrix :math:`A`, computed on
    the GPU with the MAGMA library.

    Parameters
    ----------
    UPLO : Specifies whether the calculation is done with the lower triangular
           part of matrix (`L`, default) or the upper triangular part (`U`).
    compute_v : If `True`, computes eigenvalues and eigenvectors (`True`,
                default). If `False`, computes only eigenvalues of matrix.
    """
    __props__ = ('lower', 'compute_v')
    _cop_num_inputs = 1
    _cop_num_outputs = 2
    check_input = False
    params_type = ParamsType(lower=bool_t, compute_v=bool_t,
                             context=gpu_context_type)

    def __init__(self, UPLO='L', compute_v=True):
        assert UPLO in ['L', 'U']
        self.lower = UPLO == 'L'
        self.compute_v = compute_v
        COp.__init__(self, ['magma_eigh.c'], 'APPLY_SPECIFIC(magma_eigh)')

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = gpu_contiguous(as_gpuarray_variable(A, ctx_name))
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if A.dtype != 'float32':
            raise TypeError("only `float32` is supported for now")
        # Eigenvalues are returned as a vector; eigenvectors (when
        # requested) share the input matrix type.
        eigval_type = GpuArrayType(A.dtype, broadcastable=[False],
                                   context_name=ctx_name)
        if self.compute_v:
            # return D, V
            return theano.Apply(self, [A], [eigval_type(), A.type()])
        # return D
        return theano.Apply(self, [A], [eigval_type()])

    def get_params(self, node):
        return self.params_type.get_params(self, context=node.inputs[0].type.context)
#section kernels

#kernel tril_kernel : size, size, size, *:
/* Keep only the lower triangle of the flat row-major matrix `a`
   (`nthreads` elements, `ncols` columns): zero every element strictly
   above the diagonal.  `a_off` is the byte offset of the array data. */
KERNEL void tril_kernel(const ga_size nthreads, const ga_size ncols,
                        const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
  a = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)a) + a_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0; index < nthreads;
       index += LDIM_0 * GDIM_0) {
    unsigned int ix = index / ncols;  /* row */
    unsigned int iy = index % ncols;  /* column */
    if (ix < iy) {
      a[index] = 0.0;
    }
  }
}

#kernel triu_kernel : size, size, size, *:
/* Keep only the upper triangle of the flat row-major matrix `a`:
   zero every element strictly below the diagonal. */
KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols,
                        const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
  a = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)a) + a_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0; index < nthreads;
       index += LDIM_0 * GDIM_0) {
    unsigned int ix = index / ncols;  /* row */
    unsigned int iy = index % ncols;  /* column */
    if (ix > iy) {
      a[index] = 0.0;
    }
  }
}
#section init_code
setup_ext_cuda();
#section support_code_struct
/* Cholesky factorization of the square float32 matrix `A` using
   magma_spotrf_gpu.  The factor is written into *L (aliasing A when
   params->inplace) and the unused triangle is zeroed with the
   tril/triu kernels.  Returns 0 on success, -1 on failure with a
   Python exception set. */
int APPLY_SPECIFIC(magma_cholesky)(PyGpuArrayObject *A, PyGpuArrayObject **L,
                                   PARAMS_TYPE *params) {
  const size_t *dims;
  size_t N, n2;
  magma_uplo_t ul;
  int res = -1, info;

  if (A->ga.typecode != GA_FLOAT) {
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaCholesky: unsupported data type");
    return -1;
  }

  // This is early to match the exit() in the fail label.
  cuda_enter(params->context->ctx);

  if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaCholesky: requires data to be C-contiguous");
    goto fail;
  }
  if (PyGpuArray_NDIM(A) != 2) {
    PyErr_SetString(PyExc_ValueError, "GpuMagmaCholesky: matrix rank error");
    goto fail;
  }
  dims = PyGpuArray_DIMS(A);
  if (dims[0] != dims[1]) {
    PyErr_SetString(PyExc_ValueError, "GpuMagmaCholesky: matrix is not square");
    goto fail;
  }

  if (params->inplace) {
    /* Factor directly into the input buffer. */
    Py_XDECREF(*L);
    *L = A;
    Py_INCREF(*L);
  } else {
    *L = theano_try_copy(*L, A);
    if (*L == NULL) {
      PyErr_SetString(
          PyExc_RuntimeError,
          "GpuMagmaCholesky: failed to allocate memory for the output");
      goto fail;
    }
  }

  // magma matrix cholesky
  N = dims[0];
  n2 = N * N;
  // Magma requires column-major order for the matrix A. Instead of changing
  // matrix order which requires copying data, we can compute cholesky
  // decomposition where we change parameters lower to upper and upper to
  // lower.
  if (params->lower) {
    ul = MagmaUpper;
  }
  else {
    ul = MagmaLower;
  }
  magma_spotrf_gpu(ul, N, (float *)PyGpuArray_DEV_DATA(*L), N, &info);
  if (info > 0) {
    PyErr_Format(PyExc_RuntimeError, "GpuMagmaCholesky: the leading minor of "
                                     "order %d is not positive definite",
                 info);
    goto fail;
  } else if (info < 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaCholesky: magma_spotrf_gpu argument %d has an illegal value",
        -info);
    goto fail;
  }
  /* spotrf leaves garbage in the other triangle; zero it so the output
     is a clean triangular factor in row-major layout. */
  if (params->lower) {
    res = tril_kernel_scall(1, &n2, 0, n2, N, (*L)->ga.offset, (*L)->ga.data);
    if (res != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "GpuMagmaCholesky: tril_kernel %s.",
                   GpuKernel_error(&k_tril_kernel, res));
      goto fail;
    }
  } else {
    res = triu_kernel_scall(1, &n2, 0, n2, N, (*L)->ga.offset, (*L)->ga.data);
    if (res != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "GpuMagmaCholesky: triu_kernel %s.",
                   GpuKernel_error(&k_triu_kernel, res));
      goto fail;
    }
  }
  res = 0;
fail:
  cuda_exit(params->context->ctx);
  return res;
}
#section init_code
setup_ext_cuda();
#section support_code_struct
/* Eigendecomposition of the symmetric float32 matrix `A_` using
   magma_ssyevd_gpu.  Eigenvalues are written to *D; eigenvectors to *V
   when params->compute_v.  Returns 0 on success, -1 on failure with a
   Python exception set. */
int APPLY_SPECIFIC(magma_eigh)(PyGpuArrayObject *A_,
                               PyGpuArrayObject **D,
                               PyGpuArrayObject **V, // may be NULL
                               PARAMS_TYPE *params) {
  PyGpuArrayObject *A = NULL;
  magma_int_t N, liwork, *iwork_data = NULL;
  size_t d_dims[1], v_dims[2];
  magma_uplo_t uplo;
  magma_vec_t jobz;
  float *w_data = NULL, *wA_data = NULL, *work_data = NULL, lwork;
  int res = -1, info;

  if (A_->ga.typecode != GA_FLOAT) {
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaEigh: Unsupported data type");
    return -1;
  }

  // This is early to match the exit() in the fail label.
  cuda_enter(params->context->ctx);

  if (!GpuArray_IS_C_CONTIGUOUS(&A_->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaEigh: requires data to be C-contiguous");
    goto fail;
  }
  if (PyGpuArray_NDIM(A_) != 2) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaEigh: matrix rank error");
    goto fail;
  }
  if (PyGpuArray_DIM(A_, 0) != PyGpuArray_DIM(A_, 1)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaEigh: matrix is not square");
    goto fail;
  }

  // MAGMA expects column-major data, so work on a Fortran-order copy.
  A = pygpu_copy(A_, GA_F_ORDER);
  if (A == NULL) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to change to column-major order");
    // Fixed: was `return -1`, which skipped cuda_exit() on this error
    // path and left the context entered.
    goto fail;
  }

  // magma matrix eigen decomposition of a symmetric matrix
  N = PyGpuArray_DIM(A, 0);
  if (params->lower) {
    uplo = MagmaLower;
  } else {
    uplo = MagmaUpper;
  }
  if (params->compute_v) {
    jobz = MagmaVec;
  } else {
    jobz = MagmaNoVec;
  }

  /* Host-side pinned buffers for eigenvalues and workspace. */
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&w_data, N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&wA_data, N * N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }

  // query for workspace size
  magma_ssyevd_gpu(jobz, uplo, N, NULL, N, NULL, NULL, N, &lwork,
                   -1, &liwork, -1, &info);

  if (MAGMA_SUCCESS != magma_smalloc_pinned(&work_data, (size_t)lwork)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }
  if (MAGMA_SUCCESS != magma_imalloc_cpu(&iwork_data, liwork)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }

  magma_ssyevd_gpu(jobz, uplo, N, (float *)PyGpuArray_DEV_DATA(A), N, w_data,
                   wA_data, N, work_data, (size_t)lwork, iwork_data, liwork,
                   &info);
  if (info > 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaEigh: %d off-diagonal elements of an intermediate "
        "tridiagonal form did not converge to zero",
        info);
    goto fail;
  } else if (info < 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaEigh: magma_ssyevd_gpu argument %d has an illegal value", -info);
    goto fail;
  }

  d_dims[0] = N;
  if (theano_prep_output(D, 1, d_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate memory for the output");
    goto fail;
  }
  // w_data lives in pinned *host* memory (magma_smalloc_pinned), so the
  // copy into the device output is host-to-device.  (Fixed: the original
  // passed cudaMemcpyDeviceToDevice, the wrong direction flag for a host
  // source pointer.)
  cudaMemcpy(PyGpuArray_DEV_DATA(*D), w_data, N * sizeof(float),
             cudaMemcpyHostToDevice);

  if (params->compute_v) {
    /* A now holds the eigenvectors computed in place by ssyevd. */
    *V = theano_try_copy(*V, A);
    if (*V == NULL) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaEigh: failed to allocate memory for the output");
      goto fail;
    }
  }
  res = 0;
fail:
  if (w_data != NULL)
    magma_free_pinned(w_data);
  if (wA_data != NULL)
    magma_free_pinned(wA_data);
  if (work_data != NULL)
    magma_free_pinned(work_data);
  if (iwork_data != NULL)
    magma_free_cpu(iwork_data);
  Py_XDECREF(A);
  cuda_exit(params->context->ctx);
  return res;
}
...@@ -5,7 +5,7 @@ setup_ext_cuda(); ...@@ -5,7 +5,7 @@ setup_ext_cuda();
#section support_code_struct #section support_code_struct
int APPLY_SPECIFIC(magma_inv)(PyGpuArrayObject *A, PyGpuArrayObject **A_inv, int APPLY_SPECIFIC(magma_inv)(PyGpuArrayObject *A, PyGpuArrayObject **A_inv,
PARAMS_TYPE* params) { PARAMS_TYPE *params) {
const size_t *dims; const size_t *dims;
magma_int_t N, ldwork, info; magma_int_t N, ldwork, info;
magma_int_t *piv = NULL; magma_int_t *piv = NULL;
...@@ -20,12 +20,11 @@ int APPLY_SPECIFIC(magma_inv)(PyGpuArrayObject *A, PyGpuArrayObject **A_inv, ...@@ -20,12 +20,11 @@ int APPLY_SPECIFIC(magma_inv)(PyGpuArrayObject *A, PyGpuArrayObject **A_inv,
// This is early to match the exit() in the fail label. // This is early to match the exit() in the fail label.
cuda_enter(params->context->ctx); cuda_enter(params->context->ctx);
magma_init();
if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) { if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"GpuMagmaMatrixInverse: requires data to be C-contiguous"); "GpuMagmaMatrixInverse: requires data to be C-contiguous");
return 1; goto fail;
} }
if (PyGpuArray_NDIM(A) != 2) { if (PyGpuArray_NDIM(A) != 2) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
...@@ -93,7 +92,6 @@ fail: ...@@ -93,7 +92,6 @@ fail:
magma_free(piv); magma_free(piv);
if (dwork != NULL) if (dwork != NULL)
gpudata_release(dwork); gpudata_release(dwork);
magma_finalize();
cuda_exit(params->context->ctx); cuda_exit(params->context->ctx);
return res; return res;
} }
#section kernels

#kernel triu_kernel : size, size, size, *:
/* Keep only the upper triangle of the flat row-major matrix `a`
   (`nthreads` elements, `ncols` columns): zero every element strictly
   below the diagonal.  Used to clean up the R factor after geqrf. */
KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols,
                        const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
  a = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)a) + a_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0; index < nthreads;
       index += LDIM_0 * GDIM_0) {
    const ga_size ix = index / ncols;  /* row */
    const ga_size iy = index % ncols;  /* column */
    if (ix > iy) {
      a[index] = 0.0;
    }
  }
}
#section init_code
setup_ext_cuda();
#section support_code
/* Return a C-contiguous copy of `src` with dimension `dim` shrunk to
   `size` — a cheap way to take the leading slice of a MAGMA result.
   Returns a new reference, or NULL on failure (with the Python error
   presumably set by pygpu — confirm pygpu_view/pygpu_copy semantics). */
static PyGpuArrayObject *pygpu_narrow(PyGpuArrayObject *src, size_t dim,
                                      size_t size) {
  PyGpuArrayObject *narrowed;
  PyGpuArrayObject *src_view = pygpu_view(src, Py_None);
  if (src_view == NULL)
    return NULL;
  /* Shrink the view in place; fix_flags recomputes contiguity flags to
     match the new shape. */
  src_view->ga.dimensions[dim] = size;
  GpuArray_fix_flags(&src_view->ga);
  narrowed = pygpu_copy(src_view, GA_C_ORDER);
  /* Fixed: the temporary view was previously leaked. */
  Py_DECREF(src_view);
  return narrowed;
}
#section support_code_struct
/* QR decomposition of the float32 matrix `A_` using MAGMA.
   Always computes R (into *R); also computes Q (into *Q) when
   params->complete.  Returns 0 on success, -1 on failure with a Python
   exception set. */
int APPLY_SPECIFIC(magma_qr)(PyGpuArrayObject *A_,
                             PyGpuArrayObject **R,
                             PyGpuArrayObject **Q, // may be NULL
                             PARAMS_TYPE* params) {
  PyGpuArrayObject *A = NULL;
  magma_int_t M, N, K, nb, ldwork;
  size_t n2;
  float *tau_data = NULL;
  gpudata *work_data = NULL;
  int res = -1, info;

  if (A_->ga.typecode != GA_FLOAT) {
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaQR: Unsupported data type");
    return -1;
  }

  // This is early to match the exit() in the fail label.
  cuda_enter(params->context->ctx);

  // Validate the input before making any copy.  (Fixed: `A` used to be
  // aliased to `A_` here, so the Py_XDECREF(A) in the fail label could
  // drop a reference we never owned when validation failed.)
  if (!GpuArray_IS_C_CONTIGUOUS(&A_->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaQR: requires data to be C-contiguous");
    goto fail;
  }
  if (PyGpuArray_NDIM(A_) != 2) {
    PyErr_SetString(PyExc_ValueError, "GpuMagmaQR: matrix rank error");
    goto fail;
  }

  // MAGMA expects column-major data, so work on a Fortran-order copy.
  A = pygpu_copy(A_, GA_F_ORDER);
  if (A == NULL) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaQR: failed to change to column-major order");
    goto fail;
  }

  // magma matrix qr
  M = PyGpuArray_DIM(A, 0);
  N = PyGpuArray_DIM(A, 1);
  K = M < N ? M : N;

  /* Householder scalars (pinned host memory). */
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&tau_data, N * N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaQR: failed to allocate working memory");
    goto fail;
  }

  nb = magma_get_sgeqrf_nb(M, N);
  ldwork = (2 * K + magma_roundup(N, 32)) * nb;
  work_data = gpudata_alloc(params->context->ctx, ldwork * sizeof(float), NULL, 0, NULL);
  if (work_data == NULL) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaQR: failed to allocate working memory");
    goto fail;
  }

  // compute R
  magma_sgeqrf2_gpu(M, N, (float *)PyGpuArray_DEV_DATA(A), M, tau_data, &info);
  if (info != 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaQR: magma_sgeqrf2_gpu argument %d has an illegal value", -info);
    goto fail;
  }
  /* R is the first K rows of the factored matrix. */
  *R = pygpu_narrow(A, 0, K);
  if (*R == NULL) {
    PyErr_SetString(PyExc_RuntimeError, "GpuMagmaQR: failed to narrow array");
    goto fail;
  }
  /* Zero the sub-diagonal entries left over from the factorization. */
  n2 = K * N;
  res = triu_kernel_scall(1, &n2, 0, n2, N, (*R)->ga.offset, (*R)->ga.data);
  if (res != GA_NO_ERROR) {
    PyErr_Format(PyExc_RuntimeError, "GpuMagmaQR: triu_kernel %s.",
                 GpuKernel_error(&k_triu_kernel, res));
    goto fail;
  }

  if (params->complete) {
    // compute Q from a fresh factorization (sgeqrf_gpu stores the
    // blocked reflectors needed by sorgqr_gpu).
    Py_XDECREF(A);
    A = pygpu_copy(A_, GA_F_ORDER);
    if (A == NULL) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaQR: failed to change to column-major order");
      // Fixed: was `return -1`, which skipped cuda_exit() and leaked
      // tau_data / work_data.
      goto fail;
    }
    magma_sgeqrf_gpu(M, N, (float *)PyGpuArray_DEV_DATA(A), M, tau_data,
                     *(float **)work_data, &info);
    if (info != 0) {
      PyErr_Format(
          PyExc_RuntimeError,
          "GpuMagmaQR: magma_sgeqrf_gpu argument %d has an illegal value", -info);
      goto fail;
    }
    magma_sorgqr_gpu(M, K, K, (float *)PyGpuArray_DEV_DATA(A), M, tau_data,
                     *(float **)work_data, nb, &info);
    if (info != 0) {
      PyErr_Format(
          PyExc_RuntimeError,
          "GpuMagmaQR: magma_sorgqr_gpu argument %d has an illegal value", -info);
      goto fail;
    }
    /* Q is the first K columns. */
    *Q = pygpu_narrow(A, 1, K);
    if (*Q == NULL) {
      PyErr_SetString(PyExc_RuntimeError, "GpuMagmaQR: failed to narrow array");
      goto fail;
    }
  }
  res = 0;
fail:
  if (tau_data != NULL)
    magma_free_pinned(tau_data);
  if (work_data != NULL)
    gpudata_release(work_data);
  Py_XDECREF(A);
  cuda_exit(params->context->ctx);
  return res;
}
...@@ -8,7 +8,7 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A, ...@@ -8,7 +8,7 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
PyGpuArrayObject **S, PyGpuArrayObject **S,
PyGpuArrayObject **U, // may be NULL PyGpuArrayObject **U, // may be NULL
PyGpuArrayObject **VT, // may be NULL PyGpuArrayObject **VT, // may be NULL
PARAMS_TYPE* params) { PARAMS_TYPE *params) {
bool compute_uv = (U != NULL); bool compute_uv = (U != NULL);
magma_int_t *iwork = NULL, iunused[1]; magma_int_t *iwork = NULL, iunused[1];
magma_int_t M, N, K, ldu, ldv, M_U, N_VT, info; magma_int_t M, N, K, ldu, ldv, M_U, N_VT, info;
...@@ -21,22 +21,21 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A, ...@@ -21,22 +21,21 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
if (A->ga.typecode != GA_FLOAT) { if (A->ga.typecode != GA_FLOAT) {
PyErr_SetString(PyExc_TypeError, PyErr_SetString(PyExc_TypeError,
"GpuMagmaMatrixInverse: Unsupported data type"); "GpuMagmaSVD: Unsupported data type");
return -1; return -1;
} }
// This is early to match the exit() in the fail label. // This is early to match the exit() in the fail label.
cuda_enter(params->context->ctx); cuda_enter(params->context->ctx);
magma_init();
if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) { if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"GpuMagmaMatrixInverse: requires data to be C-contiguous"); "GpuMagmaSVD: requires data to be C-contiguous");
return 1; goto fail;
} }
if (PyGpuArray_NDIM(A) != 2) { if (PyGpuArray_NDIM(A) != 2) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"GpuMagmaMatrixInverse: matrix rank error"); "GpuMagmaSVD: matrix rank error");
goto fail; goto fail;
} }
...@@ -44,7 +43,7 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A, ...@@ -44,7 +43,7 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
// reverse dimensions because MAGMA expects column-major matrices: // reverse dimensions because MAGMA expects column-major matrices:
M = PyGpuArray_DIM(A, 1); M = PyGpuArray_DIM(A, 1);
N = PyGpuArray_DIM(A, 0); N = PyGpuArray_DIM(A, 0);
K = std::min(M, N); K = M < N ? M : N;
if (MAGMA_SUCCESS != magma_smalloc_pinned(&a_data, M * N)) { if (MAGMA_SUCCESS != magma_smalloc_pinned(&a_data, M * N)) {
PyErr_SetString(PyExc_RuntimeError, PyErr_SetString(PyExc_RuntimeError,
...@@ -166,7 +165,6 @@ fail: ...@@ -166,7 +165,6 @@ fail:
magma_free_pinned(work); magma_free_pinned(work);
if (iwork != NULL) if (iwork != NULL)
magma_free_cpu(iwork); magma_free_cpu(iwork);
magma_finalize();
cuda_exit(params->context->ctx); cuda_exit(params->context->ctx);
return res; return res;
} }
...@@ -74,7 +74,8 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor, ...@@ -74,7 +74,8 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
from .reduction import GpuMaxAndArgmax from .reduction import GpuMaxAndArgmax
from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky, from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky,
cusolver_available, GpuMagmaMatrixInverse, gpu_svd) cusolver_available, GpuMagmaMatrixInverse, gpu_svd,
GpuMagmaCholesky, gpu_qr, GpuMagmaEigh)
_logger = logging.getLogger("theano.gpuarray.opt") _logger = logging.getLogger("theano.gpuarray.opt")
...@@ -2112,9 +2113,6 @@ def local_inplace_gpu_solve(node): ...@@ -2112,9 +2113,6 @@ def local_inplace_gpu_solve(node):
# Cholesky decomposition # Cholesky decomposition
@register_opt('fast_compile')
@op_lifter([slinalg.Cholesky])
@register_opt2([theano.tensor.slinalg.Cholesky], 'fast_compile')
def local_gpu_cholesky(op, context_name, inputs, outputs): def local_gpu_cholesky(op, context_name, inputs, outputs):
if not cusolver_available: if not cusolver_available:
return return
...@@ -2125,19 +2123,99 @@ def local_gpu_cholesky(op, context_name, inputs, outputs): ...@@ -2125,19 +2123,99 @@ def local_gpu_cholesky(op, context_name, inputs, outputs):
return op(inputs[0].astype('float32')).astype('float16') return op(inputs[0].astype('float32')).astype('float16')
return op return op
# Local optimization databases grouping the CPU->GPU lifters for matrix
# ops, so their relative priority can be controlled via `position`.
matrix_ops_db = LocalGroupDB()
matrix_ops_db2 = LocalGroupDB(local_opt=theano.gof.opt.GraphToGPULocalOptGroup)
matrix_ops_db2.__name__ = "matrix_ops_db2"

# For Cholesky decomposition, magma 2.2 is slower than cusolver 8 (tested for
# matrices of size 1000). Thus, cusolver is prioritized during graph
# optimizations. To explicitly use magma, you should disable cusolver using
# `optimizer_excluding=cusolver` in Theano config.
lifter = op_lifter([slinalg.Cholesky])(local_gpu_cholesky)
matrix_ops_db.register("local_gpu_cholesky", lifter,
                       'gpuarray', 'fast_compile', 'fast_run', 'cusolver',
                       position=0)
matrix_ops_db2.register("local_gpu_cholesky",
                        local_optimizer([slinalg.Cholesky])(local_gpu_cholesky),
                        'gpuarray', 'fast_compile', 'fast_run', 'cusolver',
                        position=0)
register_opt('fast_compile', name='matrix_ops_db')(matrix_ops_db)
# NOTE(review): the tracker list below is [slinalg.Solve] although the DB
# only holds Cholesky lifters — confirm this is intended and not meant to
# be [slinalg.Cholesky].
register_opt2([slinalg.Solve], 'fast_compile', name='matrix_ops_db2')(matrix_ops_db2)
@register_inplace() @register_inplace()
@local_optimizer([GpuCholesky], inplace=True) @local_optimizer([GpuCholesky], inplace=True)
def local_inplace_cholesky(node): def local_inplace_gpu_cholesky(node):
if isinstance(node.op, GpuCholesky) and not node.op.inplace: if isinstance(node.op, GpuCholesky) and not node.op.inplace:
return [GpuCholesky(lower=node.op.lower, inplace=True)(*node.inputs)] return [node.op.clone_inplace()(*node.inputs)]
def local_gpu_magma_cholesky(op, context_name, inputs, outputs):
    """Lift a CPU Cholesky to the MAGMA GPU op (float16/float32 only)."""
    if not config.magma.enabled:
        return
    x = inputs[0]
    if x.dtype not in ['float16', 'float32']:
        return
    gpu_op = GpuMagmaCholesky(lower=op.lower, inplace=op.destructive)
    if x.dtype == 'float16':
        # Compute in float32 and cast back.
        return gpu_op(x.astype('float32')).astype('float16')
    return gpu_op
# Register the magma Cholesky lifter after the cusolver one (position=1):
# cusolver is preferred when both backends are available.
lifter = op_lifter([slinalg.Cholesky])(local_gpu_magma_cholesky)
matrix_ops_db.register("local_gpu_magma_cholesky", lifter,
                       'gpuarray', 'fast_compile', 'fast_run', 'magma',
                       position=1)
matrix_ops_db2.register("local_gpu_magma_cholesky",
                        local_optimizer([slinalg.Cholesky])(local_gpu_magma_cholesky),
                        'gpuarray', 'fast_compile', 'fast_run', 'magma',
                        position=1)
@register_inplace()
@local_optimizer([GpuMagmaCholesky], inplace=True)
def local_inplace_gpu_magma_cholesky(node):
    """Replace a non-inplace GpuMagmaCholesky with its inplace variant."""
    op = node.op
    if isinstance(op, GpuMagmaCholesky) and not op.inplace:
        return [op.clone_inplace()(*node.inputs)]
# QR decomposition
@register_opt('magma', 'fast_compile')
@op_lifter([nlinalg.QRFull])
@register_opt2([theano.tensor.nlinalg.QRFull], 'magma', 'fast_compile')
def local_gpu_magma_qr(op, context_name, inputs, outputs):
    """Lift a full (reduced-mode) CPU QR to the MAGMA GPU op."""
    if not config.magma.enabled or op.mode != 'reduced':
        return
    A = inputs[0]
    if A.dtype not in ['float16', 'float32']:
        return
    needs_cast = A.dtype == 'float16'
    # MAGMA only handles float32; cast float16 up and back down.
    x = A.astype('float32') if needs_cast else A
    out = gpu_qr(x, complete=True)
    if needs_cast:
        return [o.astype('float16') for o in out]
    return out
@register_opt('magma', 'fast_compile')
@op_lifter([nlinalg.QRIncomplete])
@register_opt2([theano.tensor.nlinalg.QRIncomplete], 'magma', 'fast_compile')
def local_gpu_magma_qr_incomplete(op, context_name, inputs, outputs):
    # Lift an incomplete (R-only) CPU QR to the MAGMA GPU op.
    if not config.magma.enabled:
        return
    if inputs[0].dtype not in ['float16', 'float32']:
        return
    x = inputs[0]
    if inputs[0].dtype == 'float16':
        # MAGMA only handles float32; cast up, compute, cast back below.
        x = inputs[0].astype('float32')
    out = gpu_qr(x, complete=False)  # single variable: R only
    if inputs[0].dtype == 'float16':
        return [out.astype('float16')]
    # NOTE(review): the float16 branch returns a list but this branch
    # returns the bare variable — confirm the optimizer framework accepts
    # both forms.
    return out
# Matrix inverse
@register_opt('magma', 'fast_compile') @register_opt('magma', 'fast_compile')
@op_lifter([nlinalg.MatrixInverse]) @op_lifter([nlinalg.MatrixInverse])
@register_opt2([theano.tensor.nlinalg.MatrixInverse], 'magma', 'fast_compile') @register_opt2([theano.tensor.nlinalg.MatrixInverse], 'magma', 'fast_compile')
def local_gpu_matrix_inverse(op, context_name, inputs, outputs): def local_gpu_magma_matrix_inverse(op, context_name, inputs, outputs):
if not config.magma.enabled: if not config.magma.enabled:
return return
if inputs[0].dtype not in ['float16', 'float32']: if inputs[0].dtype not in ['float16', 'float32']:
...@@ -2150,16 +2228,31 @@ def local_gpu_matrix_inverse(op, context_name, inputs, outputs): ...@@ -2150,16 +2228,31 @@ def local_gpu_matrix_inverse(op, context_name, inputs, outputs):
@register_inplace() @register_inplace()
@local_optimizer([GpuMagmaMatrixInverse]) @local_optimizer([GpuMagmaMatrixInverse])
def local_inplace_matrix_inverse_inplace(node): def local_inplace_gpu_magma_matrix_inverse(node):
if isinstance(node.op, GpuMagmaMatrixInverse): if isinstance(node.op, GpuMagmaMatrixInverse) and not node.op.inplace:
if not node.op.inplace: return [node.op.clone_inplace()(*node.inputs)]
return [node.op.clone_inplace()(*node.inputs)]
# Eigen decomposition of a symmetric matrix
@register_opt('magma', 'fast_compile')
@op_lifter([nlinalg.Eigh])
@register_opt2([theano.tensor.nlinalg.Eigh], 'magma', 'fast_compile')
def local_gpu_magma_eigh(op, context_name, inputs, outputs):
    """Lift a CPU symmetric eigendecomposition to the MAGMA GPU op."""
    if not config.magma.enabled:
        return
    x = inputs[0]
    if x.dtype not in ['float16', 'float32']:
        return
    gpu_op = GpuMagmaEigh(UPLO=op.UPLO, compute_v=True)
    if x.dtype == 'float16':
        # MAGMA only handles float32; cast up and back down.
        return gpu_op(x.astype('float32')).astype('float16')
    return gpu_op
# Singular Value Decomposition
@register_opt('magma', 'fast_compile') @register_opt('magma', 'fast_compile')
@op_lifter([nlinalg.SVD]) @op_lifter([nlinalg.SVD])
@register_opt2([theano.tensor.nlinalg.SVD], 'magma', 'fast_compile') @register_opt2([theano.tensor.nlinalg.SVD], 'magma', 'fast_compile')
def local_gpu_svd(op, context_name, inputs, outputs): def local_gpu_magma_svd(op, context_name, inputs, outputs):
if not config.magma.enabled: if not config.magma.enabled:
return return
if inputs[0].dtype not in ['float16', 'float32']: if inputs[0].dtype not in ['float16', 'float32']:
......
...@@ -7,10 +7,14 @@ from numpy.linalg.linalg import LinAlgError ...@@ -7,10 +7,14 @@ from numpy.linalg.linalg import LinAlgError
import theano import theano
from theano import config from theano import config
from theano.gpuarray.linalg import (GpuCholesky, GpuMagmaMatrixInverse, from theano.gpuarray.linalg import (GpuCholesky, GpuMagmaCholesky,
GpuMagmaEigh, GpuMagmaMatrixInverse,
GpuMagmaQR, GpuMagmaSVD,
cusolver_available, gpu_matrix_inverse, cusolver_available, gpu_matrix_inverse,
gpu_solve, gpu_svd) gpu_solve, gpu_svd, gpu_qr)
from theano.tensor.nlinalg import matrix_inverse from theano.tensor.nlinalg import (SVD, MatrixInverse, QRFull,
QRIncomplete, eigh, matrix_inverse, qr)
from theano.tensor.slinalg import Cholesky, cholesky
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from .. import gpuarray_shared_constructor from .. import gpuarray_shared_constructor
...@@ -132,7 +136,8 @@ class TestGpuCholesky(unittest.TestCase): ...@@ -132,7 +136,8 @@ class TestGpuCholesky(unittest.TestCase):
A = theano.tensor.matrix("A", dtype="float32") A = theano.tensor.matrix("A", dtype="float32")
cholesky_op = GpuCholesky(lower=lower, inplace=inplace) cholesky_op = GpuCholesky(lower=lower, inplace=inplace)
chol_A = cholesky_op(A) chol_A = cholesky_op(A)
return theano.function([A], chol_A, accept_inplace=inplace, mode=mode_with_gpu) return theano.function([A], chol_A, accept_inplace=inplace,
mode=mode_with_gpu)
def compare_gpu_cholesky_to_np(self, A_val, lower=True, inplace=False): def compare_gpu_cholesky_to_np(self, A_val, lower=True, inplace=False):
# Helper function to compare op output to np.cholesky output. # Helper function to compare op output to np.cholesky output.
...@@ -144,6 +149,12 @@ class TestGpuCholesky(unittest.TestCase): ...@@ -144,6 +149,12 @@ class TestGpuCholesky(unittest.TestCase):
chol_A_res = np.array(res) chol_A_res = np.array(res)
utt.assert_allclose(chol_A_res, chol_A_val) utt.assert_allclose(chol_A_res, chol_A_val)
def test_gpu_cholesky_opt(self):
    """A float32 cholesky() graph should be lifted to GpuCholesky by
    the GPU optimizer."""
    A = theano.tensor.matrix("A", dtype="float32")
    fn = theano.function([A], cholesky(A), mode=mode_with_gpu)
    # Generator instead of list inside any(): short-circuits and avoids
    # materializing the whole toposort scan.
    assert any(isinstance(node.op, GpuCholesky)
               for node in fn.maker.fgraph.toposort())
def test_invalid_input_fail_non_square(self): def test_invalid_input_fail_non_square(self):
# Invalid Cholesky input test with non-square matrix as input. # Invalid Cholesky input test with non-square matrix as input.
A_val = np.random.normal(size=(3, 2)).astype("float32") A_val = np.random.normal(size=(3, 2)).astype("float32")
...@@ -207,6 +218,20 @@ class TestMagma(unittest.TestCase): ...@@ -207,6 +218,20 @@ class TestMagma(unittest.TestCase):
if not config.magma.enabled: if not config.magma.enabled:
self.skipTest('Magma is not enabled, skipping test') self.skipTest('Magma is not enabled, skipping test')
def test_magma_opt_float16(self):
    """Each supported CPU linalg op applied to a float16 matrix should
    be lifted to its magma GPU counterpart (cusolver excluded so the
    magma path is taken)."""
    ops_to_gpu = [(MatrixInverse(), GpuMagmaMatrixInverse),
                  (SVD(), GpuMagmaSVD),
                  (QRFull(mode='reduced'), GpuMagmaQR),
                  (QRIncomplete(mode='r'), GpuMagmaQR),
                  # TODO: add support for float16 to Eigh numpy
                  # (Eigh(), GpuMagmaEigh),
                  (Cholesky(), GpuMagmaCholesky)]
    for op, gpu_op in ops_to_gpu:
        A = theano.tensor.matrix("A", dtype="float16")
        fn = theano.function([A], op(A),
                             mode=mode_with_gpu.excluding('cusolver'))
        # Generator instead of list inside any(): short-circuits on the
        # first matching node.
        assert any(isinstance(node.op, gpu_op)
                   for node in fn.maker.fgraph.toposort())
def test_gpu_matrix_inverse(self): def test_gpu_matrix_inverse(self):
A = theano.tensor.fmatrix("A") A = theano.tensor.fmatrix("A")
...@@ -216,7 +241,7 @@ class TestMagma(unittest.TestCase): ...@@ -216,7 +241,7 @@ class TestMagma(unittest.TestCase):
# Copied from theano.tensor.tests.test_basic.rand. # Copied from theano.tensor.tests.test_basic.rand.
A_val = test_rng.rand(N, N).astype('float32') * 2 - 1 A_val = test_rng.rand(N, N).astype('float32') * 2 - 1
A_val_inv = fn(A_val) A_val_inv = fn(A_val)
utt.assert_allclose(np.eye(N), np.dot(A_val_inv, A_val), atol=5e-3) utt.assert_allclose(np.eye(N), np.dot(A_val_inv, A_val), atol=1e-2)
def test_gpu_matrix_inverse_inplace(self): def test_gpu_matrix_inverse_inplace(self):
N = 1000 N = 1000
...@@ -296,3 +321,132 @@ class TestMagma(unittest.TestCase): ...@@ -296,3 +321,132 @@ class TestMagma(unittest.TestCase):
A_val = rand(100, 50).astype('float32') A_val = rand(100, 50).astype('float32')
utt.assert_allclose(f_cpu(A_val), f_gpu(A_val)) utt.assert_allclose(f_cpu(A_val), f_gpu(A_val))
def run_gpu_cholesky(self, A_val, lower=True):
    """Compile GpuMagmaCholesky (cusolver excluded) and run it on A_val."""
    A = theano.tensor.fmatrix("A")
    chol_expr = GpuMagmaCholesky(lower=lower)(A)
    compiled = theano.function([A], chol_expr,
                               mode=mode_with_gpu.excluding('cusolver'))
    return compiled(A_val)
def rand_symmetric(self, N):
    """Return a random NxN float32 symmetric matrix.

    Eigenvalues are clamped to be >= 1: values too close to zero
    sometimes make magma's cholesky fail because of the GPU's limited
    numerical precision.
    """
    base = rand(N, N).astype('float32')
    eigvals, eigvecs = np.linalg.eigh(base)
    eigvals = np.maximum(eigvals, 1)
    diag = np.zeros_like(base)
    np.fill_diagonal(diag, eigvals)
    return eigvecs.T.dot(diag).dot(eigvecs)
def check_cholesky(self, N, lower=True, rtol=None, atol=None):
    """Factor a random SPD matrix on the GPU and verify L.dot(L.T) == A."""
    A = self.rand_symmetric(N)
    factor = self.run_gpu_cholesky(A, lower=lower)
    # For the upper factorization A = U^T U, transposing gives the
    # lower form so one reconstruction check covers both.
    L = factor if lower else factor.T
    utt.assert_allclose(np.dot(L, L.T), A, rtol=rtol, atol=atol)
def test_gpu_cholesky(self):
    """Check both the lower and upper triangular factorizations."""
    for lower in (True, False):
        self.check_cholesky(1000, lower=lower, atol=1e-3)
def test_gpu_cholesky_opt(self):
    """With cusolver excluded, cholesky() on float32 should be lifted
    to GpuMagmaCholesky."""
    A = theano.tensor.matrix("A", dtype="float32")
    fn = theano.function([A], cholesky(A),
                         mode=mode_with_gpu.excluding('cusolver'))
    # Generator instead of list inside any(): no intermediate list,
    # short-circuits on the first match.
    assert any(isinstance(node.op, GpuMagmaCholesky)
               for node in fn.maker.fgraph.toposort())
def test_gpu_cholesky_inplace(self):
    """An inplace GpuMagmaCholesky must overwrite the shared input
    with its factor."""
    A_val = self.rand_symmetric(1000)
    shared_A = gpuarray_shared_constructor(A_val)
    original = shared_A.get_value()
    chol_expr = GpuMagmaCholesky(inplace=True)(shared_A)
    fn = theano.function([], chol_expr, mode=mode_with_gpu,
                         accept_inplace=True)
    fn()
    # The shared variable now holds the factor, not the input.
    factor = shared_A.get_value()
    utt.assert_allclose(factor.dot(factor.T), original, atol=1e-3)
def test_gpu_cholesky_inplace_opt(self):
    """The inplace optimizer should rewrite GpuMagmaCholesky to its
    destructive (inplace) form."""
    A = theano.tensor.fmatrix("A")
    fn = theano.function([A], GpuMagmaCholesky()(A), mode=mode_with_gpu)
    # Generator instead of list inside any(): short-circuits on the
    # first inplace node.
    assert any(node.op.inplace
               for node in fn.maker.fgraph.toposort()
               if isinstance(node.op, GpuMagmaCholesky))
def run_gpu_qr(self, A_val, complete=True):
    """Compile gpu_qr and evaluate it on A_val."""
    A = theano.tensor.fmatrix("A")
    qr_outputs = gpu_qr(A, complete=complete)
    compiled = theano.function([A], qr_outputs, mode=mode_with_gpu)
    return compiled(A_val)
def check_gpu_qr(self, M, N, complete=True, rtol=None, atol=None):
    """Compare the GPU QR of a random MxN float32 matrix against numpy.

    With complete=True both Q and R are checked; otherwise the GPU op
    produces only R.
    """
    A = rand(M, N).astype('float32')
    Q_np, R_np = np.linalg.qr(A, mode='reduced')
    if complete:
        Q_gpu, R_gpu = self.run_gpu_qr(A, complete=complete)
        utt.assert_allclose(R_np, R_gpu, rtol=rtol, atol=atol)
        utt.assert_allclose(Q_np, Q_gpu, rtol=rtol, atol=atol)
    else:
        R_gpu = self.run_gpu_qr(A, complete=complete)
        utt.assert_allclose(R_np, R_gpu, rtol=rtol, atol=atol)
def test_gpu_qr(self):
    """Exercise tall and wide inputs, complete and R-only modes."""
    for shape in ((1000, 500), (500, 1000)):
        self.check_gpu_qr(*shape, atol=1e-3)
        self.check_gpu_qr(*shape, complete=False, atol=1e-3)
def test_gpu_qr_opt(self):
    """qr() with the default (reduced) mode should be lifted to a
    complete GpuMagmaQR."""
    A = theano.tensor.fmatrix("A")
    fn = theano.function([A], qr(A), mode=mode_with_gpu)
    # Generator instead of list inside any(): short-circuits on the
    # first match.
    assert any(isinstance(node.op, GpuMagmaQR) and node.op.complete
               for node in fn.maker.fgraph.toposort())
def test_gpu_qr_incomplete_opt(self):
    """qr(mode='r') should be lifted to an incomplete GpuMagmaQR."""
    A = theano.tensor.fmatrix("A")
    fn = theano.function([A], qr(A, mode='r'), mode=mode_with_gpu)
    # Generator instead of list inside any(): short-circuits on the
    # first match.
    assert any(isinstance(node.op, GpuMagmaQR) and not node.op.complete
               for node in fn.maker.fgraph.toposort())
def run_gpu_eigh(self, A_val, UPLO='L', compute_v=True):
    """Compile GpuMagmaEigh and evaluate it on A_val."""
    A = theano.tensor.fmatrix("A")
    eigh_expr = GpuMagmaEigh(UPLO=UPLO, compute_v=compute_v)(A)
    compiled = theano.function([A], eigh_expr, mode=mode_with_gpu)
    return compiled(A_val)
def check_gpu_eigh(self, N, UPLO='L', compute_v=True, rtol=None, atol=None):
    """Compare the GPU eigendecomposition of a random SPD matrix to numpy.

    Eigenvalues are always checked; with compute_v the eigenvectors are
    additionally checked for orthonormality and for reconstructing A.
    """
    base = rand(N, N).astype('float32')
    A = np.dot(base.T, base)
    d_np = np.linalg.eigh(A, UPLO=UPLO)[0]
    if compute_v:
        d_gpu, v_gpu = self.run_gpu_eigh(A, UPLO=UPLO, compute_v=compute_v)
    else:
        d_gpu = self.run_gpu_eigh(A, UPLO=UPLO, compute_v=False)
    utt.assert_allclose(d_np, d_gpu, rtol=rtol, atol=atol)
    if compute_v:
        # Eigenvectors should be orthonormal ...
        utt.assert_allclose(
            np.eye(N), np.dot(v_gpu, v_gpu.T), rtol=rtol, atol=atol)
        # ... and reconstruct the input matrix.
        D_m = np.zeros_like(A)
        np.fill_diagonal(D_m, d_gpu)
        utt.assert_allclose(
            A, np.dot(np.dot(v_gpu, D_m), v_gpu.T), rtol=rtol, atol=atol)
def test_gpu_eigh(self):
    """Run both triangles, with and without eigenvector computation."""
    for uplo in ('L', 'U'):
        self.check_gpu_eigh(1000, UPLO=uplo, atol=1e-3)
    for uplo in ('L', 'U'):
        self.check_gpu_eigh(1000, UPLO=uplo, compute_v=False, atol=1e-3)
def test_gpu_eigh_opt(self):
    """eigh() should be lifted to GpuMagmaEigh by the magma optimizer."""
    A = theano.tensor.fmatrix("A")
    fn = theano.function([A], eigh(A), mode=mode_with_gpu)
    # Generator instead of list inside any(): short-circuits on the
    # first match.
    assert any(isinstance(node.op, GpuMagmaEigh)
               for node in fn.maker.fgraph.toposort())
...@@ -539,7 +539,7 @@ class QRIncomplete(Op): ...@@ -539,7 +539,7 @@ class QRIncomplete(Op):
Incomplete QR Decomposition. Incomplete QR Decomposition.
Computes the QR decomposition of a matrix. Computes the QR decomposition of a matrix.
Factor the matrix a as qr and return a single matrix. Factor the matrix a as qr and return a single matrix R.
""" """
...@@ -552,15 +552,14 @@ class QRIncomplete(Op): ...@@ -552,15 +552,14 @@ class QRIncomplete(Op):
def make_node(self, x):
    """Build the Apply node computing the single R factor of the
    incomplete QR decomposition of the matrix `x`."""
    x = as_tensor_variable(x)
    assert x.ndim == 2, "The input of qr function should be a matrix."
    # Incomplete QR returns one matrix output (R), not a (Q, R) pair.
    r = theano.tensor.matrix(dtype=x.dtype)
    return Apply(self, [x], [r])
def perform(self, node, inputs, outputs):
    """Run numpy's QR in the requested incomplete mode and store the
    single matrix result in the output storage cell."""
    (x,) = inputs
    (r,) = outputs
    assert x.ndim == 2, "The input of qr function should be a matrix."
    r[0] = self._numop(x, self.mode)
def qr(a, mode="reduced"): def qr(a, mode="reduced"):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论