Commit 1b46af51, authored by notoraptor

Wrap op params for theano.gpuarray.linalg.GpuMagmaSVD.

Parent commit: 1d6b79f6
......@@ -9,7 +9,8 @@ from numpy.linalg.linalg import LinAlgError
import theano
from theano import Op, config, tensor
from theano.gof import COp
from theano.scalar import bool as bool_t
from theano.gof import COp, ParamsType
from theano.gpuarray import GpuArrayType
from .basic_ops import as_gpuarray_variable, gpu_contiguous, infer_context_name
......@@ -350,9 +351,19 @@ def gpu_cholesky(A, lower=True):
class GpuMagmaSVD(COp):
"""Computes the svd of a matrix :math:`A` using magma library.
.. warning::
Because of implementation constraints, this Op returns outputs
in order ``S, U, VT``. Use :func:`theano.gpuarray.linalg.gpu_svd`
to get them in expected order ``U, S, VT``.
"""
__props__ = ('full_matrices', 'compute_uv')
params_type = gpu_context_type
_cop_num_inputs = 1
_cop_num_outputs = 3
check_input = False
params_type = ParamsType(full_matrices=bool_t, context=gpu_context_type)
def __init__(self, full_matrices=True, compute_uv=True):
self.full_matrices = full_matrices
......@@ -385,25 +396,19 @@ class GpuMagmaSVD(COp):
assert A.dtype == 'float32'
if self.compute_uv:
return theano.Apply(self, [A],
[A.type(),
GpuArrayType(A.dtype, broadcastable=[False],
context_name=ctx_name)(),
A.type()])
# return S, U, VT
[GpuArrayType(A.dtype, broadcastable=[False],
context_name=ctx_name)(),
A.type(),
A.type()])
else:
return theano.Apply(self, [A],
# return only S
[GpuArrayType(A.dtype, broadcastable=[False],
context_name=ctx_name)()])
def get_params(self, node):
    """Build the params object passed to the C implementation.

    Bundles the op-level ``full_matrices`` flag together with the GPU
    context of the node's input into the wrapped params structure
    declared by ``params_type`` on the class
    (``ParamsType(full_matrices=bool_t, context=gpu_context_type)``).

    Parameters
    ----------
    node : theano.Apply
        The apply node; the GPU context is read from its first input's
        type, since the context is a property of the data, not of the op.

    Returns
    -------
    object
        The params instance consumed as ``PARAMS_TYPE`` by the C code.
    """
    # NOTE: compute_uv is intentionally NOT part of the params; the C
    # code infers it from whether the U output pointer is NULL.
    # This replaces the pre-commit pair get_params/get_op_params, which
    # passed the raw context and C #define flags respectively; the diff
    # artifact that showed both versions is resolved to the new one here.
    return self.params_type.get_params(self, context=node.inputs[0].type.context)
def infer_shape(self, node, shapes):
x_shape, = shapes
......@@ -413,7 +418,7 @@ class GpuMagmaSVD(COp):
if self.compute_uv:
u_shape = (M, M) if self.full_matrices else (M, K)
vt_shape = (N, N) if self.full_matrices else (K, N)
return [u_shape, s_shape, vt_shape]
return [s_shape, u_shape, vt_shape]
else:
return [s_shape]
......@@ -438,7 +443,11 @@ def gpu_svd(a, full_matrices=1, compute_uv=1):
U, V, D : matrices
"""
return GpuMagmaSVD(full_matrices, compute_uv)(a)
out = GpuMagmaSVD(full_matrices, compute_uv)(a)
if compute_uv:
S, U, VT = out
out = [U, S, VT]
return out
class GpuMagmaMatrixInverse(COp):
......
......@@ -5,14 +5,11 @@ setup_ext_cuda();
#section support_code_struct
int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
#ifdef COMPUTE_UV
PyGpuArrayObject **U,
#endif
PyGpuArrayObject **S,
#ifdef COMPUTE_UV
PyGpuArrayObject **VT,
#endif
PyGpuContextObject *c) {
PyGpuArrayObject **U, // may be NULL
PyGpuArrayObject **VT, // may be NULL
PARAMS_TYPE* params) {
bool compute_uv = (U != NULL);
magma_int_t *iwork = NULL, iunused[1];
magma_int_t M, N, K, ldu, ldv, M_U, N_VT, info;
magma_vec_t jobz;
......@@ -29,7 +26,7 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
}
// This is early to match the exit() in the fail label.
cuda_enter(c->ctx);
cuda_enter(params->context->ctx);
magma_init();
if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
......@@ -63,32 +60,32 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
goto fail;
}
#ifdef COMPUTE_UV
#ifdef FULL_MATRICES
jobz = MagmaAllVec;
#else
jobz = MagmaSomeVec;
#endif
M_U = (jobz == MagmaAllVec ? M : K);
N_VT = (jobz == MagmaAllVec ? N : K);
ldu = M;
ldv = N_VT;
if (MAGMA_SUCCESS != magma_smalloc_pinned(&u_data, M_U * M)) {
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
if (compute_uv) {
if (params->full_matrices) {
jobz = MagmaAllVec;
} else {
jobz = MagmaSomeVec;
}
M_U = (jobz == MagmaAllVec ? M : K);
N_VT = (jobz == MagmaAllVec ? N : K);
ldu = M;
ldv = N_VT;
if (MAGMA_SUCCESS != magma_smalloc_pinned(&u_data, M_U * M)) {
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
}
if (MAGMA_SUCCESS != magma_smalloc_pinned(&vt_data, N * N_VT)) {
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
}
} else {
jobz = MagmaNoVec;
ldu = M;
ldv = N;
}
if (MAGMA_SUCCESS != magma_smalloc_pinned(&vt_data, N * N_VT)) {
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
}
#else
jobz = MagmaNoVec;
ldu = M;
ldv = N;
#endif
// query for workspace size
magma_sgesdd(jobz, M, N, NULL, M, NULL, NULL, ldu, NULL, ldv,
......@@ -124,7 +121,7 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
}
s_dims[0] = K;
if (theano_prep_output(S, 1, s_dims, A->ga.typecode, GA_C_ORDER, c) != 0){
if (theano_prep_output(S, 1, s_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
......@@ -132,29 +129,29 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
cudaMemcpy(PyGpuArray_DEV_DATA(*S), s_data, K * sizeof(float),
cudaMemcpyDeviceToDevice);
#ifdef COMPUTE_UV
u_dims[0] = N; u_dims[1] = N_VT;
if (theano_prep_output(U, 2, u_dims, A->ga.typecode, GA_C_ORDER, c) != 0){
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
}
// magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U
// to match numpy.linalg.svd output
cudaMemcpy(PyGpuArray_DEV_DATA(*U), vt_data, N * N_VT * sizeof(float),
cudaMemcpyDeviceToDevice);
vt_dims[0] = M_U; vt_dims[1] = M;
if (theano_prep_output(VT, 2, vt_dims, A->ga.typecode, GA_C_ORDER, c) != 0){
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
if (compute_uv) {
u_dims[0] = N; u_dims[1] = N_VT;
if (theano_prep_output(U, 2, u_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
}
// magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U
// to match numpy.linalg.svd output
cudaMemcpy(PyGpuArray_DEV_DATA(*U), vt_data, N * N_VT * sizeof(float),
cudaMemcpyDeviceToDevice);
vt_dims[0] = M_U; vt_dims[1] = M;
if (theano_prep_output(VT, 2, vt_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
}
// magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U
// to match numpy.linalg.svd output
cudaMemcpy(PyGpuArray_DEV_DATA(*VT), u_data, M_U * M * sizeof(float),
cudaMemcpyDeviceToDevice);
}
// magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U
// to match numpy.linalg.svd output
cudaMemcpy(PyGpuArray_DEV_DATA(*VT), u_data, M_U * M * sizeof(float),
cudaMemcpyDeviceToDevice);
#endif
res = 0;
fail:
if (a_data != NULL)
......@@ -170,6 +167,6 @@ fail:
if (iwork != NULL)
magma_free_cpu(iwork);
magma_finalize();
cuda_exit(c->ctx);
cuda_exit(params->context->ctx);
return res;
}
......@@ -73,7 +73,7 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
from .reduction import GpuMaxAndArgmax
from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky,
cusolver_available, GpuMagmaMatrixInverse, GpuMagmaSVD)
cusolver_available, GpuMagmaMatrixInverse, gpu_svd)
_logger = logging.getLogger("theano.gpuarray.opt")
......@@ -2149,11 +2149,16 @@ def local_gpu_svd(op, context_name, inputs, outputs):
return
if inputs[0].dtype not in ['float16', 'float32']:
return
op = GpuMagmaSVD(full_matrices=op.full_matrices,
compute_uv=op.compute_uv)
x = inputs[0]
if inputs[0].dtype == 'float16':
return op(inputs[0].astype('float32')).astype('float16')
return op
x = inputs[0].astype('float32')
out = gpu_svd(x, compute_uv=op.compute_uv, full_matrices=op.full_matrices)
if inputs[0].dtype == 'float16':
if op.compute_uv:
out = [o.astype('float16') for o in out]
else:
out = [out.astype('float16')]
return out
# Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled.
......
Markdown formatting is supported
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment