提交 1b46af51 作者: notoraptor

Wrap op params for theano.gpuarray.linalg.GpuMagmaSVD.

上级 1d6b79f6
...@@ -9,7 +9,8 @@ from numpy.linalg.linalg import LinAlgError ...@@ -9,7 +9,8 @@ from numpy.linalg.linalg import LinAlgError
import theano import theano
from theano import Op, config, tensor from theano import Op, config, tensor
from theano.gof import COp from theano.scalar import bool as bool_t
from theano.gof import COp, ParamsType
from theano.gpuarray import GpuArrayType from theano.gpuarray import GpuArrayType
from .basic_ops import as_gpuarray_variable, gpu_contiguous, infer_context_name from .basic_ops import as_gpuarray_variable, gpu_contiguous, infer_context_name
...@@ -350,9 +351,19 @@ def gpu_cholesky(A, lower=True): ...@@ -350,9 +351,19 @@ def gpu_cholesky(A, lower=True):
class GpuMagmaSVD(COp): class GpuMagmaSVD(COp):
"""Computes the svd of a matrix :math:`A` using magma library. """Computes the svd of a matrix :math:`A` using magma library.
.. warning::
Because of implementation constraints, this Op returns outputs
in order ``S, U, VT``. Use :func:`theano.gpuarray.linalg.gpu_svd`
to get them in expected order ``U, S, VT``.
""" """
__props__ = ('full_matrices', 'compute_uv') __props__ = ('full_matrices', 'compute_uv')
params_type = gpu_context_type _cop_num_inputs = 1
_cop_num_outputs = 3
check_input = False
params_type = ParamsType(full_matrices=bool_t, context=gpu_context_type)
def __init__(self, full_matrices=True, compute_uv=True): def __init__(self, full_matrices=True, compute_uv=True):
self.full_matrices = full_matrices self.full_matrices = full_matrices
...@@ -385,25 +396,19 @@ class GpuMagmaSVD(COp): ...@@ -385,25 +396,19 @@ class GpuMagmaSVD(COp):
assert A.dtype == 'float32' assert A.dtype == 'float32'
if self.compute_uv: if self.compute_uv:
return theano.Apply(self, [A], return theano.Apply(self, [A],
[A.type(), # return S, U, VT
GpuArrayType(A.dtype, broadcastable=[False], [GpuArrayType(A.dtype, broadcastable=[False],
context_name=ctx_name)(), context_name=ctx_name)(),
A.type()]) A.type(),
A.type()])
else: else:
return theano.Apply(self, [A], return theano.Apply(self, [A],
# return only S
[GpuArrayType(A.dtype, broadcastable=[False], [GpuArrayType(A.dtype, broadcastable=[False],
context_name=ctx_name)()]) context_name=ctx_name)()])
def get_params(self, node): def get_params(self, node):
return node.inputs[0].type.context return self.params_type.get_params(self, context=node.inputs[0].type.context)
def get_op_params(self):
params = []
if self.compute_uv:
params.append(('COMPUTE_UV', '1'))
if self.full_matrices:
params.append(('FULL_MATRICES', '1'))
return params
def infer_shape(self, node, shapes): def infer_shape(self, node, shapes):
x_shape, = shapes x_shape, = shapes
...@@ -413,7 +418,7 @@ class GpuMagmaSVD(COp): ...@@ -413,7 +418,7 @@ class GpuMagmaSVD(COp):
if self.compute_uv: if self.compute_uv:
u_shape = (M, M) if self.full_matrices else (M, K) u_shape = (M, M) if self.full_matrices else (M, K)
vt_shape = (N, N) if self.full_matrices else (K, N) vt_shape = (N, N) if self.full_matrices else (K, N)
return [u_shape, s_shape, vt_shape] return [s_shape, u_shape, vt_shape]
else: else:
return [s_shape] return [s_shape]
...@@ -438,7 +443,11 @@ def gpu_svd(a, full_matrices=1, compute_uv=1): ...@@ -438,7 +443,11 @@ def gpu_svd(a, full_matrices=1, compute_uv=1):
U, V, D : matrices U, V, D : matrices
""" """
return GpuMagmaSVD(full_matrices, compute_uv)(a) out = GpuMagmaSVD(full_matrices, compute_uv)(a)
if compute_uv:
S, U, VT = out
out = [U, S, VT]
return out
class GpuMagmaMatrixInverse(COp): class GpuMagmaMatrixInverse(COp):
......
...@@ -5,14 +5,11 @@ setup_ext_cuda(); ...@@ -5,14 +5,11 @@ setup_ext_cuda();
#section support_code_struct #section support_code_struct
int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A, int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
#ifdef COMPUTE_UV
PyGpuArrayObject **U,
#endif
PyGpuArrayObject **S, PyGpuArrayObject **S,
#ifdef COMPUTE_UV PyGpuArrayObject **U, // may be NULL
PyGpuArrayObject **VT, PyGpuArrayObject **VT, // may be NULL
#endif PARAMS_TYPE* params) {
PyGpuContextObject *c) { bool compute_uv = (U != NULL);
magma_int_t *iwork = NULL, iunused[1]; magma_int_t *iwork = NULL, iunused[1];
magma_int_t M, N, K, ldu, ldv, M_U, N_VT, info; magma_int_t M, N, K, ldu, ldv, M_U, N_VT, info;
magma_vec_t jobz; magma_vec_t jobz;
...@@ -29,7 +26,7 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A, ...@@ -29,7 +26,7 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
} }
// This is early to match the exit() in the fail label. // This is early to match the exit() in the fail label.
cuda_enter(c->ctx); cuda_enter(params->context->ctx);
magma_init(); magma_init();
if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) { if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
...@@ -63,32 +60,32 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A, ...@@ -63,32 +60,32 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
goto fail; goto fail;
} }
#ifdef COMPUTE_UV if (compute_uv) {
#ifdef FULL_MATRICES if (params->full_matrices) {
jobz = MagmaAllVec; jobz = MagmaAllVec;
#else } else {
jobz = MagmaSomeVec; jobz = MagmaSomeVec;
#endif }
M_U = (jobz == MagmaAllVec ? M : K); M_U = (jobz == MagmaAllVec ? M : K);
N_VT = (jobz == MagmaAllVec ? N : K); N_VT = (jobz == MagmaAllVec ? N : K);
ldu = M; ldu = M;
ldv = N_VT; ldv = N_VT;
if (MAGMA_SUCCESS != magma_smalloc_pinned(&u_data, M_U * M)) { if (MAGMA_SUCCESS != magma_smalloc_pinned(&u_data, M_U * M)) {
PyErr_SetString(PyExc_RuntimeError, PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory"); "GpuMagmaSVD: failed to allocate memory");
goto fail; goto fail;
}
if (MAGMA_SUCCESS != magma_smalloc_pinned(&vt_data, N * N_VT)) {
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
}
} else {
jobz = MagmaNoVec;
ldu = M;
ldv = N;
} }
if (MAGMA_SUCCESS != magma_smalloc_pinned(&vt_data, N * N_VT)) {
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
}
#else
jobz = MagmaNoVec;
ldu = M;
ldv = N;
#endif
// query for workspace size // query for workspace size
magma_sgesdd(jobz, M, N, NULL, M, NULL, NULL, ldu, NULL, ldv, magma_sgesdd(jobz, M, N, NULL, M, NULL, NULL, ldu, NULL, ldv,
...@@ -124,7 +121,7 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A, ...@@ -124,7 +121,7 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
} }
s_dims[0] = K; s_dims[0] = K;
if (theano_prep_output(S, 1, s_dims, A->ga.typecode, GA_C_ORDER, c) != 0){ if (theano_prep_output(S, 1, s_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
PyErr_SetString(PyExc_RuntimeError, PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory"); "GpuMagmaSVD: failed to allocate memory");
goto fail; goto fail;
...@@ -132,29 +129,29 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A, ...@@ -132,29 +129,29 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
cudaMemcpy(PyGpuArray_DEV_DATA(*S), s_data, K * sizeof(float), cudaMemcpy(PyGpuArray_DEV_DATA(*S), s_data, K * sizeof(float),
cudaMemcpyDeviceToDevice); cudaMemcpyDeviceToDevice);
#ifdef COMPUTE_UV if (compute_uv) {
u_dims[0] = N; u_dims[1] = N_VT; u_dims[0] = N; u_dims[1] = N_VT;
if (theano_prep_output(U, 2, u_dims, A->ga.typecode, GA_C_ORDER, c) != 0){ if (theano_prep_output(U, 2, u_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
PyErr_SetString(PyExc_RuntimeError, PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory"); "GpuMagmaSVD: failed to allocate memory");
goto fail; goto fail;
} }
// magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U // magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U
// to match numpy.linalg.svd output // to match numpy.linalg.svd output
cudaMemcpy(PyGpuArray_DEV_DATA(*U), vt_data, N * N_VT * sizeof(float), cudaMemcpy(PyGpuArray_DEV_DATA(*U), vt_data, N * N_VT * sizeof(float),
cudaMemcpyDeviceToDevice); cudaMemcpyDeviceToDevice);
vt_dims[0] = M_U; vt_dims[1] = M; vt_dims[0] = M_U; vt_dims[1] = M;
if (theano_prep_output(VT, 2, vt_dims, A->ga.typecode, GA_C_ORDER, c) != 0){ if (theano_prep_output(VT, 2, vt_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
PyErr_SetString(PyExc_RuntimeError, PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory"); "GpuMagmaSVD: failed to allocate memory");
goto fail; goto fail;
}
// magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U
// to match numpy.linalg.svd output
cudaMemcpy(PyGpuArray_DEV_DATA(*VT), u_data, M_U * M * sizeof(float),
cudaMemcpyDeviceToDevice);
} }
// magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U
// to match numpy.linalg.svd output
cudaMemcpy(PyGpuArray_DEV_DATA(*VT), u_data, M_U * M * sizeof(float),
cudaMemcpyDeviceToDevice);
#endif
res = 0; res = 0;
fail: fail:
if (a_data != NULL) if (a_data != NULL)
...@@ -170,6 +167,6 @@ fail: ...@@ -170,6 +167,6 @@ fail:
if (iwork != NULL) if (iwork != NULL)
magma_free_cpu(iwork); magma_free_cpu(iwork);
magma_finalize(); magma_finalize();
cuda_exit(c->ctx); cuda_exit(params->context->ctx);
return res; return res;
} }
...@@ -73,7 +73,7 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor, ...@@ -73,7 +73,7 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
from .reduction import GpuMaxAndArgmax from .reduction import GpuMaxAndArgmax
from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky, from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky,
cusolver_available, GpuMagmaMatrixInverse, GpuMagmaSVD) cusolver_available, GpuMagmaMatrixInverse, gpu_svd)
_logger = logging.getLogger("theano.gpuarray.opt") _logger = logging.getLogger("theano.gpuarray.opt")
...@@ -2149,11 +2149,16 @@ def local_gpu_svd(op, context_name, inputs, outputs): ...@@ -2149,11 +2149,16 @@ def local_gpu_svd(op, context_name, inputs, outputs):
return return
if inputs[0].dtype not in ['float16', 'float32']: if inputs[0].dtype not in ['float16', 'float32']:
return return
op = GpuMagmaSVD(full_matrices=op.full_matrices, x = inputs[0]
compute_uv=op.compute_uv)
if inputs[0].dtype == 'float16': if inputs[0].dtype == 'float16':
return op(inputs[0].astype('float32')).astype('float16') x = inputs[0].astype('float32')
return op out = gpu_svd(x, compute_uv=op.compute_uv, full_matrices=op.full_matrices)
if inputs[0].dtype == 'float16':
if op.compute_uv:
out = [o.astype('float16') for o in out]
else:
out = [out.astype('float16')]
return out
# Do not register in fast_run or fast_compile. # Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled. # It will be added to fast_run if the GPU is enabled.
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论