提交 44726397 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #5778 from aam-at/magma_gpu

Add magma gpu svd and matrix inverse
...@@ -679,6 +679,29 @@ import theano and print the config variable, as in: ...@@ -679,6 +679,29 @@ import theano and print the config variable, as in:
``'guess_once'``, ``'guess_on_shape_change'``, ``'time_once'``,
``'time_on_shape_change'``.
.. attribute:: config.magma.enabled
String value: ``'True'``, ``'False'``
Default: ``'False'``
If ``'True'``, use `magma <http://icl.cs.utk.edu/magma/>`_ for matrix
computations.
If ``'False'``, disable magma.
.. attribute:: config.magma.include_path
Default: ``''``
Location of the magma headers.
.. attribute:: config.magma.library_path
Default: ``''``
Location of the magma library.
.. attribute:: config.gcc.cxxflags .. attribute:: config.gcc.cxxflags
Default: ``""`` Default: ``""``
......
...@@ -361,6 +361,22 @@ AddConfigVar('dnn.enabled', ...@@ -361,6 +361,22 @@ AddConfigVar('dnn.enabled',
EnumStr("auto", "True", "False", "no_check"), EnumStr("auto", "True", "False", "no_check"),
in_c_key=False) in_c_key=False)
# Configuration flags for the optional magma library
# (http://icl.cs.utk.edu/magma/), which provides GPU implementations of
# LAPACK routines (SVD, matrix inverse).  in_c_key=False: these flags
# affect how the C code is compiled/linked, not the code itself, so they
# are kept out of the compilation cache key.
AddConfigVar('magma.include_path',
             "Location of the magma header",
             StrParam(''),
             in_c_key=False)

AddConfigVar('magma.library_path',
             "Location of the magma library",
             StrParam(''),
             in_c_key=False)

# Disabled by default: magma is an optional dependency and must be
# explicitly enabled by the user.
AddConfigVar('magma.enabled',
             " If True, use magma for matrix computation."
             " If False, disable magma",
             BoolParam(False),
             in_c_key=False)
# This flag determines whether or not to raise error/warning message if # This flag determines whether or not to raise error/warning message if
# there is a CPU Op in the computational graph. # there is a CPU Op in the computational graph.
AddConfigVar( AddConfigVar(
......
from __future__ import absolute_import, division, print_function from __future__ import absolute_import, division, print_function
import pkg_resources import os
import theano
import warnings import warnings
from theano import Op import pkg_resources
from theano.gpuarray import basic_ops, GpuArrayType
import numpy as np import numpy as np
from numpy.linalg.linalg import LinAlgError from numpy.linalg.linalg import LinAlgError
import theano
from theano import Op, config, tensor
from theano.gof import COp
from theano.gpuarray import GpuArrayType
from .basic_ops import as_gpuarray_variable, gpu_contiguous, infer_context_name
from .type import gpu_context_type
try: try:
import pygpu import pygpu
from pygpu.basic import triu, tril from pygpu.basic import triu, tril
...@@ -94,13 +99,13 @@ class GpuCusolverSolve(Op): ...@@ -94,13 +99,13 @@ class GpuCusolverSolve(Op):
'GpuCusolverSolve Op can not be constructed.') 'GpuCusolverSolve Op can not be constructed.')
if skcuda.__version__ <= '0.5.1': if skcuda.__version__ <= '0.5.1':
warnings.warn('The GpuSolve op requires scikit-cuda > 0.5.1 to work with CUDA 8') warnings.warn('The GpuSolve op requires scikit-cuda > 0.5.1 to work with CUDA 8')
context_name = basic_ops.infer_context_name(inp1, inp2) context_name = infer_context_name(inp1, inp2)
inp1 = basic_ops.as_gpuarray_variable(inp1, context_name) inp1 = as_gpuarray_variable(inp1, context_name)
inp2 = basic_ops.as_gpuarray_variable(inp2, context_name) inp2 = as_gpuarray_variable(inp2, context_name)
inp1 = basic_ops.gpu_contiguous(inp1) inp1 = gpu_contiguous(inp1)
inp2 = basic_ops.gpu_contiguous(inp2) inp2 = gpu_contiguous(inp2)
# this op can only operate on float32 matrices # this op can only operate on float32 matrices
assert inp1.ndim == 2 assert inp1.ndim == 2
...@@ -260,11 +265,11 @@ class GpuCholesky(Op): ...@@ -260,11 +265,11 @@ class GpuCholesky(Op):
if not pygpu_available: if not pygpu_available:
raise RuntimeError('Missing pygpu or triu/tril functions.' raise RuntimeError('Missing pygpu or triu/tril functions.'
'Install or update libgpuarray.') 'Install or update libgpuarray.')
context_name = basic_ops.infer_context_name(inp) context_name = infer_context_name(inp)
inp = basic_ops.as_gpuarray_variable(inp, context_name) inp = as_gpuarray_variable(inp, context_name)
inp = basic_ops.gpu_contiguous(inp) inp = gpu_contiguous(inp)
# this op can only operate on float32 matrices # this op can only operate on float32 matrices
# because of current implementation of triu/tril. # because of current implementation of triu/tril.
...@@ -341,3 +346,160 @@ class GpuCholesky(Op): ...@@ -341,3 +346,160 @@ class GpuCholesky(Op):
def gpu_cholesky(A, lower=True): def gpu_cholesky(A, lower=True):
return GpuCholesky(lower)(A) return GpuCholesky(lower)(A)
class GpuMagmaSVD(COp):
    """Computes the svd of a matrix :math:`A` using magma library.

    Outputs are ``(U, S, VT)`` when ``compute_uv`` is true, otherwise
    only the vector of singular values ``S``.  The actual computation is
    done by the C implementation in ``magma_svd.c``, specialized through
    the preprocessor defines produced by :meth:`get_op_params`.
    """
    __props__ = ('full_matrices', 'compute_uv')
    # The C code receives the GPU context of the input as its params.
    params_type = gpu_context_type

    def __init__(self, full_matrices=True, compute_uv=True):
        self.full_matrices = full_matrices
        self.compute_uv = compute_uv
        COp.__init__(self, ['magma_svd.c'], 'APPLY_SPECIFIC(magma_svd)')

    def c_headers(self):
        return ['gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/ext_cuda.h',
                'gpuarray_helper.h', 'magma.h']

    def c_header_dirs(self):
        # This file's directory (for gpuarray_helper.h), pygpu's headers,
        # and optionally the user-configured magma include directory.
        dirs = [os.path.dirname(__file__), pygpu.get_include()]
        if config.magma.include_path:
            dirs.append(config.magma.include_path)
        return dirs

    def c_libraries(self):
        return ['magma']

    def c_lib_dirs(self):
        if config.magma.library_path:
            return [config.magma.library_path]
        return []

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if self.compute_uv:
            # Outputs: U (matrix), S (1-d vector), VT (matrix).
            return theano.Apply(self, [A],
                                [A.type(),
                                 GpuArrayType(A.dtype, broadcastable=[False],
                                              context_name=ctx_name)(),
                                 A.type()])
        else:
            # Only the singular values are returned.
            return theano.Apply(self, [A],
                                [GpuArrayType(A.dtype, broadcastable=[False],
                                              context_name=ctx_name)()])

    def get_params(self, node):
        return node.inputs[0].type.context

    def get_op_params(self):
        # Translate the python-level flags into the C preprocessor
        # defines (COMPUTE_UV / FULL_MATRICES) used by magma_svd.c.
        params = []
        if self.compute_uv:
            params.append(('COMPUTE_UV', '1'))
        if self.full_matrices:
            params.append(('FULL_MATRICES', '1'))
        return params

    def infer_shape(self, node, shapes):
        # For an (M, N) input with K = min(M, N):
        #   U is (M, M) or (M, K), S is (K,), VT is (N, N) or (K, N).
        x_shape, = shapes
        M, N = x_shape
        K = tensor.minimum(M, N)
        s_shape = (K, )
        if self.compute_uv:
            u_shape = (M, M) if self.full_matrices else (M, K)
            vt_shape = (N, N) if self.full_matrices else (K, N)
            return [u_shape, s_shape, vt_shape]
        else:
            return [s_shape]
def gpu_svd(a, full_matrices=1, compute_uv=1):
    """
    This function performs the SVD on GPU.

    Parameters
    ----------
    a : matrix
        The matrix to decompose.
    full_matrices : bool, optional
        If True (default), u and v have the shapes (M, M) and (N, N),
        respectively.
        Otherwise, the shapes are (M, K) and (K, N), respectively,
        where K = min(M, N).
    compute_uv : bool, optional
        Whether or not to compute u and v in addition to s.
        True by default.

    Returns
    -------
    U, S, VT : matrices
        The factors of the decomposition, with ``a = U . diag(S) . VT``.
        When ``compute_uv`` is False, only the vector of singular values
        ``S`` is returned.
    """
    return GpuMagmaSVD(full_matrices, compute_uv)(a)
class GpuMagmaMatrixInverse(COp):
    """Computes the inverse of a matrix :math:`A` using magma library.

    The computation is implemented in C (``magma_inv.c``).  When
    ``inplace`` is true the op destroys its input (output 0 overwrites
    input 0).
    """
    __props__ = ('inplace', )
    # The C code receives the GPU context of the input as its params.
    params_type = gpu_context_type

    def __init__(self, inplace=False):
        COp.__init__(self, ['magma_inv.c'], 'APPLY_SPECIFIC(magma_inv)')
        self.inplace = inplace
        if self.inplace:
            # Tell the graph machinery that output 0 destroys input 0.
            self.destroy_map = {0: [0]}

    def c_headers(self):
        return ['gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/ext_cuda.h',
                'gpuarray_helper.h', 'magma.h']

    def c_header_dirs(self):
        # This file's directory (for gpuarray_helper.h), pygpu's headers,
        # and optionally the user-configured magma include directory.
        include_dirs = [os.path.dirname(__file__), pygpu.get_include()]
        magma_inc = config.magma.include_path
        if magma_inc:
            include_dirs.append(magma_inc)
        return include_dirs

    def c_libraries(self):
        return ['magma']

    def c_lib_dirs(self):
        magma_lib = config.magma.library_path
        return [magma_lib] if magma_lib else []

    def clone_inplace(self):
        # Build the destructive version of this op (used by the inplace
        # optimization).
        return self.__class__(inplace=True)

    def make_node(self, x):
        context = infer_context_name(x)
        x = as_gpuarray_variable(x, context)
        if x.ndim != 2:
            raise LinAlgError("Matrix rank error")
        return theano.Apply(self, [x], [x.type()])

    def get_params(self, node):
        return node.inputs[0].type.context

    def get_op_params(self):
        # Expose the inplace flag to the C code as an INPLACE define.
        return [('INPLACE', '1')] if self.inplace else []

    def infer_shape(self, node, shapes):
        # The inverse has the same shape as the input.
        return shapes
def gpu_matrix_inverse(a):
    """
    This function performs the matrix inverse on GPU.

    Parameters
    ----------
    a : matrix
        The matrix to invert.

    Returns
    -------
    a_inv: matrix
    """
    op = GpuMagmaMatrixInverse()
    return op(a)
#section init_code

setup_ext_cuda();

#section support_code_struct

// Compute the inverse of the square float32 matrix `A` with MAGMA: LU
// factorization (magma_sgetrf_gpu) followed by the inverse from the
// factors (magma_sgetri_gpu).  When INPLACE is defined, A is overwritten;
// otherwise the result goes into a copy.  Returns 0 on success, non-zero
// on failure with a Python exception set.
int APPLY_SPECIFIC(magma_inv)(PyGpuArrayObject *A, PyGpuArrayObject **A_inv,
                              PyGpuContextObject *c) {
  const size_t *dims;
  magma_int_t N, ldwork, info;
  magma_int_t *piv = NULL;   // pivot indices (host memory)
  gpudata *dwork = NULL;     // device workspace for magma_sgetri_gpu
  int res = -1;

  if (A->ga.typecode != GA_FLOAT) {
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaMatrixInverse: Unsupported data type");
    return -1;
  }

  // This is early to match the exit() in the fail label.
  cuda_enter(c->ctx);
  magma_init();

  if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaMatrixInverse: requires data to be C-contiguous");
    // BUGFIX: was `return 1;`, which skipped the magma_finalize() /
    // cuda_exit() matching the calls just above.
    goto fail;
  }
  if (PyGpuArray_NDIM(A) != 2) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaMatrixInverse: matrix rank error");
    goto fail;
  }
  dims = PyGpuArray_DIMS(A);
  if (dims[0] != dims[1]) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaMatrixInverse: matrix is not square");
    goto fail;
  }

#ifdef INPLACE
  // Reuse A's storage as the output.
  Py_XDECREF(*A_inv);
  *A_inv = A;
  Py_INCREF(*A_inv);
#else
  *A_inv = theano_try_copy(*A_inv, A);
  if (*A_inv == NULL) {
    PyErr_SetString(
        PyExc_RuntimeError,
        "GpuMagmaMatrixInverse: failed to allocate memory for the output");
    goto fail;
  }
#endif

  // magma matrix inverse
  N = dims[0];

  ldwork = N * magma_get_sgetri_nb(N);
  dwork = gpudata_alloc(c->ctx, ldwork * sizeof(float), NULL, 0, NULL);
  if (dwork == NULL) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaMatrixInverse: failed to allocate working memory");
    goto fail;
  }
  if (magma_imalloc_cpu(&piv, N)) {
    PyErr_SetString(
        PyExc_RuntimeError,
        "GpuMagmaMatrixInverse: failed to allocate memory for the pivot array");
    goto fail;
  }

  magma_sgetrf_gpu(N, N, (float *)PyGpuArray_DEV_DATA(*A_inv), N, piv, &info);
  if (info != 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaMatrixInverse: magma_sgetrf_gpu returned error %d: %s.", info,
        magma_strerror(info));
    goto fail;
  }
  // NOTE(review): `*(float **)dwork` assumes the device pointer is the
  // first member of the opaque gpudata struct (cuda backend) — confirm
  // against libgpuarray.
  magma_sgetri_gpu(N, (float *)PyGpuArray_DEV_DATA(*A_inv), N, piv,
                   *(float **)dwork, ldwork, &info);
  if (info != 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaMatrixInverse: magma_sgetri_gpu returned error %d: %s.", info,
        magma_strerror(info));
    goto fail;
  }
  res = 0;
fail:
  if (piv != NULL)
    // BUGFIX: piv was allocated with magma_imalloc_cpu (host memory), so
    // it must be released with magma_free_cpu, not magma_free (which is
    // for device allocations).
    magma_free_cpu(piv);
  if (dwork != NULL)
    gpudata_release(dwork);
  magma_finalize();
  cuda_exit(c->ctx);
  return res;
}
#section init_code

setup_ext_cuda();

#section support_code_struct

// Singular value decomposition of the float32 matrix `A` with MAGMA's
// divide-and-conquer driver magma_sgesdd.  When COMPUTE_UV is defined,
// U and VT are produced as well; FULL_MATRICES selects the full (rather
// than reduced) factors.  Returns 0 on success, non-zero on failure with
// a Python exception set.
int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
#ifdef COMPUTE_UV
                              PyGpuArrayObject **U,
#endif
                              PyGpuArrayObject **S,
#ifdef COMPUTE_UV
                              PyGpuArrayObject **VT,
#endif
                              PyGpuContextObject *c) {
  magma_int_t *iwork = NULL, iunused[1];
  magma_int_t M, N, K, ldu, ldv, M_U, N_VT, info;
  magma_vec_t jobz;
  size_t s_dims[1], u_dims[2], vt_dims[2];
  float *a_data = NULL, *s_data = NULL, *u_data = NULL, *vt_data = NULL,
        *work = NULL;
  float dummy[1];
  int res = -1, lwork;

  if (A->ga.typecode != GA_FLOAT) {
    // BUGFIX: the three error messages below said "GpuMagmaMatrixInverse"
    // (copy/paste from magma_inv.c); they now name the right op.
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaSVD: Unsupported data type");
    return -1;
  }

  // This is early to match the exit() in the fail label.
  cuda_enter(c->ctx);
  magma_init();

  if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaSVD: requires data to be C-contiguous");
    // BUGFIX: was `return 1;`, which skipped the magma_finalize() /
    // cuda_exit() matching the calls just above.
    goto fail;
  }
  if (PyGpuArray_NDIM(A) != 2) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaSVD: matrix rank error");
    goto fail;
  }

  // magma matrix svd
  // reverse dimensions because MAGMA expects column-major matrices:
  M = PyGpuArray_DIM(A, 1);
  N = PyGpuArray_DIM(A, 0);
  K = std::min(M, N);

  if (MAGMA_SUCCESS != magma_smalloc_pinned(&a_data, M * N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  // NOTE(review): a_data is pinned *host* memory, so DeviceToDevice looks
  // like the wrong kind flag (it works under CUDA unified virtual
  // addressing); cudaMemcpyDefault would be clearer — confirm.
  cudaMemcpy(a_data, PyGpuArray_DEV_DATA(A), M * N * sizeof(float),
             cudaMemcpyDeviceToDevice);

  if (MAGMA_SUCCESS != magma_smalloc_pinned(&s_data, K)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }

#ifdef COMPUTE_UV
#ifdef FULL_MATRICES
  jobz = MagmaAllVec;
#else
  jobz = MagmaSomeVec;
#endif
  M_U  = (jobz == MagmaAllVec ? M : K);
  N_VT = (jobz == MagmaAllVec ? N : K);
  ldu = M;
  ldv = N_VT;

  if (MAGMA_SUCCESS != magma_smalloc_pinned(&u_data, M_U * M)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&vt_data, N * N_VT)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
#else
  jobz = MagmaNoVec;
  ldu = M;
  ldv = N;
#endif

  // query for workspace size
  magma_sgesdd(jobz, M, N, NULL, M, NULL, NULL, ldu, NULL, ldv,
               dummy, -1, iunused, &info);

  lwork = (magma_int_t) MAGMA_S_REAL(dummy[0]);
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&work, lwork)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate working memory");
    goto fail;
  }
  // sgesdd requires an integer workspace of size 8*min(M, N).
  if (MAGMA_SUCCESS != magma_imalloc_cpu(&iwork, 8*K)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate working memory");
    goto fail;
  }

  // compute svd
  magma_sgesdd(jobz, M, N, a_data, M, s_data,
               u_data, ldu, vt_data, ldv, work, lwork, iwork, &info);
  if (info > 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaSVD: the updating process of SBDSDC did not converge (error: %d)",
        info);
    goto fail;
  } else if (info < 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaSVD: magma_sgesdd_gpu argument %d has an illegal value", -info);
    goto fail;
  }

  s_dims[0] = K;
  if (theano_prep_output(S, 1, s_dims, A->ga.typecode, GA_C_ORDER, c) != 0){
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  cudaMemcpy(PyGpuArray_DEV_DATA(*S), s_data, K * sizeof(float),
             cudaMemcpyDeviceToDevice);

#ifdef COMPUTE_UV
  u_dims[0] = N; u_dims[1] = N_VT;
  if (theano_prep_output(U, 2, u_dims, A->ga.typecode, GA_C_ORDER, c) != 0){
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  // magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U
  // to match numpy.linalg.svd output
  cudaMemcpy(PyGpuArray_DEV_DATA(*U), vt_data, N * N_VT * sizeof(float),
             cudaMemcpyDeviceToDevice);

  vt_dims[0] = M_U; vt_dims[1] = M;
  if (theano_prep_output(VT, 2, vt_dims, A->ga.typecode, GA_C_ORDER, c) != 0){
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  // magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U
  // to match numpy.linalg.svd output
  cudaMemcpy(PyGpuArray_DEV_DATA(*VT), u_data, M_U * M * sizeof(float),
             cudaMemcpyDeviceToDevice);
#endif
  res = 0;
fail:
  if (a_data != NULL)
    magma_free_pinned(a_data);
  if (s_data != NULL)
    magma_free_pinned(s_data);
  if (u_data != NULL)
    magma_free_pinned(u_data);
  if (vt_data != NULL)
    magma_free_pinned(vt_data);
  if (work != NULL)
    magma_free_pinned(work);
  if (iwork != NULL)
    magma_free_cpu(iwork);
  magma_finalize();
  cuda_exit(c->ctx);
  return res;
}
...@@ -32,6 +32,7 @@ from theano.tensor.nnet.abstract_conv import (BaseAbstractConv, ...@@ -32,6 +32,7 @@ from theano.tensor.nnet.abstract_conv import (BaseAbstractConv,
AbstractConv3d, AbstractConv3d,
AbstractConv3d_gradWeights, AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs) AbstractConv3d_gradInputs)
import theano.tensor.nlinalg as nlinalg
import theano.tensor.signal.pool as pool import theano.tensor.signal.pool as pool
import theano.tensor.slinalg as slinalg import theano.tensor.slinalg as slinalg
...@@ -71,7 +72,7 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor, ...@@ -71,7 +72,7 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
from .reduction import GpuMaxAndArgmax from .reduction import GpuMaxAndArgmax
from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky, from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky,
cusolver_available) cusolver_available, GpuMagmaMatrixInverse, GpuMagmaSVD)
_logger = logging.getLogger("theano.gpuarray.opt") _logger = logging.getLogger("theano.gpuarray.opt")
...@@ -2004,6 +2005,34 @@ def local_inplace_cholesky(node): ...@@ -2004,6 +2005,34 @@ def local_inplace_cholesky(node):
if isinstance(node.op, GpuCholesky) and not node.op.inplace: if isinstance(node.op, GpuCholesky) and not node.op.inplace:
return [GpuCholesky(lower=node.op.lower, inplace=True)(*node.inputs)] return [GpuCholesky(lower=node.op.lower, inplace=True)(*node.inputs)]
@register_opt('magma', 'fast_compile')
@op_lifter([nlinalg.MatrixInverse])
@register_opt2([theano.tensor.nlinalg.MatrixInverse], 'magma', 'fast_compile')
def local_gpu_matrix_inverse(op, context_name, inputs, outputs):
    """Lift CPU ``MatrixInverse`` nodes to ``GpuMagmaMatrixInverse``."""
    # Only applied when the user opted in through config.magma.enabled.
    if not config.magma.enabled:
        return
    return GpuMagmaMatrixInverse()
@register_inplace()
@local_optimizer([GpuMagmaMatrixInverse])
def local_inplace_matrix_inverse_inplace(node):
    """Replace ``GpuMagmaMatrixInverse`` by its destructive version."""
    if isinstance(node.op, GpuMagmaMatrixInverse):
        if not node.op.inplace:
            return [node.op.clone_inplace()(*node.inputs)]
@register_opt('magma', 'fast_compile')
@op_lifter([nlinalg.SVD])
@register_opt2([theano.tensor.nlinalg.SVD], 'magma', 'fast_compile')
def local_gpu_svd(op, context_name, inputs, outputs):
    """Lift CPU ``SVD`` nodes to ``GpuMagmaSVD``, keeping the same flags."""
    # Only applied when the user opted in through config.magma.enabled.
    if not config.magma.enabled:
        return
    return GpuMagmaSVD(full_matrices=op.full_matrices,
                       compute_uv=op.compute_uv)
# Do not register in fast_run or fast_compile. # Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled. # It will be added to fast_run if the GPU is enabled.
optdb.register('gpua_scanOp_make_inplace', optdb.register('gpua_scanOp_make_inplace',
......
from __future__ import absolute_import, division, print_function from __future__ import absolute_import, division, print_function
import unittest import unittest
import numpy as np import numpy as np
import theano from numpy.linalg.linalg import LinAlgError
import theano
from theano import config
from theano.gpuarray.linalg import (GpuCholesky, GpuMagmaMatrixInverse,
cusolver_available, gpu_matrix_inverse,
gpu_solve, gpu_svd)
from theano.tensor.nlinalg import matrix_inverse
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from .config import mode_with_gpu
from numpy.linalg.linalg import LinAlgError from .. import gpuarray_shared_constructor
from .config import mode_with_gpu, mode_without_gpu
from .test_basic_ops import rand
# Skip tests if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
from theano.gpuarray.linalg import (cusolver_available, gpu_solve, GpuCholesky)
if not cusolver_available: class TestCusolver(unittest.TestCase):
raise SkipTest('Optional package scikits.cuda.cusolver not available')
def setUp(self):
if not cusolver_available:
self.skipTest('Optional package scikits.cuda.cusolver not available')
class TestCusolver(unittest.TestCase):
def run_gpu_solve(self, A_val, x_val, A_struct=None): def run_gpu_solve(self, A_val, x_val, A_struct=None):
b_val = np.dot(A_val, x_val) b_val = np.dot(A_val, x_val)
b_val_trans = np.dot(A_val.T, x_val) b_val_trans = np.dot(A_val.T, x_val)
...@@ -117,6 +123,8 @@ class TestCusolver(unittest.TestCase): ...@@ -117,6 +123,8 @@ class TestCusolver(unittest.TestCase):
class TestGpuCholesky(unittest.TestCase): class TestGpuCholesky(unittest.TestCase):
def setUp(self): def setUp(self):
if not cusolver_available:
self.skipTest('Optional package scikits.cuda.cusolver not available')
utt.seed_rng() utt.seed_rng()
def get_gpu_cholesky_func(self, lower=True, inplace=False): def get_gpu_cholesky_func(self, lower=True, inplace=False):
...@@ -191,3 +199,97 @@ class TestGpuCholesky(unittest.TestCase): ...@@ -191,3 +199,97 @@ class TestGpuCholesky(unittest.TestCase):
A_val = -M_val.dot(M_val.T) A_val = -M_val.dot(M_val.T)
fn = self.get_gpu_cholesky_func(True, False) fn = self.get_gpu_cholesky_func(True, False)
self.assertRaises(LinAlgError, fn, A_val) self.assertRaises(LinAlgError, fn, A_val)
class TestMagma(unittest.TestCase):
    """Tests for the magma-backed GPU linear algebra ops.

    All tests are skipped unless the ``magma.enabled`` config flag is
    set.
    """

    def setUp(self):
        if not config.magma.enabled:
            self.skipTest('Magma is not enabled, skipping test')

    def test_gpu_matrix_inverse(self):
        A = theano.tensor.fmatrix("A")

        fn = theano.function([A], gpu_matrix_inverse(A), mode=mode_with_gpu)
        N = 1000
        A_val = rand(N, N).astype('float32')
        A_val_inv = fn(A_val)
        # A^-1 . A should be close to the identity (loose tolerance:
        # single precision on a 1000x1000 system).
        utt.assert_allclose(np.dot(A_val_inv, A_val), np.eye(N), atol=1e-3)

    def test_gpu_matrix_inverse_inplace(self):
        N = 1000
        A_val_gpu = gpuarray_shared_constructor(rand(N, N).astype('float32'))
        A_val_copy = A_val_gpu.get_value()
        fn = theano.function([], GpuMagmaMatrixInverse(inplace=True)(A_val_gpu),
                             mode=mode_with_gpu, accept_inplace=True)
        fn()
        # The shared variable itself now holds the inverse.
        utt.assert_allclose(np.dot(A_val_gpu.get_value(), A_val_copy),
                            np.eye(N), atol=1e-3)

    def test_gpu_matrix_inverse_inplace_opt(self):
        # The inplace optimization should rewrite the lifted
        # GpuMagmaMatrixInverse node to its destructive version.
        A = theano.tensor.fmatrix("A")
        fn = theano.function([A], matrix_inverse(A), mode=mode_with_gpu)
        assert any([
            node.op.inplace
            for node in fn.maker.fgraph.toposort() if
            isinstance(node.op, GpuMagmaMatrixInverse)
        ])

    def run_gpu_svd(self, A_val, full_matrices=True, compute_uv=True):
        A = theano.tensor.fmatrix("A")
        f = theano.function(
            [A], gpu_svd(A, full_matrices=full_matrices, compute_uv=compute_uv),
            mode=mode_with_gpu)
        return f(A_val)

    def assert_column_orthonormal(self, Ot):
        utt.assert_allclose(np.dot(Ot.T, Ot), np.eye(Ot.shape[1]))

    def check_svd(self, A, U, S, VT, rtol=None, atol=None):
        # Verify the reconstruction A = U . diag(S) . VT.
        S_m = np.zeros_like(A)
        np.fill_diagonal(S_m, S)
        utt.assert_allclose(
            np.dot(np.dot(U, S_m), VT), A, rtol=rtol, atol=atol)

    def test_gpu_svd_wide(self):
        # BUGFIX: the "wide" test previously used a 100x50 (tall) matrix
        # and the "tall" test a 50x100 (wide) one; the inputs of the two
        # tests were swapped.  A wide matrix has more columns than rows.
        A = rand(50, 100).astype('float32')
        M, N = A.shape

        U, S, VT = self.run_gpu_svd(A)
        self.assert_column_orthonormal(U)
        self.assert_column_orthonormal(VT.T)
        self.check_svd(A, U, S, VT)

        U, S, VT = self.run_gpu_svd(A, full_matrices=False)
        self.assertEqual(U.shape[1], min(M, N))
        self.assert_column_orthonormal(U)
        self.assertEqual(VT.shape[0], min(M, N))
        self.assert_column_orthonormal(VT.T)

    def test_gpu_svd_tall(self):
        # A tall matrix has more rows than columns (see note in
        # test_gpu_svd_wide about the swapped inputs).
        A = rand(100, 50).astype('float32')
        M, N = A.shape

        U, S, VT = self.run_gpu_svd(A)
        self.assert_column_orthonormal(U)
        self.assert_column_orthonormal(VT.T)
        self.check_svd(A, U, S, VT)

        U, S, VT = self.run_gpu_svd(A, full_matrices=False)
        self.assertEqual(U.shape[1], min(M, N))
        self.assert_column_orthonormal(U)
        self.assertEqual(VT.shape[0], min(M, N))
        self.assert_column_orthonormal(VT.T)

    def test_gpu_singular_values(self):
        # compute_uv=False must match the CPU implementation for both
        # wide and tall inputs.
        A = theano.tensor.fmatrix("A")
        f_cpu = theano.function(
            [A], theano.tensor.nlinalg.svd(A, compute_uv=False),
            mode=mode_without_gpu)
        f_gpu = theano.function(
            [A], gpu_svd(A, compute_uv=False), mode=mode_with_gpu)

        A_val = rand(50, 100).astype('float32')
        utt.assert_allclose(f_cpu(A_val), f_gpu(A_val))

        A_val = rand(100, 50).astype('float32')
        utt.assert_allclose(f_cpu(A_val), f_gpu(A_val))
...@@ -616,18 +616,37 @@ class SVD(Op): ...@@ -616,18 +616,37 @@ class SVD(Op):
def make_node(self, x): def make_node(self, x):
x = as_tensor_variable(x) x = as_tensor_variable(x)
assert x.ndim == 2, "The input of svd function should be a matrix." assert x.ndim == 2, "The input of svd function should be a matrix."
w = theano.tensor.matrix(dtype=x.dtype) s = theano.tensor.vector(dtype=x.dtype)
u = theano.tensor.vector(dtype=x.dtype) if self.compute_uv:
v = theano.tensor.matrix(dtype=x.dtype) u = theano.tensor.matrix(dtype=x.dtype)
return Apply(self, [x], [w, u, v]) vt = theano.tensor.matrix(dtype=x.dtype)
return Apply(self, [x], [u, s, vt])
else:
return Apply(self, [x], [s])
def perform(self, node, inputs, outputs): def perform(self, node, inputs, outputs):
(x,) = inputs (x,) = inputs
(w, u, v) = outputs
assert x.ndim == 2, "The input of svd function should be a matrix." assert x.ndim == 2, "The input of svd function should be a matrix."
w[0], u[0], v[0] = self._numop(x, if self.compute_uv:
self.full_matrices, u, s, vt = outputs
self.compute_uv) u[0], s[0], vt[0] = self._numop(x,
self.full_matrices,
self.compute_uv)
else:
s, = outputs
s[0] = self._numop(x, self.full_matrices, self.compute_uv)
def infer_shape(self, node, shapes):
x_shape, = shapes
M, N = x_shape
K = tensor.minimum(M, N)
s_shape = (K, )
if self.compute_uv:
u_shape = (M, M) if self.full_matrices else (M, K)
vt_shape = (N, N) if self.full_matrices else (K, N)
return [u_shape, s_shape, vt_shape]
else:
return [s_shape]
def svd(a, full_matrices=1, compute_uv=1): def svd(a, full_matrices=1, compute_uv=1):
......
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import unittest import unittest
import itertools
import numpy as np import numpy as np
import numpy.linalg import numpy.linalg
from numpy.testing import assert_array_almost_equal from numpy.testing import assert_array_almost_equal
...@@ -20,7 +21,7 @@ from theano.tensor.nlinalg import ( ...@@ -20,7 +21,7 @@ from theano.tensor.nlinalg import (
AllocDiag, alloc_diag, ExtractDiag, extract_diag, diag, AllocDiag, alloc_diag, ExtractDiag, extract_diag, diag,
trace, Det, det, Eig, eig, Eigh, EighGrad, eigh, trace, Det, det, Eig, eig, Eigh, EighGrad, eigh,
matrix_dot, _zero_disconnected, qr, matrix_power, matrix_dot, _zero_disconnected, qr, matrix_power,
norm, svd, TensorInv, tensorinv, tensorsolve) norm, svd, SVD, TensorInv, tensorinv, tensorsolve)
from nose.plugins.attrib import attr from nose.plugins.attrib import attr
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
...@@ -128,18 +129,46 @@ def test_qr_modes(): ...@@ -128,18 +129,46 @@ def test_qr_modes():
assert "name 'complete' is not defined" in str(e) assert "name 'complete' is not defined" in str(e)
def test_svd(): class test_SVD(utt.InferShapeTester):
rng = np.random.RandomState(utt.fetch_seed()) op_class = SVD
A = tensor.matrix("A", dtype=theano.config.floatX) dtype = 'float32'
U, V, T = svd(A)
fn = function([A], [U, V, T])
a = rng.rand(4, 4).astype(theano.config.floatX)
n_u, n_v, n_t = np.linalg.svd(a)
t_u, t_v, t_t = fn(a)
assert _allclose(n_u, t_u) def setUp(self):
assert _allclose(n_v, t_v) super(test_SVD, self).setUp()
assert _allclose(n_t, t_t) self.rng = np.random.RandomState(utt.fetch_seed())
self.A = theano.tensor.matrix(dtype=self.dtype)
self.op = svd
def test_svd(self):
A = tensor.matrix("A", dtype=self.dtype)
U, S, VT = svd(A)
fn = function([A], [U, S, VT])
a = self.rng.rand(4, 4).astype(self.dtype)
n_u, n_s, n_vt = np.linalg.svd(a)
t_u, t_s, t_vt = fn(a)
assert _allclose(n_u, t_u)
assert _allclose(n_s, t_s)
assert _allclose(n_vt, t_vt)
fn = function([A], svd(A, compute_uv=False))
t_s = fn(a)
assert _allclose(n_s, t_s)
def test_svd_infer_shape(self):
self.validate_shape((4, 4), full_matrices=True, compute_uv=True)
self.validate_shape((4, 4), full_matrices=False, compute_uv=True)
self.validate_shape((2, 4), full_matrices=False, compute_uv=True)
self.validate_shape((4, 2), full_matrices=False, compute_uv=True)
self.validate_shape((4, 4), compute_uv=False)
def validate_shape(self, shape, compute_uv=True, full_matrices=True):
A = self.A
A_v = self.rng.rand(*shape).astype(self.dtype)
outputs = self.op(A, full_matrices=full_matrices, compute_uv=compute_uv)
if not compute_uv:
outputs = [outputs]
self._compile_and_check([A], outputs, [A_v], self.op_class, warn=False)
def test_tensorsolve(): def test_tensorsolve():
...@@ -403,8 +432,7 @@ class test_Eig(utt.InferShapeTester): ...@@ -403,8 +432,7 @@ class test_Eig(utt.InferShapeTester):
super(test_Eig, self).setUp() super(test_Eig, self).setUp()
self.rng = np.random.RandomState(utt.fetch_seed()) self.rng = np.random.RandomState(utt.fetch_seed())
self.A = theano.tensor.matrix(dtype=self.dtype) self.A = theano.tensor.matrix(dtype=self.dtype)
self.X = np.asarray(self.rng.rand(5, 5), self.X = np.asarray(self.rng.rand(5, 5), dtype=self.dtype)
dtype=self.dtype)
self.S = self.X.dot(self.X.T) self.S = self.X.dot(self.X.T)
def test_infer_shape(self): def test_infer_shape(self):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论