提交 4cfa236d authored 作者: Alexander Matyasko's avatar Alexander Matyasko

Add magma matrix inverse and lifting optimization

上级 91835cc8
from __future__ import absolute_import, division, print_function from __future__ import absolute_import, division, print_function
import pkg_resources import os
import theano
import warnings import warnings
from theano import Op
from theano.gpuarray import basic_ops, GpuArrayType
import numpy as np import numpy as np
import pkg_resources
from numpy.linalg.linalg import LinAlgError from numpy.linalg.linalg import LinAlgError
import theano
from theano import Op
from theano.gof import COp
from theano.gpuarray import GpuArrayType
from .basic_ops import as_gpuarray_variable, gpu_contiguous, infer_context_name
from .type import gpu_context_type
try: try:
import pygpu import pygpu
from pygpu.basic import triu, tril from pygpu.basic import triu, tril
...@@ -94,13 +99,13 @@ class GpuCusolverSolve(Op): ...@@ -94,13 +99,13 @@ class GpuCusolverSolve(Op):
'GpuCusolverSolve Op can not be constructed.') 'GpuCusolverSolve Op can not be constructed.')
if skcuda.__version__ <= '0.5.1': if skcuda.__version__ <= '0.5.1':
warnings.warn('The GpuSolve op requires scikit-cuda > 0.5.1 to work with CUDA 8') warnings.warn('The GpuSolve op requires scikit-cuda > 0.5.1 to work with CUDA 8')
context_name = basic_ops.infer_context_name(inp1, inp2) context_name = infer_context_name(inp1, inp2)
inp1 = basic_ops.as_gpuarray_variable(inp1, context_name) inp1 = as_gpuarray_variable(inp1, context_name)
inp2 = basic_ops.as_gpuarray_variable(inp2, context_name) inp2 = as_gpuarray_variable(inp2, context_name)
inp1 = basic_ops.gpu_contiguous(inp1) inp1 = gpu_contiguous(inp1)
inp2 = basic_ops.gpu_contiguous(inp2) inp2 = gpu_contiguous(inp2)
# this op can only operate on float32 matrices # this op can only operate on float32 matrices
assert inp1.ndim == 2 assert inp1.ndim == 2
...@@ -260,11 +265,11 @@ class GpuCholesky(Op): ...@@ -260,11 +265,11 @@ class GpuCholesky(Op):
if not pygpu_available: if not pygpu_available:
raise RuntimeError('Missing pygpu or triu/tril functions.' raise RuntimeError('Missing pygpu or triu/tril functions.'
'Install or update libgpuarray.') 'Install or update libgpuarray.')
context_name = basic_ops.infer_context_name(inp) context_name = infer_context_name(inp)
inp = basic_ops.as_gpuarray_variable(inp, context_name) inp = as_gpuarray_variable(inp, context_name)
inp = basic_ops.gpu_contiguous(inp) inp = gpu_contiguous(inp)
# this op can only operate on float32 matrices # this op can only operate on float32 matrices
# because of current implementation of triu/tril. # because of current implementation of triu/tril.
...@@ -341,3 +346,46 @@ class GpuCholesky(Op): ...@@ -341,3 +346,46 @@ class GpuCholesky(Op):
def gpu_cholesky(A, lower=True): def gpu_cholesky(A, lower=True):
return GpuCholesky(lower)(A) return GpuCholesky(lower)(A)
class GpuMagmaMatrixInverse(COp):
    """Computes the inverse of a square matrix :math:`A` on the GPU using the
    magma library.

    Parameters
    ----------
    inplace : bool
        If True, the C implementation overwrites the input buffer instead of
        allocating an output (communicated to the C code via the ``INPLACE``
        macro, see :meth:`get_op_params`).
    """
    __props__ = ('inplace', )
    # The C code receives the GPU context as its `ctx` parameter.
    params_type = gpu_context_type

    def __init__(self, inplace=False):
        COp.__init__(self, ['magma_linalg.c'],
                     'APPLY_SPECIFIC(magma_matrix_inv)')
        self.inplace = inplace

    def c_headers(self):
        return ['gpuarray/array.h', 'gpuarray/blas.h', 'gpuarray_helper.h',
                'magma.h']

    def c_header_dirs(self):
        # gpuarray_helper.h lives next to this module; pygpu ships the rest.
        return [os.path.dirname(__file__), pygpu.get_include()]

    def c_libraries(self):
        return ['magma']

    def make_node(self, x):
        """Validate the input and build the Apply node.

        Raises
        ------
        LinAlgError
            If `x` is not a 2-D variable.
        TypeError
            If `x` is not float32 — the C implementation uses the single
            precision magma routines (sgetrf/sgetri) only.
        """
        if x.ndim != 2:
            raise LinAlgError("Matrix rank error")
        context_name = infer_context_name(x)
        x = as_gpuarray_variable(x, context_name)
        # Reject unsupported dtypes early with a clear error instead of
        # letting the float32-only C kernel produce garbage.
        if x.dtype != 'float32':
            raise TypeError("GpuMagmaMatrixInverse only supports float32, "
                            "got %s" % x.dtype)
        return theano.Apply(self, [x], [x.type()])

    def get_params(self, node):
        return node.inputs[0].type.context

    def get_op_params(self):
        # Define the INPLACE macro for the C code when requested.
        if self.inplace:
            return [('INPLACE', '1')]
        else:
            return []

    def infer_shape(self, node, shapes):
        # The inverse has the same shape as the input matrix.
        return shapes


# Module-level convenience instance (non-inplace).
gpu_matrix_inverse = GpuMagmaMatrixInverse()
#section support_code_struct

/* Per-node scratch buffers, allocated in the op body:
   dwork is magma device memory (magma_smalloc), piv is host memory (malloc). */
float *APPLY_SPECIFIC(dwork);
magma_int_t *APPLY_SPECIFIC(piv);

#section init_code_struct

APPLY_SPECIFIC(dwork) = NULL;
APPLY_SPECIFIC(piv) = NULL;

#section cleanup_code_struct

/* Free each buffer with the deallocator matching its allocator:
   magma_free for magma_smalloc'd device memory, free for malloc'd memory. */
if (APPLY_SPECIFIC(dwork) != NULL) { magma_free(APPLY_SPECIFIC(dwork)); }
if (APPLY_SPECIFIC(piv) != NULL) { free(APPLY_SPECIFIC(piv)); }

#section support_code_struct

/* Invert the square, C-contiguous float32 matrix A on the GPU with magma:
   LU factorization (magma_sgetrf_gpu) followed by inversion from the factors
   (magma_sgetri_gpu). On success, *_A_inv holds the inverse and 0 is
   returned; on failure, a Python exception is set and 1 is returned. */
int APPLY_SPECIFIC(magma_matrix_inv)(PyGpuArrayObject *A,
                                     PyGpuArrayObject **_A_inv,
                                     PyGpuContextObject *ctx) {
  PyGpuArrayObject *A_inv = *_A_inv;

  if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaMatrixInverse: requires data to be C-contiguous");
    return 1;
  }
  if (PyGpuArray_NDIM(A) != 2) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaMatrixInverse: matrix rank error");
    return 1;
  }
  const size_t *x_dims = PyGpuArray_DIMS(A);
  if (x_dims[0] != x_dims[1]) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaMatrixInverse: matrix is not square");
    return 1;
  }
#ifdef INPLACE
  /* Reuse the input buffer as the output: drop our reference to any stale
     output, alias the input and take a new reference on it. */
  Py_XDECREF(A_inv);
  A_inv = A;
  Py_INCREF(A_inv);
#else
  A_inv = theano_try_copy(A_inv, A);
  if (A_inv == NULL) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaMatrixInverse: failed to allocate memory");
    return 1;
  }
#endif
  {
    // magma matrix inverse
    magma_init();
    magma_int_t ldwork, info;
    magma_int_t N = x_dims[0];

    /* Release buffers from a previous invocation so repeated executions of
       this node do not leak (the cleanup section only runs on destruction). */
    if (APPLY_SPECIFIC(dwork) != NULL) {
      magma_free(APPLY_SPECIFIC(dwork));
      APPLY_SPECIFIC(dwork) = NULL;
    }
    if (APPLY_SPECIFIC(piv) != NULL) {
      free(APPLY_SPECIFIC(piv));
      APPLY_SPECIFIC(piv) = NULL;
    }

    ldwork = N * magma_get_sgetri_nb(N);
    if (magma_smalloc(&APPLY_SPECIFIC(dwork), ldwork)) {
      PyErr_SetString(
          PyExc_RuntimeError,
          "GpuMagmaMatrixInverse: failed to allocate magma working memory");
      magma_finalize();
      return 1;
    }
    APPLY_SPECIFIC(piv) = (magma_int_t *)malloc(N * sizeof(magma_int_t));
    if (APPLY_SPECIFIC(piv) == NULL) {
      PyErr_SetString(
          PyExc_RuntimeError,
          "GpuMagmaMatrixInverse: failed to allocate pivot array");
      magma_finalize();
      return 1;
    }

    float *A_ptr = (float *) ((void **)A_inv->ga.data)[0];

    /* sgetri inverts from LU factors, so the matrix must be factorized with
       sgetrf first (which also fills the pivot array). The buffer is
       row-major, so magma sees A^T; since inv(A^T) = inv(A)^T, reading the
       column-major result back as row-major yields inv(A). */
    magma_sgetrf_gpu(N, N, A_ptr, N, APPLY_SPECIFIC(piv), &info);
    if (info != 0) {
      PyErr_Format(
          PyExc_RuntimeError,
          "GpuMagmaMatrixInverse: magma_sgetrf_gpu returned error %d", info);
      magma_finalize();
      return 1;
    }
    magma_sgetri_gpu(N, A_ptr, N, APPLY_SPECIFIC(piv), APPLY_SPECIFIC(dwork),
                     ldwork, &info);
    if (info != 0) {
      PyErr_Format(
          PyExc_RuntimeError,
          "GpuMagmaMatrixInverse: magma_sgetri_gpu returned error %d", info);
      magma_finalize();
      return 1;
    }
    magma_finalize();
  }
  *_A_inv = A_inv;
  return 0;
}
...@@ -32,6 +32,7 @@ from theano.tensor.nnet.abstract_conv import (BaseAbstractConv, ...@@ -32,6 +32,7 @@ from theano.tensor.nnet.abstract_conv import (BaseAbstractConv,
AbstractConv3d, AbstractConv3d,
AbstractConv3d_gradWeights, AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs) AbstractConv3d_gradInputs)
import theano.tensor.nlinalg as nlinalg
import theano.tensor.signal.pool as pool import theano.tensor.signal.pool as pool
import theano.tensor.slinalg as slinalg import theano.tensor.slinalg as slinalg
...@@ -71,7 +72,7 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor, ...@@ -71,7 +72,7 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
from .reduction import GpuMaxAndArgmax from .reduction import GpuMaxAndArgmax
from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky, from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky,
cusolver_available) cusolver_available, GpuMagmaMatrixInverse)
_logger = logging.getLogger("theano.gpuarray.opt") _logger = logging.getLogger("theano.gpuarray.opt")
...@@ -2004,6 +2005,17 @@ def local_inplace_cholesky(node): ...@@ -2004,6 +2005,17 @@ def local_inplace_cholesky(node):
if isinstance(node.op, GpuCholesky) and not node.op.inplace: if isinstance(node.op, GpuCholesky) and not node.op.inplace:
return [GpuCholesky(lower=node.op.lower, inplace=True)(*node.inputs)] return [GpuCholesky(lower=node.op.lower, inplace=True)(*node.inputs)]
@register_opt('fast_compile')
@op_lifter([nlinalg.MatrixInverse])
@register_opt2([theano.tensor.nlinalg.MatrixInverse], 'fast_compile')
def local_gpu_matrix_inverse(op, context_name, inputs, outputs):
    """Lift a CPU MatrixInverse node to GpuMagmaMatrixInverse."""
    # NOTE(review): the original body set `magma_available = True` and then
    # bailed out `if not magma_available` — a hard-coded, unreachable guard.
    # There is currently no real run-time check that magma is available;
    # one should be added (e.g. a `magma_available` flag from .linalg)
    # before this lifter is enabled by default.
    return GpuMagmaMatrixInverse()
# Do not register in fast_run or fast_compile. # Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled. # It will be added to fast_run if the GPU is enabled.
optdb.register('gpua_scanOp_make_inplace', optdb.register('gpua_scanOp_make_inplace',
......
...@@ -10,14 +10,16 @@ from .config import mode_with_gpu ...@@ -10,14 +10,16 @@ from .config import mode_with_gpu
from numpy.linalg.linalg import LinAlgError from numpy.linalg.linalg import LinAlgError
# Skip tests if cuda_ndarray is not available. # Skip tests if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest from theano.gpuarray.linalg import (cusolver_available, gpu_solve, GpuCholesky,
from theano.gpuarray.linalg import (cusolver_available, gpu_solve, GpuCholesky) gpu_matrix_inverse)
if not cusolver_available:
raise SkipTest('Optional package scikits.cuda.cusolver not available')
class TestCusolver(unittest.TestCase): class TestCusolver(unittest.TestCase):
def setUp(self):
if not cusolver_available:
self.skipTest('Optional package scikits.cuda.cusolver not available')
def run_gpu_solve(self, A_val, x_val, A_struct=None): def run_gpu_solve(self, A_val, x_val, A_struct=None):
b_val = np.dot(A_val, x_val) b_val = np.dot(A_val, x_val)
b_val_trans = np.dot(A_val.T, x_val) b_val_trans = np.dot(A_val.T, x_val)
...@@ -117,6 +119,8 @@ class TestCusolver(unittest.TestCase): ...@@ -117,6 +119,8 @@ class TestCusolver(unittest.TestCase):
class TestGpuCholesky(unittest.TestCase): class TestGpuCholesky(unittest.TestCase):
def setUp(self): def setUp(self):
if not cusolver_available:
self.skipTest('Optional package scikits.cuda.cusolver not available')
utt.seed_rng() utt.seed_rng()
def get_gpu_cholesky_func(self, lower=True, inplace=False): def get_gpu_cholesky_func(self, lower=True, inplace=False):
...@@ -191,3 +195,14 @@ class TestGpuCholesky(unittest.TestCase): ...@@ -191,3 +195,14 @@ class TestGpuCholesky(unittest.TestCase):
A_val = -M_val.dot(M_val.T) A_val = -M_val.dot(M_val.T)
fn = self.get_gpu_cholesky_func(True, False) fn = self.get_gpu_cholesky_func(True, False)
self.assertRaises(LinAlgError, fn, A_val) self.assertRaises(LinAlgError, fn, A_val)
class TestMagma(unittest.TestCase):
    def test_gpu_matrix_inverse(self):
        """gpu_matrix_inverse(A) matmul A should be close to the identity."""
        matrix = theano.tensor.fmatrix("A")
        invert_fn = theano.function([matrix], gpu_matrix_inverse(matrix),
                                    mode=mode_with_gpu)
        size = 1000
        sample = np.random.rand(size, size).astype(np.float32)
        inverted = invert_fn(sample)
        # Loose tolerance: a 1000x1000 float32 inversion accumulates error.
        utt.assert_allclose(np.dot(inverted, sample), np.eye(size), atol=1e-3)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论