Commit 06addd9c authored by Frédéric Bastien, committed by GitHub

Merge pull request #5799 from tfjgeorge/gpu_cholesky

Gpu cholesky
......@@ -12,8 +12,10 @@ from numpy.linalg.linalg import LinAlgError
try:
import pygpu
from pygpu.basic import triu, tril
pygpu_available = True
except ImportError:
pass
pygpu_available = False
cusolver_available = False
try:
......@@ -52,6 +54,13 @@ if cusolver_available:
cusolver.cusolverCheckStatus(status)
def attach_cusolver_handle_to_context(ctx):
    """Ensure `ctx` carries a cuSOLVER handle, creating one lazily.

    The handle is created at most once per context and cached on the
    context object under the ``cusolver_handle`` attribute.
    """
    if getattr(ctx, 'cusolver_handle', None) is None:
        # Handle creation must happen with this context made current.
        with ctx:
            ctx.cusolver_handle = cusolver.cusolverDnCreate()
class GpuCusolverSolve(Op):
"""
CUSOLVER GPU solver OP.
......@@ -101,10 +110,7 @@ class GpuCusolverSolve(Op):
def prepare_node(self, node, storage_map, compute_map, impl):
ctx = node.inputs[0].type.context
handle = getattr(ctx, 'cusolver_handle', None)
if handle is None:
with ctx:
ctx.cusolver_handle = cusolver.cusolverDnCreate()
attach_cusolver_handle_to_context(ctx)
def check_dev_info(self, dev_info):
val = np.asarray(dev_info)[0]
......@@ -212,3 +218,120 @@ class GpuCusolverSolve(Op):
def gpu_solve(A, b, A_structure='general', trans='N'):
    """Solve ``A x = b`` on the GPU via cuSOLVER.

    Thin convenience wrapper that builds and applies a
    :class:`GpuCusolverSolve` op.
    """
    solve_op = GpuCusolverSolve(A_structure, trans)
    return solve_op(A, b)
class GpuCholesky(Op):
    """
    CUSOLVER GPU Cholesky Op.

    Given a real positive definite matrix `A` returns either a lower
    triangular matrix `L` such that `A == dot(L, L.T)` if `lower == True`
    else returns an upper triangular matrix `U` such that `A == dot(U.T, U)`
    if `lower == False`.

    Parameters
    ----------
    lower
        Whether to return a lower rather than upper triangular decomposition.

    """

    __props__ = ('lower', 'inplace')

    def __init__(self, lower=True, inplace=False):
        self.lower = lower
        self.inplace = inplace
        if self.inplace:
            # The factorization overwrites the buffer of input 0.
            self.destroy_map = {0: [0]}
        super(GpuCholesky, self).__init__()

    def make_node(self, inp):
        if not cusolver_available:
            raise RuntimeError('CUSOLVER is not available and '
                               'GpuCholesky Op can not be constructed.')
        # Comparing version strings lexicographically is wrong (e.g.
        # '0.10.0' <= '0.5.1' is True); parse the versions before comparing.
        from distutils.version import LooseVersion
        if LooseVersion(skcuda.__version__) <= LooseVersion('0.5.1'):
            # Fixed message: it previously referred to the GpuSolve op.
            warnings.warn('The GpuCholesky op requires scikit-cuda > 0.5.1 '
                          'to work with CUDA 8')
        if not pygpu_available:
            raise RuntimeError('Missing pygpu or triu/tril functions.'
                               'Install or update libgpuarray.')
        context_name = basic_ops.infer_context_name(inp)

        inp = basic_ops.as_gpuarray_variable(inp, context_name)

        inp = basic_ops.gpu_contiguous(inp)

        # this op can only operate on float32 matrices
        # because of current implementation of triu/tril.
        # TODO: support float64 for triu/tril in GpuArray and for
        # GpuCholesky/GpuCusolverSolve in Theano.
        assert inp.ndim == 2
        assert inp.dtype == 'float32'

        return theano.Apply(self, [inp], [inp.type()])

    def prepare_node(self, node, storage_map, compute_map, impl):
        # Lazily create (at most once per context) the cuSOLVER handle.
        ctx = node.inputs[0].type.context
        attach_cusolver_handle_to_context(ctx)

    def perform(self, node, inputs, outputs):
        context = inputs[0][0].context

        # Input matrix.
        A = inputs[0]

        l, n = A.shape
        if l != n:
            raise ValueError('A must be a square matrix')

        lda = max(1, n)

        # cusolver operates on F ordered matrices, but A is expected
        # to be symmetric so it does not matter.
        # We copy A if needed
        if self.inplace:
            L = A
        else:
            L = pygpu.array(A, copy=True)

        # The output matrix will contain only the upper or lower
        # triangular factorization of A. If L is C ordered (it
        # probably is as it is the default in Theano) we just switch
        # the fill mode parameter of cusolver
        l_parameter = 0 if self.lower else 1
        if L.flags['C_CONTIGUOUS']:
            l_parameter = 1 - l_parameter

        L_ptr = L.gpudata

        with context:
            workspace_size = cusolver.cusolverDnSpotrf_bufferSize(
                context.cusolver_handle, l_parameter, n, L_ptr, lda)

            workspace = pygpu.zeros(workspace_size, dtype='float32',
                                    context=context)

            dev_info = pygpu.zeros((1,), dtype='int32', context=context)

            workspace_ptr = workspace.gpudata
            dev_info_ptr = dev_info.gpudata

            cusolver.cusolverDnSpotrf(
                context.cusolver_handle, l_parameter, n, L_ptr, lda,
                workspace_ptr, workspace_size, dev_info_ptr)

            # dev_info > 0 means the leading minor of that order is not
            # positive definite, i.e. A has no Cholesky factorization.
            val_dev_info = np.asarray(dev_info)[0]
            if val_dev_info > 0:
                raise LinAlgError('Cholesky decomposition failed (is A SPD?)')

        # cusolver leaves the elements in the matrix outside the considered
        # upper or lower triangle unchanged, so we need to put zeros outside
        # the triangle
        if self.lower:
            tril(L)
        else:
            triu(L)

        outputs[0][0] = L
def gpu_cholesky(A, lower=True):
    """Return the GPU Cholesky factor of `A` (lower-triangular by default)."""
    chol_op = GpuCholesky(lower=lower)
    return chol_op(A)
......@@ -70,7 +70,7 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedIncSubtensor1_dev20)
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
from .reduction import GpuMaxAndArgmax
from .linalg import (GpuCusolverSolve, cusolver_available)
from .linalg import (GpuCusolverSolve, GpuCholesky, cusolver_available)
_logger = logging.getLogger("theano.gpuarray.opt")
......@@ -1967,6 +1967,23 @@ def local_gpu_solve(op, context_name, inputs, outputs):
return
return GpuCusolverSolve()
# Lift CPU Cholesky ops to their GPU (cuSOLVER) counterpart.
@register_opt('fast_compile')
@op_lifter([slinalg.Cholesky])
@register_opt2([theano.tensor.slinalg.Cholesky], 'fast_compile')
def local_gpu_cholesky(op, context_name, inputs, outputs):
    """Replace a CPU Cholesky by GpuCholesky when cuSOLVER is usable."""
    if cusolver_available:
        return GpuCholesky(lower=op.lower, inplace=op.destructive)
    # Returning None keeps the original CPU op in the graph.
@register_inplace()
@local_optimizer([GpuCholesky], inplace=True)
def local_inplace_cholesky(node):
    """Swap a non-inplace GpuCholesky for its destructive variant."""
    op = node.op
    if isinstance(op, GpuCholesky) and not op.inplace:
        inplace_op = GpuCholesky(lower=op.lower, inplace=True)
        return [inplace_op(*node.inputs)]
# Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled.
optdb.register('gpua_scanOp_make_inplace',
......
......@@ -11,7 +11,7 @@ from numpy.linalg.linalg import LinAlgError
# Skip tests if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
from theano.gpuarray.linalg import (cusolver_available, gpu_solve)
from theano.gpuarray.linalg import (cusolver_available, gpu_solve, GpuCholesky)
if not cusolver_available:
raise SkipTest('Optional package scikits.cuda.cusolver not available')
......@@ -112,3 +112,82 @@ class TestCusolver(unittest.TestCase):
fn = theano.function([A, b], [solver], mode=mode_with_gpu)
self.assertRaises(LinAlgError, fn, A_val, x_val)
class TestGpuCholesky(unittest.TestCase):
    """Unit tests for the GpuCholesky op (requires cuSOLVER and a GPU)."""

    def setUp(self):
        # Deterministic RNG so the random test matrices are reproducible.
        utt.seed_rng()

    def get_gpu_cholesky_func(self, lower=True, inplace=False):
        # Helper function to compile function from GPU Cholesky op.
        A = theano.tensor.matrix("A", dtype="float32")
        cholesky_op = GpuCholesky(lower=lower, inplace=inplace)
        chol_A = cholesky_op(A)
        return theano.function([A], chol_A, accept_inplace=inplace, mode=mode_with_gpu)

    def compare_gpu_cholesky_to_np(self, A_val, lower=True, inplace=False):
        # Helper function to compare op output to np.cholesky output.
        chol_A_val = np.linalg.cholesky(A_val)
        if not lower:
            # np.linalg.cholesky returns the lower factor; transpose
            # to get the upper factor for comparison.
            chol_A_val = chol_A_val.T
        fn = self.get_gpu_cholesky_func(lower, inplace)
        res = fn(A_val)
        chol_A_res = np.array(res)
        utt.assert_allclose(chol_A_res, chol_A_val)

    def test_invalid_input_fail_non_square(self):
        # Invalid Cholesky input test with non-square matrix as input.
        A_val = np.random.normal(size=(3, 2)).astype("float32")
        fn = self.get_gpu_cholesky_func(True, False)
        self.assertRaises(ValueError, fn, A_val)

    def test_invalid_input_fail_vector(self):
        # Invalid Cholesky input test with vector as input.
        def invalid_input_func():
            A = theano.tensor.vector("A", dtype="float32")
            GpuCholesky(lower=True, inplace=False)(A)
        self.assertRaises(AssertionError, invalid_input_func)

    def test_invalid_input_fail_tensor3(self):
        # Invalid Cholesky input test with 3D tensor as input.
        def invalid_input_func():
            A = theano.tensor.tensor3("A", dtype="float32")
            GpuCholesky(lower=True, inplace=False)(A)
        self.assertRaises(AssertionError, invalid_input_func)

    def test_diag_chol(self):
        # Diagonal matrix input Cholesky test.
        for lower in [True, False]:
            for inplace in [True, False]:
                # make sure all diagonal elements are positive so positive-definite
                A_val = np.diag(np.random.uniform(size=5).astype("float32") + 1)
                self.compare_gpu_cholesky_to_np(A_val, lower=lower, inplace=inplace)

    def test_dense_chol_lower(self):
        # Dense matrix input lower-triangular Cholesky test.
        for lower in [True, False]:
            for inplace in [True, False]:
                M_val = np.random.normal(size=(3, 3)).astype("float32")
                # A = M.dot(M.T) is positive definite for all non-singular M
                A_val = M_val.dot(M_val.T)
                self.compare_gpu_cholesky_to_np(A_val, lower=lower, inplace=inplace)

    def test_invalid_input_fail_non_symmetric(self):
        # Invalid Cholesky input test with non-symmetric input.
        # (Non-symmetric real input must also be non-positive definite).
        A_val = None
        while True:
            # Loop until a genuinely non-symmetric sample is drawn.
            A_val = np.random.normal(size=(3, 3)).astype("float32")
            if not np.allclose(A_val, A_val.T):
                break
        fn = self.get_gpu_cholesky_func(True, False)
        self.assertRaises(LinAlgError, fn, A_val)

    def test_invalid_input_fail_negative_definite(self):
        # Invalid Cholesky input test with negative-definite input.
        M_val = np.random.normal(size=(3, 3)).astype("float32")
        # A = -M.dot(M.T) will be negative definite for all non-singular M
        A_val = -M_val.dot(M_val.T)
        fn = self.get_gpu_cholesky_func(True, False)
        self.assertRaises(LinAlgError, fn, A_val)
......@@ -17,7 +17,7 @@ from ..basic_ops import (
from ..blas import GpuGemm
from ..elemwise import GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise
from ..subtensor import GpuSubtensor
from ..linalg import GpuCusolverSolve, cusolver_available
from ..linalg import GpuCusolverSolve, cusolver_available, GpuCholesky
from .config import mode_with_gpu, mode_without_gpu, test_ctx_name, SkipTest
......@@ -573,7 +573,7 @@ def test_local_lift_solve():
A = tensor.fmatrix()
b = tensor.fmatrix()
o = slinalg.solve(A, b)
f_cpu = theano.function([A, b], o)
f_cpu = theano.function([A, b], o, mode_without_gpu)
f_gpu = theano.function([A, b], o, mode=mode_with_gpu)
assert not any(isinstance(n.op, slinalg.Solve)
for n in f_gpu.maker.fgraph.apply_nodes)
......@@ -584,6 +584,43 @@ def test_local_lift_solve():
utt.assert_allclose(f_cpu(A_val, b_val), f_gpu(A_val, b_val))
def test_local_lift_cholesky():
    """Check that a CPU Cholesky graph is lifted to an inplace GpuCholesky."""
    if not cusolver_available:
        raise SkipTest('No cuSolver')
    A = tensor.fmatrix()
    o = slinalg.cholesky(A)
    f_cpu = theano.function([A], o, mode=mode_without_gpu)
    f_gpu = theano.function([A], o, mode=mode_with_gpu)
    gpu_ops = [n.op for n in f_gpu.maker.fgraph.apply_nodes]
    # No CPU Cholesky op may survive in the GPU-compiled graph.
    assert all(not isinstance(op, slinalg.Cholesky) for op in gpu_ops)
    # The lifted op should be inplace: its input is not reused by any other op.
    assert any(isinstance(op, GpuCholesky) and op.inplace for op in gpu_ops)
    M_val = np.random.normal(size=(3, 3)).astype("float32")
    # M.dot(M.T) is positive definite for all non-singular M.
    A_val = M_val.dot(M_val.T)
    utt.assert_allclose(f_cpu(A_val), f_gpu(A_val))
def test_gpu_cholesky_not_inplace():
    """A GpuCholesky whose input is shared with another op must not be inplace."""
    if not cusolver_available:
        raise SkipTest('No cuSolver')
    A = tensor.fmatrix()
    A_squared = A**2
    B = slinalg.cholesky(A_squared)
    D = B + A_squared
    f_cpu = theano.function([A], D, mode=mode_without_gpu)
    f_gpu = theano.function([A], D, mode=mode_with_gpu)
    # A_squared feeds both the Cholesky and the addition, so the GPU op must
    # keep a non-destructive copy of its input.
    non_inplace_nodes = [n.op for n in f_gpu.maker.fgraph.apply_nodes
                         if isinstance(n.op, GpuCholesky) and not n.op.inplace]
    assert len(non_inplace_nodes) == 1, len(non_inplace_nodes)
    M_val = np.random.normal(size=(3, 3)).astype("float32")
    # M.dot(M.T) is positive definite for all non-singular M.
    A_val = M_val.dot(M_val.T)
    utt.assert_allclose(f_cpu(A_val), f_gpu(A_val))
def test_local_gpua_advanced_incsubtensor():
# test a corner case reported at gh-5589
target = tensor.ftensor4()
......
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment