Commit 06addd9c authored by Frédéric Bastien, committed via GitHub

Merge pull request #5799 from tfjgeorge/gpu_cholesky

Gpu cholesky
...@@ -12,8 +12,10 @@ from numpy.linalg.linalg import LinAlgError ...@@ -12,8 +12,10 @@ from numpy.linalg.linalg import LinAlgError
try: try:
import pygpu import pygpu
from pygpu.basic import triu, tril
pygpu_available = True
except ImportError: except ImportError:
pass pygpu_available = False
cusolver_available = False cusolver_available = False
try: try:
...@@ -52,6 +54,13 @@ if cusolver_available: ...@@ -52,6 +54,13 @@ if cusolver_available:
cusolver.cusolverCheckStatus(status) cusolver.cusolverCheckStatus(status)
def attach_cusolver_handle_to_context(ctx):
    """Lazily create and cache a cuSOLVER handle on the GPU context *ctx*.

    The handle is stored as ``ctx.cusolver_handle``.  When a handle is
    already attached this is a no-op, so the function is safe to call
    repeatedly (e.g. from several ops' ``prepare_node``).
    """
    if getattr(ctx, 'cusolver_handle', None) is not None:
        return
    # The handle must be created while the target GPU context is active.
    with ctx:
        ctx.cusolver_handle = cusolver.cusolverDnCreate()
class GpuCusolverSolve(Op): class GpuCusolverSolve(Op):
""" """
CUSOLVER GPU solver OP. CUSOLVER GPU solver OP.
...@@ -101,10 +110,7 @@ class GpuCusolverSolve(Op): ...@@ -101,10 +110,7 @@ class GpuCusolverSolve(Op):
def prepare_node(self, node, storage_map, compute_map, impl): def prepare_node(self, node, storage_map, compute_map, impl):
ctx = node.inputs[0].type.context ctx = node.inputs[0].type.context
handle = getattr(ctx, 'cusolver_handle', None) attach_cusolver_handle_to_context(ctx)
if handle is None:
with ctx:
ctx.cusolver_handle = cusolver.cusolverDnCreate()
def check_dev_info(self, dev_info): def check_dev_info(self, dev_info):
val = np.asarray(dev_info)[0] val = np.asarray(dev_info)[0]
...@@ -212,3 +218,120 @@ class GpuCusolverSolve(Op): ...@@ -212,3 +218,120 @@ class GpuCusolverSolve(Op):
def gpu_solve(A, b, A_structure='general', trans='N'): def gpu_solve(A, b, A_structure='general', trans='N'):
return GpuCusolverSolve(A_structure, trans)(A, b) return GpuCusolverSolve(A_structure, trans)(A, b)
class GpuCholesky(Op):
    """
    CUSOLVER GPU Cholesky Op.

    Given a real positive definite matrix `A` returns either a lower
    triangular matrix `L` such that `A == dot(L, L.T)` if `lower == True`
    else returns an upper triangular matrix `U` such that `A == dot(U.T, U)`
    if `lower == False`.

    Parameters
    ----------
    lower
        Whether to return a lower rather than upper triangular decomposition.

    """

    # lower: which triangular factor is produced.
    # inplace: when True the input storage is overwritten with the result.
    __props__ = ('lower', 'inplace')

    def __init__(self, lower=True, inplace=False):
        self.lower = lower
        self.inplace = inplace
        if self.inplace:
            # Tell Theano that output 0 destroys input 0 so the
            # optimizer never reuses the clobbered input elsewhere.
            self.destroy_map = {0: [0]}
        super(GpuCholesky, self).__init__()

    def make_node(self, inp):
        # Build the Apply node; fails early when the required GPU
        # libraries are not importable.
        if not cusolver_available:
            raise RuntimeError('CUSOLVER is not available and '
                               'GpuCholesky Op can not be constructed.')
        if skcuda.__version__ <= '0.5.1':
            warnings.warn('The GpuSolve op requires scikit-cuda > 0.5.1 to work with CUDA 8')
        if not pygpu_available:
            raise RuntimeError('Missing pygpu or triu/tril functions.'
                               'Install or update libgpuarray.')
        context_name = basic_ops.infer_context_name(inp)

        inp = basic_ops.as_gpuarray_variable(inp, context_name)

        # cuSOLVER needs a contiguous buffer to read the matrix from.
        inp = basic_ops.gpu_contiguous(inp)

        # this op can only operate on float32 matrices
        # because of current implementation of triu/tril.
        # TODO: support float64 for triu/tril in GpuArray and for GpuCholesky/GpuCusolverSolve in Theano.
        assert inp.ndim == 2
        assert inp.dtype == 'float32'

        return theano.Apply(self, [inp], [inp.type()])

    def prepare_node(self, node, storage_map, compute_map, impl):
        # Make sure the context carries a cached cuSOLVER handle before
        # the first call to perform().
        ctx = node.inputs[0].type.context
        attach_cusolver_handle_to_context(ctx)

    def perform(self, node, inputs, outputs):
        # NOTE(review): the context is read off the first row of the
        # input GpuArray (inputs[0][0]) rather than the array itself —
        # presumably both carry the same .context; confirm intent.
        context = inputs[0][0].context

        # Input matrix.
        A = inputs[0]

        l, n = A.shape
        if l != n:
            raise ValueError('A must be a square matrix')

        # Leading dimension passed to cuSOLVER; max(1, n) guards the
        # degenerate n == 0 case.
        lda = max(1, n)

        # cusolver operates on F ordered matrices, but A is expected
        # to be symmetric so it does not matter.
        # We copy A if needed
        if self.inplace:
            L = A
        else:
            L = pygpu.array(A, copy=True)

        # The output matrix will contain only the upper or lower
        # triangular factorization of A. If L is C ordered (it
        # probably is as it is the default in Theano) we just switch
        # the fill mode parameter of cusolver
        l_parameter = 0 if self.lower else 1
        if L.flags['C_CONTIGUOUS']:
            l_parameter = 1 - l_parameter

        L_ptr = L.gpudata

        with context:
            # Query the scratch-space size required by Spotrf, then
            # allocate workspace and the device-side status word.
            workspace_size = cusolver.cusolverDnSpotrf_bufferSize(
                context.cusolver_handle, l_parameter, n, L_ptr, lda)

            workspace = pygpu.zeros(workspace_size, dtype='float32',
                                    context=context)

            dev_info = pygpu.zeros((1,), dtype='int32', context=context)

            workspace_ptr = workspace.gpudata
            dev_info_ptr = dev_info.gpudata

            # In-place factorization of the selected triangle of L.
            cusolver.cusolverDnSpotrf(
                context.cusolver_handle, l_parameter, n, L_ptr, lda, workspace_ptr,
                workspace_size, dev_info_ptr)

            # dev_info > 0 signals that the factorization failed,
            # i.e. the matrix was not positive definite.
            val_dev_info = np.asarray(dev_info)[0]
            if val_dev_info > 0:
                raise LinAlgError('Cholesky decomposition failed (is A SPD?)')

        # cusolver leaves the elements in the matrix outside the considered
        # upper or lower triangle unchanged, so we need to put zeros outside
        # the triangle
        if self.lower:
            tril(L)
        else:
            triu(L)

        outputs[0][0] = L
def gpu_cholesky(A, lower=True):
    """Convenience wrapper: apply a ``GpuCholesky`` op to matrix `A`.

    Returns the lower triangular factor by default; pass
    ``lower=False`` for the upper one.
    """
    chol_op = GpuCholesky(lower=lower)
    return chol_op(A)
...@@ -70,7 +70,7 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor, ...@@ -70,7 +70,7 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedIncSubtensor1_dev20) GpuAdvancedIncSubtensor1_dev20)
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
from .reduction import GpuMaxAndArgmax from .reduction import GpuMaxAndArgmax
from .linalg import (GpuCusolverSolve, cusolver_available) from .linalg import (GpuCusolverSolve, GpuCholesky, cusolver_available)
_logger = logging.getLogger("theano.gpuarray.opt") _logger = logging.getLogger("theano.gpuarray.opt")
...@@ -1967,6 +1967,23 @@ def local_gpu_solve(op, context_name, inputs, outputs): ...@@ -1967,6 +1967,23 @@ def local_gpu_solve(op, context_name, inputs, outputs):
return return
return GpuCusolverSolve() return GpuCusolverSolve()
# Cholesky decomposition
@register_opt('fast_compile')
@op_lifter([slinalg.Cholesky])
@register_opt2([theano.tensor.slinalg.Cholesky], 'fast_compile')
def local_gpu_cholesky(op, context_name, inputs, outputs):
    # Lift a CPU Cholesky node to its GPU counterpart, preserving the
    # triangle choice and destructiveness; keep the CPU op when
    # cuSOLVER is unavailable.
    if cusolver_available:
        return GpuCholesky(lower=op.lower, inplace=op.destructive)
    return None
@register_inplace()
@local_optimizer([GpuCholesky], inplace=True)
def local_inplace_cholesky(node):
    # Swap a non-destructive GpuCholesky for its inplace variant; the
    # inplace registry validates that the substitution is legal.
    op = node.op
    if not isinstance(op, GpuCholesky) or op.inplace:
        return None
    inplace_op = GpuCholesky(lower=op.lower, inplace=True)
    return [inplace_op(*node.inputs)]
# Do not register in fast_run or fast_compile. # Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled. # It will be added to fast_run if the GPU is enabled.
optdb.register('gpua_scanOp_make_inplace', optdb.register('gpua_scanOp_make_inplace',
......
...@@ -11,7 +11,7 @@ from numpy.linalg.linalg import LinAlgError ...@@ -11,7 +11,7 @@ from numpy.linalg.linalg import LinAlgError
# Skip tests if cuda_ndarray is not available. # Skip tests if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
from theano.gpuarray.linalg import (cusolver_available, gpu_solve) from theano.gpuarray.linalg import (cusolver_available, gpu_solve, GpuCholesky)
if not cusolver_available: if not cusolver_available:
raise SkipTest('Optional package scikits.cuda.cusolver not available') raise SkipTest('Optional package scikits.cuda.cusolver not available')
...@@ -112,3 +112,82 @@ class TestCusolver(unittest.TestCase): ...@@ -112,3 +112,82 @@ class TestCusolver(unittest.TestCase):
fn = theano.function([A, b], [solver], mode=mode_with_gpu) fn = theano.function([A, b], [solver], mode=mode_with_gpu)
self.assertRaises(LinAlgError, fn, A_val, x_val) self.assertRaises(LinAlgError, fn, A_val, x_val)
class TestGpuCholesky(unittest.TestCase):
    """Unit tests for the GpuCholesky op (valid and invalid inputs)."""

    def setUp(self):
        utt.seed_rng()

    def get_gpu_cholesky_func(self, lower=True, inplace=False):
        # Compile a theano function wrapping a single GpuCholesky op.
        A = theano.tensor.matrix("A", dtype="float32")
        chol_A = GpuCholesky(lower=lower, inplace=inplace)(A)
        return theano.function([A], chol_A, accept_inplace=inplace,
                               mode=mode_with_gpu)

    def compare_gpu_cholesky_to_np(self, A_val, lower=True, inplace=False):
        # Compare the op output against numpy's reference factorization.
        expected = np.linalg.cholesky(A_val)
        if not lower:
            expected = expected.T
        fn = self.get_gpu_cholesky_func(lower, inplace)
        actual = np.array(fn(A_val))
        utt.assert_allclose(actual, expected)

    def test_invalid_input_fail_non_square(self):
        # A non-square matrix must raise at run time.
        A_val = np.random.normal(size=(3, 2)).astype("float32")
        fn = self.get_gpu_cholesky_func(True, False)
        self.assertRaises(ValueError, fn, A_val)

    def test_invalid_input_fail_vector(self):
        # A 1D input is rejected while the node is built.
        def build_with_vector():
            A = theano.tensor.vector("A", dtype="float32")
            GpuCholesky(lower=True, inplace=False)(A)
        self.assertRaises(AssertionError, build_with_vector)

    def test_invalid_input_fail_tensor3(self):
        # A 3D input is rejected while the node is built.
        def build_with_tensor3():
            A = theano.tensor.tensor3("A", dtype="float32")
            GpuCholesky(lower=True, inplace=False)(A)
        self.assertRaises(AssertionError, build_with_tensor3)

    def test_diag_chol(self):
        # Diagonal SPD matrices, for every (lower, inplace) combination.
        for lower in (True, False):
            for inplace in (True, False):
                # Strictly positive diagonal entries guarantee
                # positive definiteness.
                A_val = np.diag(np.random.uniform(size=5).astype("float32") + 1)
                self.compare_gpu_cholesky_to_np(A_val, lower=lower,
                                                inplace=inplace)

    def test_dense_chol_lower(self):
        # Dense SPD matrices, for every (lower, inplace) combination.
        for lower in (True, False):
            for inplace in (True, False):
                M_val = np.random.normal(size=(3, 3)).astype("float32")
                # M.dot(M.T) is positive definite whenever M is
                # non-singular.
                A_val = M_val.dot(M_val.T)
                self.compare_gpu_cholesky_to_np(A_val, lower=lower,
                                                inplace=inplace)

    def test_invalid_input_fail_non_symmetric(self):
        # Non-symmetric input must fail.
        # (Non-symmetric real input must also be non-positive definite).
        A_val = None
        while True:
            A_val = np.random.normal(size=(3, 3)).astype("float32")
            if not np.allclose(A_val, A_val.T):
                break
        fn = self.get_gpu_cholesky_func(True, False)
        self.assertRaises(LinAlgError, fn, A_val)

    def test_invalid_input_fail_negative_definite(self):
        # -M.dot(M.T) is negative definite for all non-singular M,
        # so the factorization must fail.
        M_val = np.random.normal(size=(3, 3)).astype("float32")
        A_val = -M_val.dot(M_val.T)
        fn = self.get_gpu_cholesky_func(True, False)
        self.assertRaises(LinAlgError, fn, A_val)
...@@ -17,7 +17,7 @@ from ..basic_ops import ( ...@@ -17,7 +17,7 @@ from ..basic_ops import (
from ..blas import GpuGemm from ..blas import GpuGemm
from ..elemwise import GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise from ..elemwise import GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise
from ..subtensor import GpuSubtensor from ..subtensor import GpuSubtensor
from ..linalg import GpuCusolverSolve, cusolver_available from ..linalg import GpuCusolverSolve, cusolver_available, GpuCholesky
from .config import mode_with_gpu, mode_without_gpu, test_ctx_name, SkipTest from .config import mode_with_gpu, mode_without_gpu, test_ctx_name, SkipTest
...@@ -573,7 +573,7 @@ def test_local_lift_solve(): ...@@ -573,7 +573,7 @@ def test_local_lift_solve():
A = tensor.fmatrix() A = tensor.fmatrix()
b = tensor.fmatrix() b = tensor.fmatrix()
o = slinalg.solve(A, b) o = slinalg.solve(A, b)
f_cpu = theano.function([A, b], o) f_cpu = theano.function([A, b], o, mode_without_gpu)
f_gpu = theano.function([A, b], o, mode=mode_with_gpu) f_gpu = theano.function([A, b], o, mode=mode_with_gpu)
assert not any(isinstance(n.op, slinalg.Solve) assert not any(isinstance(n.op, slinalg.Solve)
for n in f_gpu.maker.fgraph.apply_nodes) for n in f_gpu.maker.fgraph.apply_nodes)
...@@ -584,6 +584,43 @@ def test_local_lift_solve(): ...@@ -584,6 +584,43 @@ def test_local_lift_solve():
utt.assert_allclose(f_cpu(A_val, b_val), f_gpu(A_val, b_val)) utt.assert_allclose(f_cpu(A_val, b_val), f_gpu(A_val, b_val))
def test_local_lift_cholesky():
    """slinalg.Cholesky should be lifted to an inplace GpuCholesky."""
    if not cusolver_available:
        raise SkipTest('No cuSolver')
    A = tensor.fmatrix()
    o = slinalg.cholesky(A)
    f_cpu = theano.function([A], o, mode=mode_without_gpu)
    f_gpu = theano.function([A], o, mode=mode_with_gpu)
    gpu_nodes = f_gpu.maker.fgraph.apply_nodes
    # The CPU op must be gone from the GPU graph ...
    assert not any(isinstance(n.op, slinalg.Cholesky) for n in gpu_nodes)
    # ... and since its input is not reused by any other op, the GPU op
    # should have been made inplace.
    assert any(isinstance(n.op, GpuCholesky) and n.op.inplace
               for n in gpu_nodes)
    M_val = np.random.normal(size=(3, 3)).astype("float32")
    # M.dot(M.T) is positive definite for all non-singular M.
    A_val = M_val.dot(M_val.T)
    utt.assert_allclose(f_cpu(A_val), f_gpu(A_val))
def test_gpu_cholesky_not_inplace():
    """A GpuCholesky whose input is reused must stay non-destructive."""
    if not cusolver_available:
        raise SkipTest('No cuSolver')
    A = tensor.fmatrix()
    A_squared = A**2
    B = slinalg.cholesky(A_squared)
    D = B + A_squared
    f_cpu = theano.function([A], D, mode=mode_without_gpu)
    f_gpu = theano.function([A], D, mode=mode_with_gpu)
    # A_squared feeds both the cholesky and the addition, so the GPU op
    # must not be allowed to destroy it.
    not_inplace = [n.op for n in f_gpu.maker.fgraph.apply_nodes
                   if isinstance(n.op, GpuCholesky) and not n.op.inplace]
    assert len(not_inplace) == 1, len(not_inplace)
    M_val = np.random.normal(size=(3, 3)).astype("float32")
    # M.dot(M.T) is positive definite for all non-singular M.
    A_val = M_val.dot(M_val.T)
    utt.assert_allclose(f_cpu(A_val), f_gpu(A_val))
def test_local_gpua_advanced_incsubtensor(): def test_local_gpua_advanced_incsubtensor():
# test a corner case reported at gh-5589 # test a corner case reported at gh-5589
target = tensor.ftensor4() target = tensor.ftensor4()
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论