Commit 06addd9c authored by Frédéric Bastien, committed by GitHub

Merge pull request #5799 from tfjgeorge/gpu_cholesky

Gpu cholesky
......@@ -12,8 +12,10 @@ from numpy.linalg.linalg import LinAlgError
try:
import pygpu
from pygpu.basic import triu, tril
pygpu_available = True
except ImportError:
pass
pygpu_available = False
cusolver_available = False
try:
......@@ -52,6 +54,13 @@ if cusolver_available:
cusolver.cusolverCheckStatus(status)
def attach_cusolver_handle_to_context(ctx):
    """Ensure `ctx` carries a cuSOLVER handle, creating one lazily.

    The handle is created at most once per context and cached on the
    context object under the ``cusolver_handle`` attribute.
    """
    if getattr(ctx, 'cusolver_handle', None) is None:
        # Handle creation must happen with this context made current.
        with ctx:
            ctx.cusolver_handle = cusolver.cusolverDnCreate()
class GpuCusolverSolve(Op):
"""
CUSOLVER GPU solver OP.
......@@ -101,10 +110,7 @@ class GpuCusolverSolve(Op):
def prepare_node(self, node, storage_map, compute_map, impl):
ctx = node.inputs[0].type.context
handle = getattr(ctx, 'cusolver_handle', None)
if handle is None:
with ctx:
ctx.cusolver_handle = cusolver.cusolverDnCreate()
attach_cusolver_handle_to_context(ctx)
def check_dev_info(self, dev_info):
val = np.asarray(dev_info)[0]
......@@ -212,3 +218,120 @@ class GpuCusolverSolve(Op):
def gpu_solve(A, b, A_structure='general', trans='N'):
    """Solve ``A x = b`` on the GPU via cuSOLVER.

    Thin convenience wrapper that builds and applies a
    :class:`GpuCusolverSolve` op.
    """
    solve_op = GpuCusolverSolve(A_structure, trans)
    return solve_op(A, b)
class GpuCholesky(Op):
    """
    CUSOLVER GPU Cholesky Op.

    Given a real positive definite matrix `A` returns either a lower
    triangular matrix `L` such that `A == dot(L, L.T)` if `lower == True`
    else returns an upper triangular matrix `U` such that `A == dot(U.T, U)`
    if `lower == False`.

    Parameters
    ----------
    lower
        Whether to return a lower rather than upper triangular decomposition.

    """

    __props__ = ('lower', 'inplace')

    def __init__(self, lower=True, inplace=False):
        self.lower = lower
        self.inplace = inplace
        if self.inplace:
            # The factorization overwrites the buffer of input 0.
            self.destroy_map = {0: [0]}
        super(GpuCholesky, self).__init__()

    def make_node(self, inp):
        if not cusolver_available:
            raise RuntimeError('CUSOLVER is not available and '
                               'GpuCholesky Op can not be constructed.')
        # Comparing version strings lexicographically is wrong (e.g.
        # '0.10.0' <= '0.5.1' is True); parse the versions before comparing.
        from distutils.version import LooseVersion
        if LooseVersion(skcuda.__version__) <= LooseVersion('0.5.1'):
            # Fixed message: it previously referred to the GpuSolve op.
            warnings.warn('The GpuCholesky op requires scikit-cuda > 0.5.1 '
                          'to work with CUDA 8')
        if not pygpu_available:
            raise RuntimeError('Missing pygpu or triu/tril functions.'
                               'Install or update libgpuarray.')
        context_name = basic_ops.infer_context_name(inp)

        inp = basic_ops.as_gpuarray_variable(inp, context_name)

        inp = basic_ops.gpu_contiguous(inp)

        # this op can only operate on float32 matrices
        # because of current implementation of triu/tril.
        # TODO: support float64 for triu/tril in GpuArray and for
        # GpuCholesky/GpuCusolverSolve in Theano.
        assert inp.ndim == 2
        assert inp.dtype == 'float32'

        return theano.Apply(self, [inp], [inp.type()])

    def prepare_node(self, node, storage_map, compute_map, impl):
        # Lazily create (at most once per context) the cuSOLVER handle.
        ctx = node.inputs[0].type.context
        attach_cusolver_handle_to_context(ctx)

    def perform(self, node, inputs, outputs):
        context = inputs[0][0].context

        # Input matrix.
        A = inputs[0]

        l, n = A.shape
        if l != n:
            raise ValueError('A must be a square matrix')

        lda = max(1, n)

        # cusolver operates on F ordered matrices, but A is expected
        # to be symmetric so it does not matter.
        # We copy A if needed
        if self.inplace:
            L = A
        else:
            L = pygpu.array(A, copy=True)

        # The output matrix will contain only the upper or lower
        # triangular factorization of A. If L is C ordered (it
        # probably is as it is the default in Theano) we just switch
        # the fill mode parameter of cusolver
        l_parameter = 0 if self.lower else 1
        if L.flags['C_CONTIGUOUS']:
            l_parameter = 1 - l_parameter

        L_ptr = L.gpudata

        with context:
            workspace_size = cusolver.cusolverDnSpotrf_bufferSize(
                context.cusolver_handle, l_parameter, n, L_ptr, lda)

            workspace = pygpu.zeros(workspace_size, dtype='float32',
                                    context=context)

            dev_info = pygpu.zeros((1,), dtype='int32', context=context)

            workspace_ptr = workspace.gpudata
            dev_info_ptr = dev_info.gpudata

            cusolver.cusolverDnSpotrf(
                context.cusolver_handle, l_parameter, n, L_ptr, lda,
                workspace_ptr, workspace_size, dev_info_ptr)

            # dev_info > 0 means the leading minor of that order is not
            # positive definite, i.e. A has no Cholesky factorization.
            val_dev_info = np.asarray(dev_info)[0]
            if val_dev_info > 0:
                raise LinAlgError('Cholesky decomposition failed (is A SPD?)')

        # cusolver leaves the elements in the matrix outside the considered
        # upper or lower triangle unchanged, so we need to put zeros outside
        # the triangle
        if self.lower:
            tril(L)
        else:
            triu(L)

        outputs[0][0] = L
def gpu_cholesky(A, lower=True):
    """Return the GPU Cholesky factor of `A` (lower-triangular by default)."""
    chol_op = GpuCholesky(lower=lower)
    return chol_op(A)
......@@ -70,7 +70,7 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedIncSubtensor1_dev20)
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
from .reduction import GpuMaxAndArgmax
from .linalg import (GpuCusolverSolve, cusolver_available)
from .linalg import (GpuCusolverSolve, GpuCholesky, cusolver_available)
_logger = logging.getLogger("theano.gpuarray.opt")
......@@ -1967,6 +1967,23 @@ def local_gpu_solve(op, context_name, inputs, outputs):
return
return GpuCusolverSolve()
# Lift CPU Cholesky ops to their GPU (cuSOLVER) counterpart.
@register_opt('fast_compile')
@op_lifter([slinalg.Cholesky])
@register_opt2([theano.tensor.slinalg.Cholesky], 'fast_compile')
def local_gpu_cholesky(op, context_name, inputs, outputs):
    """Replace a CPU Cholesky by GpuCholesky when cuSOLVER is usable."""
    if cusolver_available:
        return GpuCholesky(lower=op.lower, inplace=op.destructive)
    # Returning None keeps the original CPU op in the graph.
@register_inplace()
@local_optimizer([GpuCholesky], inplace=True)
def local_inplace_cholesky(node):
    """Swap a non-inplace GpuCholesky for its destructive variant."""
    op = node.op
    if isinstance(op, GpuCholesky) and not op.inplace:
        inplace_op = GpuCholesky(lower=op.lower, inplace=True)
        return [inplace_op(*node.inputs)]
# Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled.
optdb.register('gpua_scanOp_make_inplace',
......
......@@ -11,7 +11,7 @@ from numpy.linalg.linalg import LinAlgError
# Skip tests if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
from theano.gpuarray.linalg import (cusolver_available, gpu_solve)
from theano.gpuarray.linalg import (cusolver_available, gpu_solve, GpuCholesky)
if not cusolver_available:
raise SkipTest('Optional package scikits.cuda.cusolver not available')
......@@ -112,3 +112,82 @@ class TestCusolver(unittest.TestCase):
fn = theano.function([A, b], [solver], mode=mode_with_gpu)
self.assertRaises(LinAlgError, fn, A_val, x_val)
class TestGpuCholesky(unittest.TestCase):
    """Unit tests for the GpuCholesky op (requires cuSOLVER and a GPU)."""

    def setUp(self):
        # Deterministic RNG so the random test matrices are reproducible.
        utt.seed_rng()

    def get_gpu_cholesky_func(self, lower=True, inplace=False):
        # Helper function to compile function from GPU Cholesky op.
        A = theano.tensor.matrix("A", dtype="float32")
        cholesky_op = GpuCholesky(lower=lower, inplace=inplace)
        chol_A = cholesky_op(A)
        return theano.function([A], chol_A, accept_inplace=inplace, mode=mode_with_gpu)

    def compare_gpu_cholesky_to_np(self, A_val, lower=True, inplace=False):
        # Helper function to compare op output to np.cholesky output.
        chol_A_val = np.linalg.cholesky(A_val)
        if not lower:
            # np.linalg.cholesky returns the lower factor; transpose
            # to get the upper factor for comparison.
            chol_A_val = chol_A_val.T
        fn = self.get_gpu_cholesky_func(lower, inplace)
        res = fn(A_val)
        chol_A_res = np.array(res)
        utt.assert_allclose(chol_A_res, chol_A_val)

    def test_invalid_input_fail_non_square(self):
        # Invalid Cholesky input test with non-square matrix as input.
        A_val = np.random.normal(size=(3, 2)).astype("float32")
        fn = self.get_gpu_cholesky_func(True, False)
        self.assertRaises(ValueError, fn, A_val)

    def test_invalid_input_fail_vector(self):
        # Invalid Cholesky input test with vector as input.
        def invalid_input_func():
            A = theano.tensor.vector("A", dtype="float32")
            GpuCholesky(lower=True, inplace=False)(A)
        self.assertRaises(AssertionError, invalid_input_func)

    def test_invalid_input_fail_tensor3(self):
        # Invalid Cholesky input test with 3D tensor as input.
        def invalid_input_func():
            A = theano.tensor.tensor3("A", dtype="float32")
            GpuCholesky(lower=True, inplace=False)(A)
        self.assertRaises(AssertionError, invalid_input_func)

    def test_diag_chol(self):
        # Diagonal matrix input Cholesky test.
        for lower in [True, False]:
            for inplace in [True, False]:
                # make sure all diagonal elements are positive so positive-definite
                A_val = np.diag(np.random.uniform(size=5).astype("float32") + 1)
                self.compare_gpu_cholesky_to_np(A_val, lower=lower, inplace=inplace)

    def test_dense_chol_lower(self):
        # Dense matrix input lower-triangular Cholesky test.
        for lower in [True, False]:
            for inplace in [True, False]:
                M_val = np.random.normal(size=(3, 3)).astype("float32")
                # A = M.dot(M.T) is positive definite for all non-singular M
                A_val = M_val.dot(M_val.T)
                self.compare_gpu_cholesky_to_np(A_val, lower=lower, inplace=inplace)

    def test_invalid_input_fail_non_symmetric(self):
        # Invalid Cholesky input test with non-symmetric input.
        # (Non-symmetric real input must also be non-positive definite).
        A_val = None
        while True:
            # Loop until a genuinely non-symmetric sample is drawn.
            A_val = np.random.normal(size=(3, 3)).astype("float32")
            if not np.allclose(A_val, A_val.T):
                break
        fn = self.get_gpu_cholesky_func(True, False)
        self.assertRaises(LinAlgError, fn, A_val)

    def test_invalid_input_fail_negative_definite(self):
        # Invalid Cholesky input test with negative-definite input.
        M_val = np.random.normal(size=(3, 3)).astype("float32")
        # A = -M.dot(M.T) will be negative definite for all non-singular M
        A_val = -M_val.dot(M_val.T)
        fn = self.get_gpu_cholesky_func(True, False)
        self.assertRaises(LinAlgError, fn, A_val)
......@@ -17,7 +17,7 @@ from ..basic_ops import (
from ..blas import GpuGemm
from ..elemwise import GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise
from ..subtensor import GpuSubtensor
from ..linalg import GpuCusolverSolve, cusolver_available
from ..linalg import GpuCusolverSolve, cusolver_available, GpuCholesky
from .config import mode_with_gpu, mode_without_gpu, test_ctx_name, SkipTest
......@@ -573,7 +573,7 @@ def test_local_lift_solve():
A = tensor.fmatrix()
b = tensor.fmatrix()
o = slinalg.solve(A, b)
f_cpu = theano.function([A, b], o)
f_cpu = theano.function([A, b], o, mode_without_gpu)
f_gpu = theano.function([A, b], o, mode=mode_with_gpu)
assert not any(isinstance(n.op, slinalg.Solve)
for n in f_gpu.maker.fgraph.apply_nodes)
......@@ -584,6 +584,43 @@ def test_local_lift_solve():
utt.assert_allclose(f_cpu(A_val, b_val), f_gpu(A_val, b_val))
def test_local_lift_cholesky():
    """Check that a CPU Cholesky graph is lifted to an inplace GpuCholesky."""
    if not cusolver_available:
        raise SkipTest('No cuSolver')
    A = tensor.fmatrix()
    o = slinalg.cholesky(A)
    f_cpu = theano.function([A], o, mode=mode_without_gpu)
    f_gpu = theano.function([A], o, mode=mode_with_gpu)
    gpu_ops = [n.op for n in f_gpu.maker.fgraph.apply_nodes]
    # No CPU Cholesky op may survive in the GPU-compiled graph.
    assert all(not isinstance(op, slinalg.Cholesky) for op in gpu_ops)
    # The lifted op should be inplace: its input is not reused by any other op.
    assert any(isinstance(op, GpuCholesky) and op.inplace for op in gpu_ops)
    M_val = np.random.normal(size=(3, 3)).astype("float32")
    # M.dot(M.T) is positive definite for all non-singular M.
    A_val = M_val.dot(M_val.T)
    utt.assert_allclose(f_cpu(A_val), f_gpu(A_val))
def test_gpu_cholesky_not_inplace():
    """A GpuCholesky whose input is shared with another op must not be inplace."""
    if not cusolver_available:
        raise SkipTest('No cuSolver')
    A = tensor.fmatrix()
    A_squared = A**2
    B = slinalg.cholesky(A_squared)
    D = B + A_squared
    f_cpu = theano.function([A], D, mode=mode_without_gpu)
    f_gpu = theano.function([A], D, mode=mode_with_gpu)
    # A_squared feeds both the Cholesky and the addition, so the GPU op must
    # keep a non-destructive copy of its input.
    non_inplace_nodes = [n.op for n in f_gpu.maker.fgraph.apply_nodes
                         if isinstance(n.op, GpuCholesky) and not n.op.inplace]
    assert len(non_inplace_nodes) == 1, len(non_inplace_nodes)
    M_val = np.random.normal(size=(3, 3)).astype("float32")
    # M.dot(M.T) is positive definite for all non-singular M.
    A_val = M_val.dot(M_val.T)
    utt.assert_allclose(f_cpu(A_val), f_gpu(A_val))
def test_local_gpua_advanced_incsubtensor():
# test a corner case reported at gh-5589
target = tensor.ftensor4()
......
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment