cholesky work in progress

6d7e4eab · Thomas George · d49b5368 · 6d7e4eab · 6d7e4eab
--- a/theano/gpuarray/linalg.py
+++ b/theano/gpuarray/linalg.py
@@ -19,6 +19,7 @@ cusolver_available = False
 try:
    import skcuda
    from skcuda import cusolver
+    from skcuda import linalg
    cusolver_available = True
 except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound):
    pass
@@ -52,6 +53,13 @@ if cusolver_available:
        cusolver.cusolverCheckStatus(status)


+def attach_handle_to_context(ctx):
+    handle = getattr(ctx, 'cusolver_handle', None)
+    if handle is None:
+        with ctx:
+            ctx.cusolver_handle = cusolver.cusolverDnCreate()
+
+
 class GpuCusolverSolve(Op):
    """
    CUSOLVER GPU solver OP.
@@ -101,10 +109,7 @@ class GpuCusolverSolve(Op):

    def prepare_node(self, node, storage_map, compute_map, impl):
        ctx = node.inputs[0].type.context
-        handle = getattr(ctx, 'cusolver_handle', None)
-        if handle is None:
-            with ctx:
-                ctx.cusolver_handle = cusolver.cusolverDnCreate()
+        attach_handle_to_context(ctx)

    def check_dev_info(self, dev_info):
        val = np.asarray(dev_info)[0]
@@ -212,3 +217,114 @@ class GpuCusolverSolve(Op):

 def gpu_solve(A, b, A_structure='general', trans='N'):
    return GpuCusolverSolve(A_structure, trans)(A, b)
+
+
+class GpuCholesky(Op):
+    """
+    CUSOLVER GPU Cholesky Op.
+
+    Given a real positive definite matrix `A` returns either a lower
+    triangular matrix `L` such that `A == dot(L, L.T)` if `lower == True`
+    else returns an upper triangular matrix `U` such that `A == dot(U.T, U)`
+    if `lower == False`.
+
+    Parameters
+    ----------
+    lower
+        Whether to return a lower rather than upper triangular decomposition.
+
+    """
+
+    __props__ = ('lower', 'inplace')
+
+    def __init__(self, lower=True, inplace=False):
+        self.lower = lower
+        self.inplace = inplace
+        if self.inplace:
+            self.destroy_map = {0: [0, 1]}
+        super(GpuCholesky, self).__init__()
+
+    def make_node(self, inp):
+        if not cusolver_available:
+            raise RuntimeError('CUSOLVER is not available and '
+                               'GpuCusolverSolve Op can not be constructed.')
+        if skcuda.__version__ <= '0.5.1':
+            warnings.warn('The GpuSolve op requires scikit-cuda > 0.5.1 to work with CUDA 8')
+        context_name = basic_ops.infer_context_name(inp)
+
+        inp = basic_ops.as_gpuarray_variable(inp, context_name)
+
+        inp = basic_ops.gpu_contiguous(inp)
+
+        # this op can only operate on float32 matrices
+        assert inp.ndim == 2
+        assert inp.dtype == 'float32'
+
+        return theano.Apply(
+            self, [inp],
+            [GpuArrayType('float32',
+                          broadcastable=inp.broadcastable,
+                          context_name=context_name)()])
+
+    def prepare_node(self, node, storage_map, compute_map, impl):
+        ctx = node.inputs[0].type.context
+        attach_handle_to_context(ctx)
+
+    def perform(self, node, inputs, outputs):
+        context = inputs[0][0].context
+
+        # Input matrix.
+        A = inputs[0]
+
+        assert(len(A.shape) == 2)
+
+        l, n = A.shape
+        if l != n:
+            raise ValueError('A must be a square matrix')
+
+        lda = max(1, n)
+
+        # cusolver operates on F ordered matrices, but A is expected
+        # to be symmetric so it does not matter.
+        # We copy A if needed
+        if self.inplace:
+            L = A
+        else:
+            L = pygpu.array(A, copy=True)
+
+        # The output matrix will contain only the upper or lower
+        # triangular factorization of A. If L is C ordered (it
+        # probably is as it is the default in Theano) we just switch
+        # the fill mode parameter of cusolver
+        l_parameter = 0 if self.lower else 1
+        if L.flags['C_CONTIGUOUS']:
+            l_parameter = 1 - l_parameter
+
+        L_ptr = L.gpudata
+
+        with context:
+            workspace_size = cusolver.cusolverDnSpotrf_bufferSize(
+                context.cusolver_handle, l_parameter, n, L_ptr, lda)
+
+        workspace = pygpu.zeros(workspace_size, dtype='float32',
+                                context=context)
+
+        dev_info = pygpu.zeros((1,), dtype='int32', context=context)
+
+        workspace_ptr = workspace.gpudata
+        dev_info_ptr = dev_info.gpudata
+
+        with context:
+            cusolver.cusolverDnSpotrf(
+                context.cusolver_handle, l_parameter, n, L_ptr, lda, workspace_ptr,
+                workspace_size, dev_info_ptr)
+
+        # cusolver leaves the elements in the matrix outside the considered
+        # upper or lower triangle unchanged, so we need to put zeros outside
+        # the triangle
+        if self.lower:
+            linalg.tril(L, overwrite=True)
+        else:
+            linalg.triu(L, overwrite=True)
+
+        outputs[0][0] = L
--- a/theano/gpuarray/tests/test_linalg.py
+++ b/theano/gpuarray/tests/test_linalg.py
@@ -11,7 +11,7 @@ from numpy.linalg.linalg import LinAlgError

 # Skip tests if cuda_ndarray is not available.
 from nose.plugins.skip import SkipTest
-from theano.gpuarray.linalg import (cusolver_available, gpu_solve)
+from theano.gpuarray.linalg import (cusolver_available, gpu_solve, GpuCholesky)

 if not cusolver_available:
    raise SkipTest('Optional package scikits.cuda.cusolver not available')
@@ -112,3 +112,105 @@ class TestCusolver(unittest.TestCase):

        fn = theano.function([A, b], [solver], mode=mode_with_gpu)
        self.assertRaises(LinAlgError, fn, A_val, x_val)
+
+
+class TestGpuCholesky(unittest.TestCase):
+
+    def setUp(self):
+        utt.seed_rng()
+
+    def get_gpu_cholesky_func(self, lower=True, inplace=False):
+        """ Helper function to compile function from GPU Cholesky op. """
+        A = theano.tensor.matrix("A", dtype="float32")
+        cholesky_op = GpuCholesky(lower=lower, inplace=inplace)
+        chol_A = cholesky_op(A)
+        return theano.function([A], chol_A, accept_inplace=inplace)
+
+    def compare_gpu_cholesky_to_numpy(self, A_val, lower=True, inplace=False):
+        """ Helper function to compare op output to numpy.cholesky output. """
+        chol_A_val = numpy.linalg.cholesky(A_val)
+        if not lower:
+            chol_A_val = chol_A_val.T
+        fn = self.get_gpu_cholesky_func(lower, inplace)
+        res = fn(A_val)
+        chol_A_res = numpy.array(res)
+        utt.assert_allclose(chol_A_res, chol_A_val)
+
+    def test_invalid_input_fail_non_square(self):
+        """ Invalid Cholesky input test with non-square matrix as input. """
+        A_val = numpy.random.normal(size=(3, 2)).astype("float32")
+        fn = self.get_gpu_cholesky_func(True, False)
+        self.assertRaises(ValueError, fn, A_val)
+
+    def test_invalid_input_fail_non_symmetric(self):
+        pass
+        """ Invalid Cholesky input test with non-symmetric input.
+            (Non-symmetric real input must also be non-positive definite). """
+        A_val = numpy.random.normal(size=(3, 3)).astype("float32")
+        # double-check random A_val is asymmetric - the probability of this
+        # not being the case even with finite precision should be negligible
+        assert not numpy.allclose(A_val, A_val.T)
+        fn = self.get_gpu_cholesky_func(True, False)
+        self.assertRaises(cula.cula.culaError, fn, A_val)
+
+    def test_invalid_input_fail_negative_definite(self):
+        """ Invalid Cholesky input test with negative-definite input. """
+        M_val = numpy.random.normal(size=(3, 3)).astype("float32")
+        # A = -M.dot(M) will be negative definite for all non-singular M
+        A_val = -M_val.dot(M_val.T)
+        fn = self.get_gpu_cholesky_func(True, False)
+        self.assertRaises(cula.cula.culaError, fn, A_val)
+
+    def test_invalid_input_fail_vector(self):
+        """ Invalid Cholesky input test with vector as input. """
+        def invalid_input_func():
+            A = theano.tensor.vector("A", dtype="float32")
+            GpuCholesky(lower=True, inplace=False)(A)
+        self.assertRaises(AssertionError, invalid_input_func)
+
+    def test_invalid_input_fail_tensor3(self):
+        """ Invalid Cholesky input test with 3D tensor as input. """
+        def invalid_input_func():
+            A = theano.tensor.tensor3("A", dtype="float32")
+            GpuCholesky(lower=True, inplace=False)(A)
+        self.assertRaises(AssertionError, invalid_input_func)
+
+    def test_diag_chol(self):
+        """ Diagonal matrix input Cholesky test. """
+        # make sure all diagonal elements are positive so positive-definite
+        A_val = numpy.diag(numpy.random.uniform(size=5).astype("float32") + 1)
+        self.compare_gpu_cholesky_to_numpy(A_val, lower=True, inplace=False)
+
+    def test_dense_chol_lower(self):
+        """ Dense matrix input lower-triangular Cholesky test. """
+        M_val = numpy.random.normal(size=(3, 3)).astype("float32")
+        # A = M.dot(M) will be positive definite for all non-singular M
+        A_val = M_val.dot(M_val.T)
+        self.compare_gpu_cholesky_to_numpy(A_val, lower=True, inplace=False)
+
+    def test_dense_chol_upper(self):
+        """ Dense matrix input upper-triangular Cholesky test. """
+        M_val = numpy.random.normal(size=(3, 3)).astype("float32")
+        # A = M.dot(M) will be positive definite for all non-singular M
+        A_val = M_val.dot(M_val.T)
+        self.compare_gpu_cholesky_to_numpy(A_val, lower=False, inplace=False)
+
+    def test_diag_chol_inplace(self):
+        """ Diagonal matrix input inplace Cholesky test. """
+        # make sure all diagonal elements are positive so positive-definite
+        A_val = numpy.diag(numpy.random.uniform(size=5).astype("float32") + 1)
+        self.compare_gpu_cholesky_to_numpy(A_val, lower=True, inplace=True)
+
+    def test_dense_chol_lower_inplace(self):
+        """ Dense matrix input lower-triangular inplace Cholesky test. """
+        M_val = numpy.random.normal(size=(3, 3)).astype("float32")
+        # A = M.dot(M) will be positive definite for all non-singular M
+        A_val = M_val.dot(M_val.T)
+        self.compare_gpu_cholesky_to_numpy(A_val, lower=True, inplace=True)
+
+    def test_dense_chol_upper_inplace(self):
+        """ Dense matrix input upper-triangular inplace Cholesky test. """
+        M_val = numpy.random.normal(size=(3, 3)).astype("float32")
+        # A = M.dot(M) will be positive definite for all non-singular M
+        A_val = M_val.dot(M_val.T)
+        self.compare_gpu_cholesky_to_numpy(A_val, lower=False, inplace=True)