Merge pull request #5421 from tfjgeorge/cusolver

Cusolver using Cholesky decomposition for symmetric matrices

Merge pull request #5421 from tfjgeorge/cusolver
82901783 · Simon Lefrancois · GitHub · d5520e81 · 1b81f81e · 82901783
--- a/theano/gpuarray/linalg.py
+++ b/theano/gpuarray/linalg.py
@@ -2,23 +2,54 @@ from __future__ import absolute_import, division, print_function

 import pkg_resources
 import theano
+import warnings

 from theano import Op
 from theano.gpuarray import basic_ops, GpuArrayType

+import numpy
+from numpy.linalg.linalg import LinAlgError
+
 try:
-    from pygpu import gpuarray
+    import pygpu
 except ImportError:
    pass

 cusolver_available = False
 try:
+    import skcuda
    from skcuda import cusolver
    cusolver_available = True
 except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound):
    pass

-cusolver_handle = None
+if cusolver_available:
+    # Add cusolver call as it is missing in skcuda
+    # SPOTRS
+    cusolver._libcusolver.cusolverDnSpotrs.restype = int
+    cusolver._libcusolver.cusolverDnSpotrs.argtypes = [cusolver.ctypes.c_void_p,
+                                                       cusolver.ctypes.c_int,
+                                                       cusolver.ctypes.c_int,
+                                                       cusolver.ctypes.c_int,
+                                                       cusolver.ctypes.c_void_p,
+                                                       cusolver.ctypes.c_int,
+                                                       cusolver.ctypes.c_void_p,
+                                                       cusolver.ctypes.c_int,
+                                                       cusolver.ctypes.c_void_p]
+
+    def cusolverDnSpotrs(handle, uplo, n, nrhs, A, lda,
+                         B, ldb, devInfo):
+        """
+        Solve real single precision linear system for hermitian matrices.
+        References
+        ----------
+        `cusolverDn<t>potrs <http://docs.nvidia.com/cuda/cusolver/index.html#cuds-lt-t-gt-potrs>`_
+        """
+
+        status = cusolver._libcusolver.cusolverDnSpotrs(handle, uplo, n, nrhs,
+                                                        int(A), lda, int(B),
+                                                        ldb, int(devInfo))
+        cusolver.cusolverCheckStatus(status)


 class GpuCusolverSolve(Op):
@@ -32,20 +63,26 @@ class GpuCusolverSolve(Op):

    """

-    __props__ = ('trans',)
+    __props__ = ('A_structure', 'trans', 'inplace')

-    def __init__(self, trans='N', inplace=False):
+    def __init__(self, A_structure='general', trans='N', inplace=False):
        self.trans = trans
        self.inplace = inplace
+        self.A_structure = A_structure
        if self.inplace:
            self.destroy_map = {0: [0, 1]}
        super(GpuCusolverSolve, self).__init__()

    def make_node(self, inp1, inp2):
-        self.context = basic_ops.infer_context_name(inp1, inp2)
+        if not cusolver_available:
+            raise RuntimeError('CUSOLVER is not available and '
+                               'GpuCusolverSolve Op can not be constructed.')
+        if skcuda.__version__ <= '0.5.1':
+            warnings.warn('The GpuSolve op requires scikit-cuda > 0.5.1 to work with CUDA 8')
+        context_name = basic_ops.infer_context_name(inp1, inp2)

-        inp1 = basic_ops.as_gpuarray_variable(inp1, self.context)
-        inp2 = basic_ops.as_gpuarray_variable(inp2, self.context)
+        inp1 = basic_ops.as_gpuarray_variable(inp1, context_name)
+        inp2 = basic_ops.as_gpuarray_variable(inp2, context_name)

        inp1 = basic_ops.gpu_contiguous(inp1)
        inp2 = basic_ops.gpu_contiguous(inp2)
@@ -60,113 +97,118 @@ class GpuCusolverSolve(Op):
            self, [inp1, inp2],
            [GpuArrayType('float32',
                          broadcastable=inp1.broadcastable,
-                          context_name=self.context)()])
-
-    def make_thunk(self,
-                   node,
-                   storage_map, _,
-                   no_recycling=[],
-                   impl=None):
-        if not cusolver_available:
-            raise RuntimeError('CUSOLVER is not available and '
-                               'GpuCusolverSolve Op can not be constructed.')
+                          context_name=context_name)()])
+
+    def prepare_node(self, node, storage_map, compute_map, impl):
+        ctx = node.inputs[0].type.context
+        handle = getattr(ctx, 'cusolver_handle', None)
+        if handle is None:
+            with ctx:
+                ctx.cusolver_handle = cusolver.cusolverDnCreate()
+
+    def check_dev_info(self, dev_info):
+        val = numpy.asarray(dev_info)[0]
+        if val > 0:
+            raise LinAlgError('A is singular')
+
+    def perform(self, node, inputs, outputs):
+        context = inputs[0][0].context
+
+        # Size of the matrices to invert.
+        z = outputs[0]
+
+        # Matrix.
+        A = inputs[0]
+
+        # Solution vectors.
+        b = inputs[1]
+
+        assert(len(A.shape) == 2)
+        assert(len(b.shape) == 2)
+
+        if self.trans in ['T', 'C']:
+            trans = 1
+            l, n = A.shape
+            k, m = b.shape
+        elif self.trans == 'N':
+            trans = 0
+            n, l = A.shape
+            k, m = b.shape
+        else:
+            raise ValueError('Invalid value for trans')
+        if l != n:
+            raise ValueError('A must be a square matrix')
+        if n != k:
+            raise ValueError('A and b must be aligned.')
+
+        lda = max(1, n)
+        ldb = max(1, k)
+
+        # We copy A and b as cusolver operates inplace
+        b = pygpu.array(b, copy=True, order='F')
+        if not self.inplace:
+            A = pygpu.array(A, copy=True)
+        A_ptr = A.gpudata
+        b_ptr = b.gpudata
+
+        # cusolver expects a F ordered matrix, but A is not explicitly
+        # converted between C and F order, instead we switch the
+        # "transpose" flag.
+        if A.flags['C_CONTIGUOUS']:
+            trans = 1 - trans
+
+        if self.A_structure == 'symmetric':
+            with context:
+                workspace_size = cusolver.cusolverDnSpotrf_bufferSize(
+                    context.cusolver_handle, 0, n, A_ptr, lda)
+
+            workspace = pygpu.zeros(workspace_size, dtype='float32',
+                                    context=context)
+
+            dev_info = pygpu.zeros((1,), dtype='int32', context=context)
+
+            workspace_ptr = workspace.gpudata
+            dev_info_ptr = dev_info.gpudata
+
+            with context:
+                cusolver.cusolverDnSpotrf(
+                    context.cusolver_handle, 0, n, A_ptr, lda, workspace_ptr,
+                    workspace_size, dev_info_ptr)
+                self.check_dev_info(dev_info)
+
+                cusolverDnSpotrs(
+                    context.cusolver_handle, 0, n, m, A_ptr, lda,
+                    b_ptr, ldb, dev_info_ptr)
+
+        else:
+            # general case for A
+            with context:
+                workspace_size = cusolver.cusolverDnSgetrf_bufferSize(
+                    context.cusolver_handle, n, n, A_ptr, lda)
+
+            workspace = pygpu.zeros(workspace_size, dtype='float32',
+                                    context=context)
+
+            pivots = pygpu.zeros(n, dtype='int32', context=context)
+
+            dev_info = pygpu.zeros((1,), dtype='int32', context=context)
+
+            workspace_ptr = workspace.gpudata
+            pivots_ptr = pivots.gpudata
+            dev_info_ptr = dev_info.gpudata
+
+            with context:
+                cusolver.cusolverDnSgetrf(
+                    context.cusolver_handle, n, n, A_ptr, lda, workspace_ptr,
+                    pivots_ptr, dev_info_ptr)
+                self.check_dev_info(dev_info)
+
+                cusolver.cusolverDnSgetrs(
+                    context.cusolver_handle, trans, n, m, A_ptr, lda,
+                    pivots_ptr, b_ptr, ldb, dev_info_ptr)
+
+        z[0] = b
+

-        inputs = [storage_map[v] for v in node.inputs]
-        outputs = [storage_map[v] for v in node.outputs]
-
-        global cusolver_handle
-        if cusolver_handle is None:
-            cusolver_handle = cusolver.cusolverDnCreate()
-
-        def thunk():
-            context = inputs[0][0].context
-
-            # Size of the matrices to invert.
-            z = outputs[0]
-
-            # Matrix.
-            A = inputs[0][0]
-
-            # Solution vectors.
-            b = inputs[1][0]
-
-            assert(len(A.shape) == 2)
-            assert(len(b.shape) == 2)
-
-            if self.trans in ['T', 'C']:
-                trans = 1
-                l, n = A.shape
-                k, m = b.shape
-            elif self.trans == 'N':
-                trans = 0
-                n, l = A.shape
-                k, m = b.shape
-            else:
-                raise ValueError('Invalid value for trans')
-            if l != n:
-                raise ValueError('A must be a square matrix')
-            if n != k:
-                raise ValueError('A and b must be aligned.')
-
-            lda = max(1, n)
-            ldb = max(1, k, m)
-
-            # We copy A and b as cusolver operates inplace
-            b = gpuarray.array(b, copy=True, order='F')
-            if not self.inplace:
-                A = gpuarray.array(A, copy=True)
-            A_ptr = A.gpudata
-            b_ptr = b.gpudata
-
-            # cusolver expects a F ordered matrix, but A is not explicitly
-            # converted between C and F order, instead we switch the
-            # "transpose" flag.
-            if A.flags['C_CONTIGUOUS']:
-                trans = 1 - trans
-
-            workspace_size = cusolver.cusolverDnSgetrf_bufferSize(
-                cusolver_handle, n, n, A_ptr, lda)
-
-            if (thunk.workspace is None or
-                    thunk.workspace.size != workspace_size):
-                thunk.workspace = gpuarray.zeros((workspace_size,),
-                                                 dtype='float32',
-                                                 context=context)
-
-            if thunk.pivots is None or thunk.pivots.size != min(n, n):
-                thunk.pivots = gpuarray.zeros((min(n, n),),
-                                              dtype='float32',
-                                              context=context)
-
-            if thunk.dev_info is None:
-                thunk.dev_info = gpuarray.zeros((1,),
-                                                dtype='float32',
-                                                context=context)
-
-            workspace_ptr = thunk.workspace.gpudata
-            pivots_ptr = thunk.pivots.gpudata
-            dev_info_ptr = thunk.dev_info.gpudata
-
-            cusolver.cusolverDnSgetrf(
-                cusolver_handle, n, n, A_ptr, lda, workspace_ptr,
-                pivots_ptr, dev_info_ptr)
-
-            cusolver.cusolverDnSgetrs(
-                cusolver_handle, trans, n, m, A_ptr, lda,
-                pivots_ptr, b_ptr, ldb, dev_info_ptr)
-
-            z[0] = b
-
-        thunk.inputs = inputs
-        thunk.outputs = outputs
-        thunk.lazy = False
-
-        thunk.workspace = None
-        thunk.pivots = None
-        thunk.dev_info = None
-
-        return thunk
-
-
-def gpu_solve(A, b, trans='N'):
-    return GpuCusolverSolve(trans)(A, b)
+def gpu_solve(A, b, A_structure='general', trans='N'):
+    return GpuCusolverSolve(A_structure, trans)(A, b)
--- a/theano/gpuarray/tests/test_linalg.py
+++ b/theano/gpuarray/tests/test_linalg.py
@@ -7,6 +7,8 @@ import theano
 from theano.tests import unittest_tools as utt
 from .config import mode_with_gpu

+from numpy.linalg.linalg import LinAlgError
+
 # Skip tests if cuda_ndarray is not available.
 from nose.plugins.skip import SkipTest
 from theano.gpuarray.linalg import (cusolver_available, gpu_solve)
@@ -16,7 +18,7 @@ if not cusolver_available:


 class TestCusolver(unittest.TestCase):
-    def run_gpu_solve(self, A_val, x_val):
+    def run_gpu_solve(self, A_val, x_val, A_struct=None):
        b_val = numpy.dot(A_val, x_val)
        b_val_trans = numpy.dot(A_val.T, x_val)

@@ -24,14 +26,19 @@ class TestCusolver(unittest.TestCase):
        b = theano.tensor.matrix("b", dtype="float32")
        b_trans = theano.tensor.matrix("b", dtype="float32")

-        solver = gpu_solve(A, b)
-        solver_trans = gpu_solve(A, b_trans, trans='T')
+        if A_struct is None:
+            solver = gpu_solve(A, b)
+            solver_trans = gpu_solve(A, b_trans, trans='T')
+        else:
+            solver = gpu_solve(A, b, A_struct)
+            solver_trans = gpu_solve(A, b_trans, A_struct, trans='T')
+
        fn = theano.function([A, b, b_trans], [solver, solver_trans], mode=mode_with_gpu)
        res = fn(A_val, b_val, b_val_trans)
        x_res = numpy.array(res[0])
        x_res_trans = numpy.array(res[1])
-        utt.assert_allclose(x_res, x_val)
-        utt.assert_allclose(x_res_trans, x_val)
+        utt.assert_allclose(x_val, x_res)
+        utt.assert_allclose(x_val, x_res_trans)

    def test_diag_solve(self):
        numpy.random.seed(1)
@@ -41,13 +48,24 @@ class TestCusolver(unittest.TestCase):
                                     1)).astype("float32")
        self.run_gpu_solve(A_val, x_val)

+    def test_bshape_solve(self):
+        """
+        Test when shape of b (k, m) is such as m > k
+        """
+        numpy.random.seed(1)
+        A_val = numpy.asarray([[2, 0, 0], [0, 1, 0], [0, 0, 1]],
+                              dtype="float32")
+        x_val = numpy.random.uniform(-0.4, 0.4, (A_val.shape[1],
+                                     A_val.shape[1] + 1)).astype("float32")
+        self.run_gpu_solve(A_val, x_val)
+
    def test_sym_solve(self):
        numpy.random.seed(1)
        A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
-        A_sym = (A_val + A_val.T) / 2.0
+        A_sym = numpy.dot(A_val, A_val.T)
        x_val = numpy.random.uniform(-0.4, 0.4, (A_val.shape[1],
                                     1)).astype("float32")
-        self.run_gpu_solve(A_sym, x_val)
+        self.run_gpu_solve(A_sym, x_val, 'symmetric')

    def test_orth_solve(self):
        numpy.random.seed(1)
@@ -63,3 +81,34 @@ class TestCusolver(unittest.TestCase):
        x_val = numpy.random.uniform(-0.4, 0.4,
                                     (A_val.shape[1], 4)).astype("float32")
        self.run_gpu_solve(A_val, x_val)
+
+    def test_linalgerrsym_solve(self):
+        numpy.random.seed(1)
+        A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
+        x_val = numpy.random.uniform(-0.4, 0.4,
+                                     (A_val.shape[1], 4)).astype("float32")
+        A_val = numpy.dot(A_val.T, A_val)
+        # make A singular
+        A_val[:, 2] = A_val[:, 1] + A_val[:, 3]
+
+        A = theano.tensor.matrix("A", dtype="float32")
+        b = theano.tensor.matrix("b", dtype="float32")
+        solver = gpu_solve(A, b, 'symmetric')
+
+        fn = theano.function([A, b], [solver], mode=mode_with_gpu)
+        self.assertRaises(LinAlgError, fn, A_val, x_val)
+
+    def test_linalgerr_solve(self):
+        numpy.random.seed(1)
+        A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
+        x_val = numpy.random.uniform(-0.4, 0.4,
+                                     (A_val.shape[1], 4)).astype("float32")
+        # make A singular
+        A_val[:, 2] = 0
+
+        A = theano.tensor.matrix("A", dtype="float32")
+        b = theano.tensor.matrix("b", dtype="float32")
+        solver = gpu_solve(A, b, trans='T')
+
+        fn = theano.function([A, b], [solver], mode=mode_with_gpu)
+        self.assertRaises(LinAlgError, fn, A_val, x_val)