提交 82901783 authored 作者: Simon Lefrancois's avatar Simon Lefrancois 提交者: GitHub

Merge pull request #5421 from tfjgeorge/cusolver

Cusolver using Cholesky decomposition for symmetric matrices
...@@ -2,23 +2,54 @@ from __future__ import absolute_import, division, print_function ...@@ -2,23 +2,54 @@ from __future__ import absolute_import, division, print_function
import pkg_resources import pkg_resources
import theano import theano
import warnings
from theano import Op from theano import Op
from theano.gpuarray import basic_ops, GpuArrayType from theano.gpuarray import basic_ops, GpuArrayType
import numpy
from numpy.linalg.linalg import LinAlgError
try: try:
from pygpu import gpuarray import pygpu
except ImportError: except ImportError:
pass pass
cusolver_available = False cusolver_available = False
try: try:
import skcuda
from skcuda import cusolver from skcuda import cusolver
cusolver_available = True cusolver_available = True
except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound): except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound):
pass pass
cusolver_handle = None if cusolver_available:
# Add cusolver call as it is missing in skcuda
# SPOTRS
cusolver._libcusolver.cusolverDnSpotrs.restype = int
cusolver._libcusolver.cusolverDnSpotrs.argtypes = [cusolver.ctypes.c_void_p,
cusolver.ctypes.c_int,
cusolver.ctypes.c_int,
cusolver.ctypes.c_int,
cusolver.ctypes.c_void_p,
cusolver.ctypes.c_int,
cusolver.ctypes.c_void_p,
cusolver.ctypes.c_int,
cusolver.ctypes.c_void_p]
def cusolverDnSpotrs(handle, uplo, n, nrhs, A, lda,
                     B, ldb, devInfo):
    """
    Solve the real single-precision system ``A x = B`` for an ``A`` that
    has already been Cholesky-factorized by ``cusolverDnSpotrf``.

    This binding is missing from scikit-cuda, hence wrapped here via the
    library's ctypes handle (see the argtypes/restype setup above).

    Parameters
    ----------
    handle
        cuSOLVER dense context handle (from ``cusolverDnCreate``).
    uplo : int
        Fill mode of ``A``; callers here pass 0 (presumably
        CUBLAS_FILL_MODE_LOWER — confirm against the cuSOLVER enum).
    n : int
        Order of the matrix ``A``.
    nrhs : int
        Number of right-hand sides (columns of ``B``).
    A, B, devInfo
        Device pointers; converted to plain integers below as required
        by the ctypes signature.
    lda, ldb : int
        Leading dimensions of ``A`` and ``B``.

    Raises
    ------
    An exception via ``cusolver.cusolverCheckStatus`` when the call does
    not return a success status.

    References
    ----------
    `cusolverDn<t>potrs <http://docs.nvidia.com/cuda/cusolver/index.html#cuds-lt-t-gt-potrs>`_
    """
    # Device pointers must be passed as plain ints for ctypes marshalling.
    status = cusolver._libcusolver.cusolverDnSpotrs(handle, uplo, n, nrhs,
                                                    int(A), lda, int(B),
                                                    ldb, int(devInfo))
    cusolver.cusolverCheckStatus(status)
class GpuCusolverSolve(Op): class GpuCusolverSolve(Op):
...@@ -32,20 +63,26 @@ class GpuCusolverSolve(Op): ...@@ -32,20 +63,26 @@ class GpuCusolverSolve(Op):
""" """
__props__ = ('trans',) __props__ = ('A_structure', 'trans', 'inplace')
def __init__(self, trans='N', inplace=False): def __init__(self, A_structure='general', trans='N', inplace=False):
self.trans = trans self.trans = trans
self.inplace = inplace self.inplace = inplace
self.A_structure = A_structure
if self.inplace: if self.inplace:
self.destroy_map = {0: [0, 1]} self.destroy_map = {0: [0, 1]}
super(GpuCusolverSolve, self).__init__() super(GpuCusolverSolve, self).__init__()
def make_node(self, inp1, inp2): def make_node(self, inp1, inp2):
self.context = basic_ops.infer_context_name(inp1, inp2) if not cusolver_available:
raise RuntimeError('CUSOLVER is not available and '
'GpuCusolverSolve Op can not be constructed.')
if skcuda.__version__ <= '0.5.1':
warnings.warn('The GpuSolve op requires scikit-cuda > 0.5.1 to work with CUDA 8')
context_name = basic_ops.infer_context_name(inp1, inp2)
inp1 = basic_ops.as_gpuarray_variable(inp1, self.context) inp1 = basic_ops.as_gpuarray_variable(inp1, context_name)
inp2 = basic_ops.as_gpuarray_variable(inp2, self.context) inp2 = basic_ops.as_gpuarray_variable(inp2, context_name)
inp1 = basic_ops.gpu_contiguous(inp1) inp1 = basic_ops.gpu_contiguous(inp1)
inp2 = basic_ops.gpu_contiguous(inp2) inp2 = basic_ops.gpu_contiguous(inp2)
...@@ -60,35 +97,31 @@ class GpuCusolverSolve(Op): ...@@ -60,35 +97,31 @@ class GpuCusolverSolve(Op):
self, [inp1, inp2], self, [inp1, inp2],
[GpuArrayType('float32', [GpuArrayType('float32',
broadcastable=inp1.broadcastable, broadcastable=inp1.broadcastable,
context_name=self.context)()]) context_name=context_name)()])
def make_thunk(self,
node,
storage_map, _,
no_recycling=[],
impl=None):
if not cusolver_available:
raise RuntimeError('CUSOLVER is not available and '
'GpuCusolverSolve Op can not be constructed.')
inputs = [storage_map[v] for v in node.inputs] def prepare_node(self, node, storage_map, compute_map, impl):
outputs = [storage_map[v] for v in node.outputs] ctx = node.inputs[0].type.context
handle = getattr(ctx, 'cusolver_handle', None)
if handle is None:
with ctx:
ctx.cusolver_handle = cusolver.cusolverDnCreate()
global cusolver_handle def check_dev_info(self, dev_info):
if cusolver_handle is None: val = numpy.asarray(dev_info)[0]
cusolver_handle = cusolver.cusolverDnCreate() if val > 0:
raise LinAlgError('A is singular')
def thunk(): def perform(self, node, inputs, outputs):
context = inputs[0][0].context context = inputs[0][0].context
# Size of the matrices to invert. # Size of the matrices to invert.
z = outputs[0] z = outputs[0]
# Matrix. # Matrix.
A = inputs[0][0] A = inputs[0]
# Solution vectors. # Solution vectors.
b = inputs[1][0] b = inputs[1]
assert(len(A.shape) == 2) assert(len(A.shape) == 2)
assert(len(b.shape) == 2) assert(len(b.shape) == 2)
...@@ -109,12 +142,12 @@ class GpuCusolverSolve(Op): ...@@ -109,12 +142,12 @@ class GpuCusolverSolve(Op):
raise ValueError('A and b must be aligned.') raise ValueError('A and b must be aligned.')
lda = max(1, n) lda = max(1, n)
ldb = max(1, k, m) ldb = max(1, k)
# We copy A and b as cusolver operates inplace # We copy A and b as cusolver operates inplace
b = gpuarray.array(b, copy=True, order='F') b = pygpu.array(b, copy=True, order='F')
if not self.inplace: if not self.inplace:
A = gpuarray.array(A, copy=True) A = pygpu.array(A, copy=True)
A_ptr = A.gpudata A_ptr = A.gpudata
b_ptr = b.gpudata b_ptr = b.gpudata
...@@ -124,49 +157,58 @@ class GpuCusolverSolve(Op): ...@@ -124,49 +157,58 @@ class GpuCusolverSolve(Op):
if A.flags['C_CONTIGUOUS']: if A.flags['C_CONTIGUOUS']:
trans = 1 - trans trans = 1 - trans
workspace_size = cusolver.cusolverDnSgetrf_bufferSize( if self.A_structure == 'symmetric':
cusolver_handle, n, n, A_ptr, lda) with context:
workspace_size = cusolver.cusolverDnSpotrf_bufferSize(
context.cusolver_handle, 0, n, A_ptr, lda)
if (thunk.workspace is None or workspace = pygpu.zeros(workspace_size, dtype='float32',
thunk.workspace.size != workspace_size):
thunk.workspace = gpuarray.zeros((workspace_size,),
dtype='float32',
context=context) context=context)
if thunk.pivots is None or thunk.pivots.size != min(n, n): dev_info = pygpu.zeros((1,), dtype='int32', context=context)
thunk.pivots = gpuarray.zeros((min(n, n),),
dtype='float32', workspace_ptr = workspace.gpudata
context=context) dev_info_ptr = dev_info.gpudata
with context:
cusolver.cusolverDnSpotrf(
context.cusolver_handle, 0, n, A_ptr, lda, workspace_ptr,
workspace_size, dev_info_ptr)
self.check_dev_info(dev_info)
cusolverDnSpotrs(
context.cusolver_handle, 0, n, m, A_ptr, lda,
b_ptr, ldb, dev_info_ptr)
else:
# general case for A
with context:
workspace_size = cusolver.cusolverDnSgetrf_bufferSize(
context.cusolver_handle, n, n, A_ptr, lda)
if thunk.dev_info is None: workspace = pygpu.zeros(workspace_size, dtype='float32',
thunk.dev_info = gpuarray.zeros((1,),
dtype='float32',
context=context) context=context)
workspace_ptr = thunk.workspace.gpudata pivots = pygpu.zeros(n, dtype='int32', context=context)
pivots_ptr = thunk.pivots.gpudata
dev_info_ptr = thunk.dev_info.gpudata dev_info = pygpu.zeros((1,), dtype='int32', context=context)
workspace_ptr = workspace.gpudata
pivots_ptr = pivots.gpudata
dev_info_ptr = dev_info.gpudata
with context:
cusolver.cusolverDnSgetrf( cusolver.cusolverDnSgetrf(
cusolver_handle, n, n, A_ptr, lda, workspace_ptr, context.cusolver_handle, n, n, A_ptr, lda, workspace_ptr,
pivots_ptr, dev_info_ptr) pivots_ptr, dev_info_ptr)
self.check_dev_info(dev_info)
cusolver.cusolverDnSgetrs( cusolver.cusolverDnSgetrs(
cusolver_handle, trans, n, m, A_ptr, lda, context.cusolver_handle, trans, n, m, A_ptr, lda,
pivots_ptr, b_ptr, ldb, dev_info_ptr) pivots_ptr, b_ptr, ldb, dev_info_ptr)
z[0] = b z[0] = b
thunk.inputs = inputs
thunk.outputs = outputs
thunk.lazy = False
thunk.workspace = None
thunk.pivots = None
thunk.dev_info = None
return thunk
def gpu_solve(A, b, trans='N'): def gpu_solve(A, b, A_structure='general', trans='N'):
return GpuCusolverSolve(trans)(A, b) return GpuCusolverSolve(A_structure, trans)(A, b)
...@@ -7,6 +7,8 @@ import theano ...@@ -7,6 +7,8 @@ import theano
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from .config import mode_with_gpu from .config import mode_with_gpu
from numpy.linalg.linalg import LinAlgError
# Skip tests if cuda_ndarray is not available. # Skip tests if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
from theano.gpuarray.linalg import (cusolver_available, gpu_solve) from theano.gpuarray.linalg import (cusolver_available, gpu_solve)
...@@ -16,7 +18,7 @@ if not cusolver_available: ...@@ -16,7 +18,7 @@ if not cusolver_available:
class TestCusolver(unittest.TestCase): class TestCusolver(unittest.TestCase):
def run_gpu_solve(self, A_val, x_val): def run_gpu_solve(self, A_val, x_val, A_struct=None):
b_val = numpy.dot(A_val, x_val) b_val = numpy.dot(A_val, x_val)
b_val_trans = numpy.dot(A_val.T, x_val) b_val_trans = numpy.dot(A_val.T, x_val)
...@@ -24,14 +26,19 @@ class TestCusolver(unittest.TestCase): ...@@ -24,14 +26,19 @@ class TestCusolver(unittest.TestCase):
b = theano.tensor.matrix("b", dtype="float32") b = theano.tensor.matrix("b", dtype="float32")
b_trans = theano.tensor.matrix("b", dtype="float32") b_trans = theano.tensor.matrix("b", dtype="float32")
if A_struct is None:
solver = gpu_solve(A, b) solver = gpu_solve(A, b)
solver_trans = gpu_solve(A, b_trans, trans='T') solver_trans = gpu_solve(A, b_trans, trans='T')
else:
solver = gpu_solve(A, b, A_struct)
solver_trans = gpu_solve(A, b_trans, A_struct, trans='T')
fn = theano.function([A, b, b_trans], [solver, solver_trans], mode=mode_with_gpu) fn = theano.function([A, b, b_trans], [solver, solver_trans], mode=mode_with_gpu)
res = fn(A_val, b_val, b_val_trans) res = fn(A_val, b_val, b_val_trans)
x_res = numpy.array(res[0]) x_res = numpy.array(res[0])
x_res_trans = numpy.array(res[1]) x_res_trans = numpy.array(res[1])
utt.assert_allclose(x_res, x_val) utt.assert_allclose(x_val, x_res)
utt.assert_allclose(x_res_trans, x_val) utt.assert_allclose(x_val, x_res_trans)
def test_diag_solve(self): def test_diag_solve(self):
numpy.random.seed(1) numpy.random.seed(1)
...@@ -41,13 +48,24 @@ class TestCusolver(unittest.TestCase): ...@@ -41,13 +48,24 @@ class TestCusolver(unittest.TestCase):
1)).astype("float32") 1)).astype("float32")
self.run_gpu_solve(A_val, x_val) self.run_gpu_solve(A_val, x_val)
def test_bshape_solve(self):
    """Solve with a right-hand side b of shape (k, m) where m > k."""
    numpy.random.seed(1)
    diag_mat = numpy.asarray([[2, 0, 0], [0, 1, 0], [0, 0, 1]],
                             dtype="float32")
    n = diag_mat.shape[1]
    # One more column than rows so the wide-b code path is exercised.
    sol = numpy.random.uniform(-0.4, 0.4, (n, n + 1)).astype("float32")
    self.run_gpu_solve(diag_mat, sol)
def test_sym_solve(self): def test_sym_solve(self):
numpy.random.seed(1) numpy.random.seed(1)
A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32") A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
A_sym = (A_val + A_val.T) / 2.0 A_sym = numpy.dot(A_val, A_val.T)
x_val = numpy.random.uniform(-0.4, 0.4, (A_val.shape[1], x_val = numpy.random.uniform(-0.4, 0.4, (A_val.shape[1],
1)).astype("float32") 1)).astype("float32")
self.run_gpu_solve(A_sym, x_val) self.run_gpu_solve(A_sym, x_val, 'symmetric')
def test_orth_solve(self): def test_orth_solve(self):
numpy.random.seed(1) numpy.random.seed(1)
...@@ -63,3 +81,34 @@ class TestCusolver(unittest.TestCase): ...@@ -63,3 +81,34 @@ class TestCusolver(unittest.TestCase):
x_val = numpy.random.uniform(-0.4, 0.4, x_val = numpy.random.uniform(-0.4, 0.4,
(A_val.shape[1], 4)).astype("float32") (A_val.shape[1], 4)).astype("float32")
self.run_gpu_solve(A_val, x_val) self.run_gpu_solve(A_val, x_val)
def test_linalgerrsym_solve(self):
    """A singular symmetric matrix must make the symmetric solver raise."""
    numpy.random.seed(1)
    mat = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
    rhs = numpy.random.uniform(-0.4, 0.4,
                               (mat.shape[1], 4)).astype("float32")
    mat = numpy.dot(mat.T, mat)
    # Force singularity: one column is a linear combination of two others.
    mat[:, 2] = mat[:, 1] + mat[:, 3]
    sym_A = theano.tensor.matrix("A", dtype="float32")
    sym_b = theano.tensor.matrix("b", dtype="float32")
    out = gpu_solve(sym_A, sym_b, 'symmetric')
    solve_fn = theano.function([sym_A, sym_b], [out], mode=mode_with_gpu)
    self.assertRaises(LinAlgError, solve_fn, mat, rhs)
def test_linalgerr_solve(self):
    """A singular general matrix must make the LU-based solver raise."""
    numpy.random.seed(1)
    mat = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
    rhs = numpy.random.uniform(-0.4, 0.4,
                               (mat.shape[1], 4)).astype("float32")
    # Zeroing a full column makes the matrix singular.
    mat[:, 2] = 0
    gen_A = theano.tensor.matrix("A", dtype="float32")
    gen_b = theano.tensor.matrix("b", dtype="float32")
    out = gpu_solve(gen_A, gen_b, trans='T')
    solve_fn = theano.function([gen_A, gen_b], [out], mode=mode_with_gpu)
    self.assertRaises(LinAlgError, solve_fn, mat, rhs)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论