Commit 276b9e51 authored by Frédéric Bastien, committed by GitHub

Merge pull request #5320 from tfjgeorge/cusolver-ccw

Gpu op for solve using cusolver (CCW #5191)
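For orientation before the diff: a minimal sketch of how the new `gpu_solve` helper introduced by this patch is meant to be called. It mirrors the tests added below; the random data is illustrative, and a configured gpuarray backend plus scikit-cuda (skcuda) are assumed.

import numpy
import theano
from theano.gpuarray.linalg import gpu_solve

# Build a symbolic float32 system A x = b (the op only supports float32).
A = theano.tensor.matrix('A', dtype='float32')
b = theano.tensor.matrix('b', dtype='float32')
fn = theano.function([A, b], gpu_solve(A, b))

A_val = numpy.random.uniform(-1, 1, (5, 5)).astype('float32')
b_val = numpy.random.uniform(-1, 1, (5, 3)).astype('float32')
x_val = fn(A_val, b_val)
# x_val should match numpy.linalg.solve(A_val, b_val) to float32 tolerance.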
from __future__ import absolute_import, division, print_function

import pkg_resources

import theano
from theano import Op
from theano.gpuarray import basic_ops, GpuArrayType

try:
    from pygpu import gpuarray
except ImportError:
    pass

cusolver_available = False
try:
    from skcuda import cusolver
    cusolver_available = True
except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound):
    pass

cusolver_handle = None


class GpuCusolverSolve(Op):
    """
    CUSOLVER GPU solver OP.

    Parameters
    ----------
    trans
        Whether to take the transpose of the input matrix or not.

    """

    # Include 'inplace' in __props__ so that inplace and non-inplace
    # instances do not compare (and get merged) as equal.
    __props__ = ('trans', 'inplace')

    def __init__(self, trans='N', inplace=False):
        self.trans = trans
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0, 1]}
        super(GpuCusolverSolve, self).__init__()

    def make_node(self, inp1, inp2):
        self.context = basic_ops.infer_context_name(inp1, inp2)
        inp1 = basic_ops.as_gpuarray_variable(inp1, self.context)
        inp2 = basic_ops.as_gpuarray_variable(inp2, self.context)
        inp1 = basic_ops.gpu_contiguous(inp1)
        inp2 = basic_ops.gpu_contiguous(inp2)
        # This op can only operate on float32 matrices.
        assert inp1.ndim == 2
        assert inp2.ndim == 2
        assert inp1.dtype == 'float32'
        assert inp2.dtype == 'float32'
        return theano.Apply(
            self, [inp1, inp2],
            [GpuArrayType('float32',
                          broadcastable=inp1.broadcastable,
                          context_name=self.context)()])

    def make_thunk(self,
                   node,
                   storage_map, _,
                   no_recycling=[],
                   impl=None):
        if not cusolver_available:
            raise RuntimeError('CUSOLVER is not available and '
                               'GpuCusolverSolve Op cannot be constructed.')
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

        global cusolver_handle
        if cusolver_handle is None:
            cusolver_handle = cusolver.cusolverDnCreate()

        def thunk():
            context = inputs[0][0].context

            # Output storage (will receive the solution).
            z = outputs[0]

            # Matrix.
            A = inputs[0][0]

            # Solution vectors.
            b = inputs[1][0]

            assert(len(A.shape) == 2)
            assert(len(b.shape) == 2)

            if self.trans in ['T', 'C']:
                trans = 1
                l, n = A.shape
                k, m = b.shape
            elif self.trans == 'N':
                trans = 0
                n, l = A.shape
                k, m = b.shape
            else:
                raise ValueError('Invalid value for trans')
            if l != n:
                raise ValueError('A must be a square matrix')
            if n != k:
                raise ValueError('A and b must be aligned.')

            lda = max(1, n)
            ldb = max(1, k, m)

            # cusolver operates in-place, so work on copies: b always
            # (it receives the solution), A unless inplace was requested.
            b = gpuarray.array(b, copy=True, order='F')
            if not self.inplace:
                A = gpuarray.array(A, copy=True)
            A_ptr = A.gpudata
            b_ptr = b.gpudata

            # cusolver expects an F-ordered matrix. Instead of explicitly
            # converting A between C and F order, we flip the "transpose"
            # flag: the C-ordered A is the transpose of its F-ordered view.
            if A.flags['C_CONTIGUOUS']:
                trans = 1 - trans

            # Query the scratch-space size needed by the LU factorization.
            workspace_size = cusolver.cusolverDnSgetrf_bufferSize(
                cusolver_handle, n, n, A_ptr, lda)

            if (thunk.workspace is None or
                    thunk.workspace.size != workspace_size):
                thunk.workspace = gpuarray.zeros((workspace_size,),
                                                 dtype='float32',
                                                 context=context)

            # getrf/getrs write integer pivot indices and an integer
            # status flag, so these buffers must be int32.
            if thunk.pivots is None or thunk.pivots.size != n:
                thunk.pivots = gpuarray.zeros((n,),
                                              dtype='int32',
                                              context=context)

            if thunk.dev_info is None:
                thunk.dev_info = gpuarray.zeros((1,),
                                                dtype='int32',
                                                context=context)

            workspace_ptr = thunk.workspace.gpudata
            pivots_ptr = thunk.pivots.gpudata
            dev_info_ptr = thunk.dev_info.gpudata

            # LU-factorize A in place (getrf: A = P * L * U).
            cusolver.cusolverDnSgetrf(
                cusolver_handle, n, n, A_ptr, lda, workspace_ptr,
                pivots_ptr, dev_info_ptr)

            # Solve the factorized system (getrs); the solution overwrites b.
            cusolver.cusolverDnSgetrs(
                cusolver_handle, trans, n, m, A_ptr, lda,
                pivots_ptr, b_ptr, ldb, dev_info_ptr)

            z[0] = b

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False
        thunk.workspace = None
        thunk.pivots = None
        thunk.dev_info = None

        return thunk


def gpu_solve(A, b, trans='N'):
    return GpuCusolverSolve(trans)(A, b)
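The thunk above is the cusolver flavour of a standard LU solve: getrf factorizes A into P, L and U, and getrs then solves the two triangular systems against b. For intuition only (not part of the patch), the same two-step pipeline on the CPU with SciPy:

import numpy
from scipy.linalg import lu_factor, lu_solve

A = numpy.random.uniform(-1, 1, (5, 5)).astype('float32')
b = numpy.random.uniform(-1, 1, (5, 3)).astype('float32')

# Factorization step (analogue of cusolverDnSgetrf):
# returns the packed LU factors and the pivot indices.
lu, piv = lu_factor(A)

# Solve step (analogue of cusolverDnSgetrs); trans=1 would solve A.T x = b.
x = lu_solve((lu, piv), b, trans=0)
assert numpy.allclose(A.dot(x), b, atol=1e-4)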
...@@ -32,6 +32,7 @@ from theano.tensor.nnet.abstract_conv import (BaseAbstractConv,
                                              AbstractConv3d_gradWeights,
                                              AbstractConv3d_gradInputs)
import theano.tensor.signal.pool as pool
import theano.tensor.slinalg as slinalg
from theano.tests.breakpoint import PdbBreakpoint
...@@ -69,6 +70,7 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
                        GpuAdvancedIncSubtensor1_dev20)
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
from .reduction import GpuMaxAndArgmax
from .linalg import (GpuCusolverSolve, cusolver_available)
_logger = logging.getLogger("theano.gpuarray.opt")
...@@ -1911,6 +1913,16 @@ def _scan_type_infer(node):
def local_gpu_maxandargmax(op, context_name, inputs, outputs):
    return GpuMaxAndArgmax(op.get_params(None))
# solve
@register_opt('fast_compile')
@op_lifter([slinalg.Solve])
@register_opt2([theano.tensor.slinalg.Solve], 'fast_compile')
def local_gpu_solve(op, context_name, inputs, outputs):
    if not cusolver_available:
        return
    return GpuCusolverSolve()
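Once this lifter is registered, compiling `theano.tensor.slinalg.solve` with the GPU enabled should leave a `GpuCusolverSolve` node where the CPU `Solve` used to be. A quick way to check by eye, assuming a GPU-enabled Theano configuration (e.g. `device=cuda`):

import theano
import theano.tensor.slinalg as slinalg

A = theano.tensor.fmatrix()
b = theano.tensor.fmatrix()
f = theano.function([A, b], slinalg.solve(A, b))
# GpuCusolverSolve should appear in place of Solve in the printout.
theano.printing.debugprint(f)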
# Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled.
optdb.register('gpua_scanOp_make_inplace',
...
from __future__ import absolute_import, division, print_function

import unittest

import numpy

import theano
from theano.tests import unittest_tools as utt

from .config import mode_with_gpu

# Skip the tests if cusolver (from scikit-cuda) is not available.
from nose.plugins.skip import SkipTest
from theano.gpuarray.linalg import (cusolver_available, gpu_solve)

if not cusolver_available:
    raise SkipTest('Optional package scikit-cuda not available')


class TestCusolver(unittest.TestCase):

    def run_gpu_solve(self, A_val, x_val):
        b_val = numpy.dot(A_val, x_val)
        b_val_trans = numpy.dot(A_val.T, x_val)
        A = theano.tensor.matrix("A", dtype="float32")
        b = theano.tensor.matrix("b", dtype="float32")
        b_trans = theano.tensor.matrix("b_trans", dtype="float32")

        solver = gpu_solve(A, b)
        solver_trans = gpu_solve(A, b_trans, trans='T')
        fn = theano.function([A, b, b_trans], [solver, solver_trans],
                             mode=mode_with_gpu)
        res = fn(A_val, b_val, b_val_trans)
        x_res = numpy.array(res[0])
        x_res_trans = numpy.array(res[1])
        utt.assert_allclose(x_res, x_val)
        utt.assert_allclose(x_res_trans, x_val)

    def test_diag_solve(self):
        numpy.random.seed(1)
        A_val = numpy.asarray([[2, 0, 0], [0, 1, 0], [0, 0, 1]],
                              dtype="float32")
        x_val = numpy.random.uniform(-0.4, 0.4,
                                     (A_val.shape[1], 1)).astype("float32")
        self.run_gpu_solve(A_val, x_val)

    def test_sym_solve(self):
        numpy.random.seed(1)
        A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        A_sym = (A_val + A_val.T) / 2.0
        x_val = numpy.random.uniform(-0.4, 0.4,
                                     (A_val.shape[1], 1)).astype("float32")
        self.run_gpu_solve(A_sym, x_val)

    def test_orth_solve(self):
        numpy.random.seed(1)
        A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        A_orth = numpy.linalg.svd(A_val)[0]
        x_val = numpy.random.uniform(-0.4, 0.4,
                                     (A_orth.shape[1], 1)).astype("float32")
        self.run_gpu_solve(A_orth, x_val)

    def test_uni_rand_solve(self):
        numpy.random.seed(1)
        A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        x_val = numpy.random.uniform(-0.4, 0.4,
                                     (A_val.shape[1], 4)).astype("float32")
        self.run_gpu_solve(A_val, x_val)
...@@ -4,6 +4,7 @@ from nose.tools import assert_raises
import theano
from theano import tensor
import theano.tensor.slinalg as slinalg
from theano.tests.breakpoint import PdbBreakpoint
from theano.tests import unittest_tools as utt, test_ifelse
from theano.tensor.tests import test_basic
...@@ -16,6 +17,7 @@ from ..basic_ops import (
from ..blas import GpuGemm
from ..elemwise import GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise
from ..subtensor import GpuSubtensor
from ..linalg import GpuCusolverSolve
from .config import mode_with_gpu, test_ctx_name
...@@ -496,3 +498,18 @@ def test_no_complex():
    stft_out = tensor.exp(width_var * freq_var) * signal_var
    theano.function([width_var, freq_var, signal_var], stft_out,
                    mode=mode_with_gpu)
def test_local_lift_solve():
    A = tensor.fmatrix()
    b = tensor.fmatrix()
    o = slinalg.solve(A, b)
    f_cpu = theano.function([A, b], o)
    f_gpu = theano.function([A, b], o, mode=mode_with_gpu)
    assert not any(isinstance(n.op, slinalg.Solve)
                   for n in f_gpu.maker.fgraph.apply_nodes)
    assert any(isinstance(n.op, GpuCusolverSolve)
               for n in f_gpu.maker.fgraph.apply_nodes)
    A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
    b_val = numpy.random.uniform(-0.4, 0.4, (5, 3)).astype("float32")
    utt.assert_allclose(f_cpu(A_val, b_val), f_gpu(A_val, b_val))
...@@ -49,7 +49,6 @@ from theano.sandbox.cuda.blas import (
    GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights)
from theano.sandbox.cuda.blas import gpu_gemv_inplace
from theano.sandbox.cuda.cula import gpu_solve, cula_available
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace
...@@ -83,7 +82,6 @@ from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.blas import _is_real_vector, _is_real_matrix
from theano.tensor import nlinalg
from theano.tensor import slinalg
from theano.tensor.nnet.Conv3D import Conv3D
from theano.tests.breakpoint import PdbBreakpoint
...@@ -700,38 +698,6 @@ def local_gpu_dot22scalar(node):
    return False
@register_opt()
@local_optimizer([gpu_from_host, slinalg.Solve])
def local_gpu_solve(node):
    """
    gpu_from_host(CpuSolve) -> GpuSolve(gpu_from_host)
    CpuSolve(host_from_gpu) -> host_from_gpu(GpuSolve)
    """
    if not cula_available:
        return
    if node.outputs[0].dtype != 'float32':
        return
    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if (host_input.owner and
                isinstance(host_input.owner.op, slinalg.Solve)):
            x, y = host_input.owner.inputs
            return [gpu_solve(as_cuda_ndarray_variable(x),
                              as_cuda_ndarray_variable(y))]
    if isinstance(node.op, slinalg.Solve):
        if any([i.owner and isinstance(i.owner.op, HostFromGpu)
                for i in node.inputs]):
            x, y = node.inputs
            return [host_from_gpu(
                gpu_solve(as_cuda_ndarray_variable(x),
                          as_cuda_ndarray_variable(y)))]
    return False
@register_opt()
@local_optimizer([gpu_from_host, tensor.blas_c.CGemv, tensor.blas.Gemv])
def local_gpu_gemv(node):
...