提交 82901783 authored 作者: Simon Lefrancois's avatar Simon Lefrancois 提交者: GitHub

Merge pull request #5421 from tfjgeorge/cusolver

Cusolver using Cholesky decomposition for symmetric matrices
...@@ -2,23 +2,54 @@ from __future__ import absolute_import, division, print_function ...@@ -2,23 +2,54 @@ from __future__ import absolute_import, division, print_function
import pkg_resources import pkg_resources
import theano import theano
import warnings
from theano import Op from theano import Op
from theano.gpuarray import basic_ops, GpuArrayType from theano.gpuarray import basic_ops, GpuArrayType
import numpy
from numpy.linalg.linalg import LinAlgError
try: try:
from pygpu import gpuarray import pygpu
except ImportError: except ImportError:
pass pass
cusolver_available = False cusolver_available = False
try: try:
import skcuda
from skcuda import cusolver from skcuda import cusolver
cusolver_available = True cusolver_available = True
except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound): except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound):
pass pass
cusolver_handle = None if cusolver_available:
# Add cusolver call as it is missing in skcuda
# SPOTRS
cusolver._libcusolver.cusolverDnSpotrs.restype = int
cusolver._libcusolver.cusolverDnSpotrs.argtypes = [cusolver.ctypes.c_void_p,
cusolver.ctypes.c_int,
cusolver.ctypes.c_int,
cusolver.ctypes.c_int,
cusolver.ctypes.c_void_p,
cusolver.ctypes.c_int,
cusolver.ctypes.c_void_p,
cusolver.ctypes.c_int,
cusolver.ctypes.c_void_p]
def cusolverDnSpotrs(handle, uplo, n, nrhs, A, lda,
                     B, ldb, devInfo):
    """
    Solve the real single-precision system ``A x = B`` for an ``A`` that
    has already been Cholesky-factorized by ``cusolverDnSpotrf``.

    This binding is missing from scikit-cuda, hence wrapped here via the
    library's ctypes handle (see the argtypes/restype setup above).

    Parameters
    ----------
    handle
        cuSOLVER dense context handle (from ``cusolverDnCreate``).
    uplo : int
        Fill mode of ``A``; callers here pass 0 (presumably
        CUBLAS_FILL_MODE_LOWER — confirm against the cuSOLVER enum).
    n : int
        Order of the matrix ``A``.
    nrhs : int
        Number of right-hand sides (columns of ``B``).
    A, B, devInfo
        Device pointers; converted to plain integers below as required
        by the ctypes signature.
    lda, ldb : int
        Leading dimensions of ``A`` and ``B``.

    Raises
    ------
    An exception via ``cusolver.cusolverCheckStatus`` when the call does
    not return a success status.

    References
    ----------
    `cusolverDn<t>potrs <http://docs.nvidia.com/cuda/cusolver/index.html#cuds-lt-t-gt-potrs>`_
    """
    # Device pointers must be passed as plain ints for ctypes marshalling.
    status = cusolver._libcusolver.cusolverDnSpotrs(handle, uplo, n, nrhs,
                                                    int(A), lda, int(B),
                                                    ldb, int(devInfo))
    cusolver.cusolverCheckStatus(status)
class GpuCusolverSolve(Op): class GpuCusolverSolve(Op):
...@@ -32,20 +63,26 @@ class GpuCusolverSolve(Op): ...@@ -32,20 +63,26 @@ class GpuCusolverSolve(Op):
""" """
__props__ = ('trans',) __props__ = ('A_structure', 'trans', 'inplace')
def __init__(self, trans='N', inplace=False): def __init__(self, A_structure='general', trans='N', inplace=False):
self.trans = trans self.trans = trans
self.inplace = inplace self.inplace = inplace
self.A_structure = A_structure
if self.inplace: if self.inplace:
self.destroy_map = {0: [0, 1]} self.destroy_map = {0: [0, 1]}
super(GpuCusolverSolve, self).__init__() super(GpuCusolverSolve, self).__init__()
def make_node(self, inp1, inp2): def make_node(self, inp1, inp2):
self.context = basic_ops.infer_context_name(inp1, inp2) if not cusolver_available:
raise RuntimeError('CUSOLVER is not available and '
'GpuCusolverSolve Op can not be constructed.')
if skcuda.__version__ <= '0.5.1':
warnings.warn('The GpuSolve op requires scikit-cuda > 0.5.1 to work with CUDA 8')
context_name = basic_ops.infer_context_name(inp1, inp2)
inp1 = basic_ops.as_gpuarray_variable(inp1, self.context) inp1 = basic_ops.as_gpuarray_variable(inp1, context_name)
inp2 = basic_ops.as_gpuarray_variable(inp2, self.context) inp2 = basic_ops.as_gpuarray_variable(inp2, context_name)
inp1 = basic_ops.gpu_contiguous(inp1) inp1 = basic_ops.gpu_contiguous(inp1)
inp2 = basic_ops.gpu_contiguous(inp2) inp2 = basic_ops.gpu_contiguous(inp2)
...@@ -60,35 +97,31 @@ class GpuCusolverSolve(Op): ...@@ -60,35 +97,31 @@ class GpuCusolverSolve(Op):
self, [inp1, inp2], self, [inp1, inp2],
[GpuArrayType('float32', [GpuArrayType('float32',
broadcastable=inp1.broadcastable, broadcastable=inp1.broadcastable,
context_name=self.context)()]) context_name=context_name)()])
def make_thunk(self,
node,
storage_map, _,
no_recycling=[],
impl=None):
if not cusolver_available:
raise RuntimeError('CUSOLVER is not available and '
'GpuCusolverSolve Op can not be constructed.')
inputs = [storage_map[v] for v in node.inputs] def prepare_node(self, node, storage_map, compute_map, impl):
outputs = [storage_map[v] for v in node.outputs] ctx = node.inputs[0].type.context
handle = getattr(ctx, 'cusolver_handle', None)
if handle is None:
with ctx:
ctx.cusolver_handle = cusolver.cusolverDnCreate()
global cusolver_handle def check_dev_info(self, dev_info):
if cusolver_handle is None: val = numpy.asarray(dev_info)[0]
cusolver_handle = cusolver.cusolverDnCreate() if val > 0:
raise LinAlgError('A is singular')
def thunk(): def perform(self, node, inputs, outputs):
context = inputs[0][0].context context = inputs[0][0].context
# Size of the matrices to invert. # Size of the matrices to invert.
z = outputs[0] z = outputs[0]
# Matrix. # Matrix.
A = inputs[0][0] A = inputs[0]
# Solution vectors. # Solution vectors.
b = inputs[1][0] b = inputs[1]
assert(len(A.shape) == 2) assert(len(A.shape) == 2)
assert(len(b.shape) == 2) assert(len(b.shape) == 2)
...@@ -109,12 +142,12 @@ class GpuCusolverSolve(Op): ...@@ -109,12 +142,12 @@ class GpuCusolverSolve(Op):
raise ValueError('A and b must be aligned.') raise ValueError('A and b must be aligned.')
lda = max(1, n) lda = max(1, n)
ldb = max(1, k, m) ldb = max(1, k)
# We copy A and b as cusolver operates inplace # We copy A and b as cusolver operates inplace
b = gpuarray.array(b, copy=True, order='F') b = pygpu.array(b, copy=True, order='F')
if not self.inplace: if not self.inplace:
A = gpuarray.array(A, copy=True) A = pygpu.array(A, copy=True)
A_ptr = A.gpudata A_ptr = A.gpudata
b_ptr = b.gpudata b_ptr = b.gpudata
...@@ -124,49 +157,58 @@ class GpuCusolverSolve(Op): ...@@ -124,49 +157,58 @@ class GpuCusolverSolve(Op):
if A.flags['C_CONTIGUOUS']: if A.flags['C_CONTIGUOUS']:
trans = 1 - trans trans = 1 - trans
workspace_size = cusolver.cusolverDnSgetrf_bufferSize( if self.A_structure == 'symmetric':
cusolver_handle, n, n, A_ptr, lda) with context:
workspace_size = cusolver.cusolverDnSpotrf_bufferSize(
context.cusolver_handle, 0, n, A_ptr, lda)
if (thunk.workspace is None or workspace = pygpu.zeros(workspace_size, dtype='float32',
thunk.workspace.size != workspace_size):
thunk.workspace = gpuarray.zeros((workspace_size,),
dtype='float32',
context=context) context=context)
if thunk.pivots is None or thunk.pivots.size != min(n, n): dev_info = pygpu.zeros((1,), dtype='int32', context=context)
thunk.pivots = gpuarray.zeros((min(n, n),),
dtype='float32', workspace_ptr = workspace.gpudata
context=context) dev_info_ptr = dev_info.gpudata
with context:
cusolver.cusolverDnSpotrf(
context.cusolver_handle, 0, n, A_ptr, lda, workspace_ptr,
workspace_size, dev_info_ptr)
self.check_dev_info(dev_info)
cusolverDnSpotrs(
context.cusolver_handle, 0, n, m, A_ptr, lda,
b_ptr, ldb, dev_info_ptr)
else:
# general case for A
with context:
workspace_size = cusolver.cusolverDnSgetrf_bufferSize(
context.cusolver_handle, n, n, A_ptr, lda)
if thunk.dev_info is None: workspace = pygpu.zeros(workspace_size, dtype='float32',
thunk.dev_info = gpuarray.zeros((1,),
dtype='float32',
context=context) context=context)
workspace_ptr = thunk.workspace.gpudata pivots = pygpu.zeros(n, dtype='int32', context=context)
pivots_ptr = thunk.pivots.gpudata
dev_info_ptr = thunk.dev_info.gpudata dev_info = pygpu.zeros((1,), dtype='int32', context=context)
workspace_ptr = workspace.gpudata
pivots_ptr = pivots.gpudata
dev_info_ptr = dev_info.gpudata
with context:
cusolver.cusolverDnSgetrf( cusolver.cusolverDnSgetrf(
cusolver_handle, n, n, A_ptr, lda, workspace_ptr, context.cusolver_handle, n, n, A_ptr, lda, workspace_ptr,
pivots_ptr, dev_info_ptr) pivots_ptr, dev_info_ptr)
self.check_dev_info(dev_info)
cusolver.cusolverDnSgetrs( cusolver.cusolverDnSgetrs(
cusolver_handle, trans, n, m, A_ptr, lda, context.cusolver_handle, trans, n, m, A_ptr, lda,
pivots_ptr, b_ptr, ldb, dev_info_ptr) pivots_ptr, b_ptr, ldb, dev_info_ptr)
z[0] = b z[0] = b
thunk.inputs = inputs
thunk.outputs = outputs
thunk.lazy = False
thunk.workspace = None
thunk.pivots = None
thunk.dev_info = None
return thunk
def gpu_solve(A, b, trans='N'): def gpu_solve(A, b, A_structure='general', trans='N'):
return GpuCusolverSolve(trans)(A, b) return GpuCusolverSolve(A_structure, trans)(A, b)
...@@ -7,6 +7,8 @@ import theano ...@@ -7,6 +7,8 @@ import theano
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from .config import mode_with_gpu from .config import mode_with_gpu
from numpy.linalg.linalg import LinAlgError
# Skip tests if cuda_ndarray is not available. # Skip tests if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
from theano.gpuarray.linalg import (cusolver_available, gpu_solve) from theano.gpuarray.linalg import (cusolver_available, gpu_solve)
...@@ -16,7 +18,7 @@ if not cusolver_available: ...@@ -16,7 +18,7 @@ if not cusolver_available:
class TestCusolver(unittest.TestCase): class TestCusolver(unittest.TestCase):
def run_gpu_solve(self, A_val, x_val): def run_gpu_solve(self, A_val, x_val, A_struct=None):
b_val = numpy.dot(A_val, x_val) b_val = numpy.dot(A_val, x_val)
b_val_trans = numpy.dot(A_val.T, x_val) b_val_trans = numpy.dot(A_val.T, x_val)
...@@ -24,14 +26,19 @@ class TestCusolver(unittest.TestCase): ...@@ -24,14 +26,19 @@ class TestCusolver(unittest.TestCase):
b = theano.tensor.matrix("b", dtype="float32") b = theano.tensor.matrix("b", dtype="float32")
b_trans = theano.tensor.matrix("b", dtype="float32") b_trans = theano.tensor.matrix("b", dtype="float32")
if A_struct is None:
solver = gpu_solve(A, b) solver = gpu_solve(A, b)
solver_trans = gpu_solve(A, b_trans, trans='T') solver_trans = gpu_solve(A, b_trans, trans='T')
else:
solver = gpu_solve(A, b, A_struct)
solver_trans = gpu_solve(A, b_trans, A_struct, trans='T')
fn = theano.function([A, b, b_trans], [solver, solver_trans], mode=mode_with_gpu) fn = theano.function([A, b, b_trans], [solver, solver_trans], mode=mode_with_gpu)
res = fn(A_val, b_val, b_val_trans) res = fn(A_val, b_val, b_val_trans)
x_res = numpy.array(res[0]) x_res = numpy.array(res[0])
x_res_trans = numpy.array(res[1]) x_res_trans = numpy.array(res[1])
utt.assert_allclose(x_res, x_val) utt.assert_allclose(x_val, x_res)
utt.assert_allclose(x_res_trans, x_val) utt.assert_allclose(x_val, x_res_trans)
def test_diag_solve(self): def test_diag_solve(self):
numpy.random.seed(1) numpy.random.seed(1)
...@@ -41,13 +48,24 @@ class TestCusolver(unittest.TestCase): ...@@ -41,13 +48,24 @@ class TestCusolver(unittest.TestCase):
1)).astype("float32") 1)).astype("float32")
self.run_gpu_solve(A_val, x_val) self.run_gpu_solve(A_val, x_val)
def test_bshape_solve(self):
    """Solve with a right-hand side b of shape (k, m) where m > k."""
    numpy.random.seed(1)
    diag_mat = numpy.asarray([[2, 0, 0], [0, 1, 0], [0, 0, 1]],
                             dtype="float32")
    n = diag_mat.shape[1]
    # One more column than rows so the wide-b code path is exercised.
    sol = numpy.random.uniform(-0.4, 0.4, (n, n + 1)).astype("float32")
    self.run_gpu_solve(diag_mat, sol)
def test_sym_solve(self): def test_sym_solve(self):
numpy.random.seed(1) numpy.random.seed(1)
A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32") A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
A_sym = (A_val + A_val.T) / 2.0 A_sym = numpy.dot(A_val, A_val.T)
x_val = numpy.random.uniform(-0.4, 0.4, (A_val.shape[1], x_val = numpy.random.uniform(-0.4, 0.4, (A_val.shape[1],
1)).astype("float32") 1)).astype("float32")
self.run_gpu_solve(A_sym, x_val) self.run_gpu_solve(A_sym, x_val, 'symmetric')
def test_orth_solve(self): def test_orth_solve(self):
numpy.random.seed(1) numpy.random.seed(1)
...@@ -63,3 +81,34 @@ class TestCusolver(unittest.TestCase): ...@@ -63,3 +81,34 @@ class TestCusolver(unittest.TestCase):
x_val = numpy.random.uniform(-0.4, 0.4, x_val = numpy.random.uniform(-0.4, 0.4,
(A_val.shape[1], 4)).astype("float32") (A_val.shape[1], 4)).astype("float32")
self.run_gpu_solve(A_val, x_val) self.run_gpu_solve(A_val, x_val)
def test_linalgerrsym_solve(self):
    """A singular symmetric matrix must make the symmetric solver raise."""
    numpy.random.seed(1)
    mat = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
    rhs = numpy.random.uniform(-0.4, 0.4,
                               (mat.shape[1], 4)).astype("float32")
    mat = numpy.dot(mat.T, mat)
    # Force singularity: one column is a linear combination of two others.
    mat[:, 2] = mat[:, 1] + mat[:, 3]
    sym_A = theano.tensor.matrix("A", dtype="float32")
    sym_b = theano.tensor.matrix("b", dtype="float32")
    out = gpu_solve(sym_A, sym_b, 'symmetric')
    solve_fn = theano.function([sym_A, sym_b], [out], mode=mode_with_gpu)
    self.assertRaises(LinAlgError, solve_fn, mat, rhs)
def test_linalgerr_solve(self):
    """A singular general matrix must make the LU-based solver raise."""
    numpy.random.seed(1)
    mat = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
    rhs = numpy.random.uniform(-0.4, 0.4,
                               (mat.shape[1], 4)).astype("float32")
    # Zeroing a full column makes the matrix singular.
    mat[:, 2] = 0
    gen_A = theano.tensor.matrix("A", dtype="float32")
    gen_b = theano.tensor.matrix("b", dtype="float32")
    out = gpu_solve(gen_A, gen_b, trans='T')
    solve_fn = theano.function([gen_A, gen_b], [out], mode=mode_with_gpu)
    self.assertRaises(LinAlgError, solve_fn, mat, rhs)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论