Commit 627b63af authored by wonghang

Add float64 support for GpuCholesky and GpuCusolverSolve

Parent 13be5b67
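For context, a minimal sketch of what this commit enables, modeled on the new TestGpuCholesky64 test added below. The import paths and `mode_with_gpu` setup are assumptions taken from Theano's GPU test suite, not part of the commit itself:

import numpy as np
import theano
import theano.tensor as T
from theano.gpuarray.linalg import GpuCholesky          # assumed import path
from theano.gpuarray.tests.config import mode_with_gpu  # assumed, as in the test suite

# Before this commit the op asserted dtype == 'float32'; float64 now works too.
A = T.matrix("A", dtype="float64")
chol_A = GpuCholesky(lower=True, inplace=False)(A)
fn = theano.function([A], chol_A, mode=mode_with_gpu)

M = np.random.normal(size=(3, 3))
A_val = M.dot(M.T)                 # positive definite for any non-singular M
L_val = np.array(fn(A_val))
assert np.allclose(L_val.dot(L_val.T), A_val)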
@@ -125,15 +125,13 @@ class GpuCusolverSolve(Op):
         inp1 = gpu_contiguous(inp1)
         inp2 = gpu_contiguous(inp2)

-        # this op can only operate on float32 matrices
         assert inp1.ndim == 2
         assert inp2.ndim == 2
-        assert inp1.dtype == 'float32'
-        assert inp2.dtype == 'float32'
+        assert inp1.dtype == inp2.dtype

         return theano.Apply(
             self, [inp1, inp2],
-            [GpuArrayType('float32',
+            [GpuArrayType(inp1.dtype,
                           broadcastable=inp1.broadcastable,
                           context_name=context_name)()])
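This make_node change replaces the hard-coded float32 output type with the input dtype, so the precision of the graph output now follows the inputs. The CPU analogue of the intended behaviour (NumPy only):

import numpy as np

A = np.random.rand(3, 3) + 3 * np.eye(3)   # non-singular by construction
b = np.random.rand(3, 1)
x = np.linalg.solve(A.astype('float64'), b.astype('float64'))
assert x.dtype == np.dtype('float64')      # output dtype follows the inputs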
@@ -192,12 +190,29 @@ class GpuCusolverSolve(Op):
         if A.flags['C_CONTIGUOUS']:
             trans = 1 - trans

+        if A.dtype == 'float32':
+            potrf_bufferSize = cusolver.cusolverDnSpotrf_bufferSize
+            potrf = cusolver.cusolverDnSpotrf
+            potrs = cusolverDnSpotrs
+            getrf_bufferSize = cusolver.cusolverDnSgetrf_bufferSize
+            getrf = cusolver.cusolverDnSgetrf
+            getrs = cusolver.cusolverDnSgetrs
+        elif A.dtype == 'float64':
+            potrf_bufferSize = cusolver.cusolverDnDpotrf_bufferSize
+            potrf = cusolver.cusolverDnDpotrf
+            potrs = cusolverDnDpotrs
+            getrf_bufferSize = cusolver.cusolverDnDgetrf_bufferSize
+            getrf = cusolver.cusolverDnDgetrf
+            getrs = cusolver.cusolverDnDgetrs
+        else:
+            raise ValueError("Unsupported dtype")
+
         if self.A_structure == 'symmetric':
             with context:
-                workspace_size = cusolver.cusolverDnSpotrf_bufferSize(
+                workspace_size = potrf_bufferSize(
                     context.cusolver_handle, 0, n, A_ptr, lda)

-            workspace = pygpu.zeros(workspace_size, dtype='float32',
+            workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
                                     context=context)

             dev_info = pygpu.zeros((1,), dtype='int32', context=context)
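The new dispatch block follows LAPACK naming: each routine exists with an 'S' prefix for float32 and a 'D' prefix for float64, and the diff binds the matching set once before use (potrs is referenced without the cusolver. prefix because its wrapper is defined locally in this module). A standalone sketch of the same idiom, with a hypothetical helper name:

def pick_routines(cusolver, dtype):
    # Map the array dtype to the LAPACK-style single/double prefix.
    prefix = {'float32': 'S', 'float64': 'D'}.get(str(dtype))
    if prefix is None:
        raise ValueError("Unsupported dtype: %s" % dtype)
    return (getattr(cusolver, 'cusolverDn%spotrf_bufferSize' % prefix),
            getattr(cusolver, 'cusolverDn%spotrf' % prefix),
            getattr(cusolver, 'cusolverDn%sgetrf_bufferSize' % prefix),
            getattr(cusolver, 'cusolverDn%sgetrf' % prefix),
            getattr(cusolver, 'cusolverDn%sgetrs' % prefix))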
@@ -206,22 +221,22 @@ class GpuCusolverSolve(Op):
             dev_info_ptr = dev_info.gpudata

             with context:
-                cusolver.cusolverDnSpotrf(
+                potrf(
                     context.cusolver_handle, 0, n, A_ptr, lda, workspace_ptr,
                     workspace_size, dev_info_ptr)

                 self.check_dev_info(dev_info)

-                cusolverDnSpotrs(
+                potrs(
                     context.cusolver_handle, 0, n, m, A_ptr, lda,
                     b_ptr, ldb, dev_info_ptr)

         else:
             # general case for A
             with context:
-                workspace_size = cusolver.cusolverDnSgetrf_bufferSize(
+                workspace_size = getrf_bufferSize(
                     context.cusolver_handle, n, n, A_ptr, lda)

-            workspace = pygpu.zeros(workspace_size, dtype='float32',
+            workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
                                     context=context)

             pivots = pygpu.zeros(n, dtype='int32', context=context)
@@ -233,12 +248,12 @@ class GpuCusolverSolve(Op):
             dev_info_ptr = dev_info.gpudata

             with context:
-                cusolver.cusolverDnSgetrf(
+                getrf(
                     context.cusolver_handle, n, n, A_ptr, lda, workspace_ptr,
                     pivots_ptr, dev_info_ptr)

                 self.check_dev_info(dev_info)

-                cusolver.cusolverDnSgetrs(
+                getrs(
                     context.cusolver_handle, trans, n, m, A_ptr, lda,
                     pivots_ptr, b_ptr, ldb, dev_info_ptr)
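For reference, the getrf/getrs pair used in the general branch is LU factorization with partial pivoting followed by triangular solves against the factors. The CPU equivalent via SciPy (SciPy assumed available):

import numpy as np
from scipy.linalg import lu_factor, lu_solve

A = np.random.rand(4, 4) + 4 * np.eye(4)  # diagonally dominant, hence non-singular
b = np.random.rand(4)
lu, piv = lu_factor(A)        # analogue of getrf: in-place LU with pivot indices
x = lu_solve((lu, piv), b)    # analogue of getrs: solve using the factors
assert np.allclose(A.dot(x), b)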
@@ -275,14 +290,12 @@ class GpuCublasTriangularSolve(Op):
         inp1 = gpu_contiguous(inp1)
         inp2 = gpu_contiguous(inp2)

-        # this op can only operate on float32 matrices
         assert inp1.ndim == 2
         assert inp2.ndim in [1, 2]
-        assert inp1.dtype == 'float32'
-        assert inp2.dtype == 'float32'
+        assert inp1.dtype == inp2.dtype

         return theano.Apply(self, [inp1, inp2],
-                            [GpuArrayType('float32',
+                            [GpuArrayType(inp1.dtype,
                                           broadcastable=inp2.broadcastable,
                                           context_name=context_name)()])
@@ -347,13 +360,22 @@ class GpuCublasTriangularSolve(Op):
         # indicates elements on diagonal of matrix A may not be unity
         diag = 'n'

+        if A.dtype == 'float32':
+            trsv = cublas.cublasStrsv
+            trsm = cublas.cublasStrsm
+        elif A.dtype == 'float64':
+            trsv = cublas.cublasDtrsv
+            trsm = cublas.cublasDtrsm
+        else:
+            raise ValueError("Unsupported dtype")
+
         with ctx:
             if b.ndim == 1:
                 # matrix vector solve
-                cublas.cublasStrsv(ctx.cublas_handle, uplo, trans, diag, n,
-                                   A_ptr, lda, b_ptr, 1)
+                trsv(ctx.cublas_handle, uplo, trans, diag, n,
+                     A_ptr, lda, b_ptr, 1)
             else:
-                cublas.cublasStrsm(ctx.cublas_handle, side, uplo, trans, diag,
-                                   n, m, alpha, A_ptr, lda, b_ptr, ldb)
+                trsm(ctx.cublas_handle, side, uplo, trans, diag,
+                     n, m, alpha, A_ptr, lda, b_ptr, ldb)

         x[0] = b
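trsv handles a single right-hand-side vector while trsm handles a matrix of right-hand sides; both solve against a triangular A, which is why the branch above switches on b.ndim. A CPU reference using SciPy (assumed available):

import numpy as np
from scipy.linalg import solve_triangular

A = np.tril(np.random.rand(4, 4)) + 4 * np.eye(4)  # well-conditioned lower triangular
b = np.random.rand(4)       # trsv case: one right-hand side
B = np.random.rand(4, 3)    # trsm case: several right-hand sides
x = solve_triangular(A, b, lower=True)
X = solve_triangular(A, B, lower=True)
assert np.allclose(A.dot(x), b) and np.allclose(A.dot(X), B)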
@@ -411,11 +433,7 @@ class GpuCholesky(Op):
         inp = gpu_contiguous(inp)

-        # this op can only operate on float32 matrices
-        # because of current implementation of triu/tril.
-        # TODO: support float64 for triu/tril in GpuArray and for GpuCholesky/GpuCusolverSolve in Theano.
         assert inp.ndim == 2
-        assert inp.dtype == 'float32'

         return theano.Apply(self, [inp], [inp.type()])
@@ -453,11 +471,20 @@ class GpuCholesky(Op):
         L_ptr = L.gpudata

+        if A.dtype == 'float32':
+            potrf_bufferSize = cusolver.cusolverDnSpotrf_bufferSize
+            potrf = cusolver.cusolverDnSpotrf
+        elif A.dtype == 'float64':
+            potrf_bufferSize = cusolver.cusolverDnDpotrf_bufferSize
+            potrf = cusolver.cusolverDnDpotrf
+        else:
+            raise ValueError("Unsupported dtype")
+
         with context:
-            workspace_size = cusolver.cusolverDnSpotrf_bufferSize(
+            workspace_size = potrf_bufferSize(
                 context.cusolver_handle, l_parameter, n, L_ptr, lda)

-        workspace = pygpu.zeros(workspace_size, dtype='float32',
+        workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
                                 context=context)

         dev_info = pygpu.zeros((1,), dtype='int32', context=context)
@@ -465,7 +492,7 @@ class GpuCholesky(Op):
         workspace_ptr = workspace.gpudata
         dev_info_ptr = dev_info.gpudata

-        cusolver.cusolverDnSpotrf(
+        potrf(
             context.cusolver_handle, l_parameter, n, L_ptr, lda, workspace_ptr,
             workspace_size, dev_info_ptr)
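Every cusolverDn factorization in this diff follows the same three-step protocol: query the scratch-space size, allocate a workspace in the matrix dtype plus a one-element int32 status word, then factorize and inspect dev_info. A condensed sketch of that shape (the function and parameter names mirror the diff; this is an illustration, not a drop-in replacement):

def run_potrf(context, pygpu, potrf_bufferSize, potrf, uplo, n, A_ptr, lda, dtype):
    # 1. ask cuSOLVER how much scratch space this factorization needs
    workspace_size = potrf_bufferSize(context.cusolver_handle, uplo, n, A_ptr, lda)
    # 2. the workspace must match the matrix dtype; dev_info is always int32
    workspace = pygpu.zeros(workspace_size, dtype=dtype, context=context)
    dev_info = pygpu.zeros((1,), dtype='int32', context=context)
    # 3. factorize in place; a non-zero dev_info flags a failure
    #    (e.g. the matrix is not positive definite)
    potrf(context.cusolver_handle, uplo, n, A_ptr, lda,
          workspace.gpudata, workspace_size, dev_info.gpudata)
    return dev_info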
...
@@ -214,6 +214,97 @@ class TestGpuCholesky(unittest.TestCase):
         fn = self.get_gpu_cholesky_func(True, False)
         self.assertRaises(LinAlgError, fn, A_val)

+
+class TestGpuCholesky64(unittest.TestCase):
+
+    def setUp(self):
+        if not cusolver_available:
+            self.skipTest('Optional package scikits.cuda.cusolver not available')
+        utt.seed_rng()
+
+    def get_gpu_cholesky_func(self, lower=True, inplace=False):
+        # Helper function to compile a function from the GPU Cholesky op.
+        A = theano.tensor.matrix("A", dtype="float64")
+        cholesky_op = GpuCholesky(lower=lower, inplace=inplace)
+        chol_A = cholesky_op(A)
+        return theano.function([A], chol_A, accept_inplace=inplace,
+                               mode=mode_with_gpu)
+
+    def compare_gpu_cholesky_to_np(self, A_val, lower=True, inplace=False):
+        # Helper function to compare op output to np.linalg.cholesky output.
+        chol_A_val = np.linalg.cholesky(A_val)
+        if not lower:
+            chol_A_val = chol_A_val.T
+        fn = self.get_gpu_cholesky_func(lower, inplace)
+        res = fn(A_val)
+        chol_A_res = np.array(res)
+        utt.assert_allclose(chol_A_res, chol_A_val)
+
+    def test_gpu_cholesky_opt(self):
+        if not imported_scipy:
+            self.skipTest('SciPy is not enabled, skipping test')
+        A = theano.tensor.matrix("A", dtype="float64")
+        fn = theano.function([A], cholesky(A), mode=mode_with_gpu)
+        assert any([isinstance(node.op, GpuCholesky)
+                    for node in fn.maker.fgraph.toposort()])
+
+    def test_invalid_input_fail_non_square(self):
+        # Invalid Cholesky input test with non-square matrix as input.
+        A_val = np.random.normal(size=(3, 2)).astype("float64")
+        fn = self.get_gpu_cholesky_func(True, False)
+        self.assertRaises(ValueError, fn, A_val)
+
+    def test_invalid_input_fail_vector(self):
+        # Invalid Cholesky input test with vector as input.
+        def invalid_input_func():
+            A = theano.tensor.vector("A", dtype="float64")
+            GpuCholesky(lower=True, inplace=False)(A)
+        self.assertRaises(AssertionError, invalid_input_func)
+
+    def test_invalid_input_fail_tensor3(self):
+        # Invalid Cholesky input test with 3D tensor as input.
+        def invalid_input_func():
+            A = theano.tensor.tensor3("A", dtype="float64")
+            GpuCholesky(lower=True, inplace=False)(A)
+        self.assertRaises(AssertionError, invalid_input_func)
+
+    @utt.assertFailure_fast
+    def test_diag_chol(self):
+        # Diagonal matrix input Cholesky test.
+        for lower in [True, False]:
+            for inplace in [True, False]:
+                # make sure all diagonal elements are positive so the
+                # matrix is positive definite
+                A_val = np.diag(np.random.uniform(size=5).astype("float64") + 1)
+                self.compare_gpu_cholesky_to_np(A_val, lower=lower, inplace=inplace)
+
+    @utt.assertFailure_fast
+    def test_dense_chol_lower(self):
+        # Dense matrix input lower-triangular Cholesky test.
+        for lower in [True, False]:
+            for inplace in [True, False]:
+                M_val = np.random.normal(size=(3, 3)).astype("float64")
+                # A = M.dot(M.T) will be positive definite for all non-singular M
+                A_val = M_val.dot(M_val.T)
+                self.compare_gpu_cholesky_to_np(A_val, lower=lower, inplace=inplace)
+
+    def test_invalid_input_fail_non_symmetric(self):
+        # Invalid Cholesky input test with non-symmetric input.
+        # (Non-symmetric real input must also be non-positive definite).
+        A_val = None
+        while True:
+            A_val = np.random.normal(size=(3, 3)).astype("float64")
+            if not np.allclose(A_val, A_val.T):
+                break
+        fn = self.get_gpu_cholesky_func(True, False)
+        self.assertRaises(LinAlgError, fn, A_val)
+
+    def test_invalid_input_fail_negative_definite(self):
+        # Invalid Cholesky input test with negative-definite input.
+        M_val = np.random.normal(size=(3, 3)).astype("float64")
+        # A = -M.dot(M.T) will be negative definite for all non-singular M
+        A_val = -M_val.dot(M_val.T)
+        fn = self.get_gpu_cholesky_func(True, False)
+        self.assertRaises(LinAlgError, fn, A_val)
+
+
 class TestMagma(unittest.TestCase):
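TestGpuCholesky64 duplicates TestGpuCholesky with the dtype switched to float64. A hedged sketch of how the duplication could be avoided by parametrizing the dtype instead (illustrative only; the helper name is hypothetical and not part of the commit):

import numpy as np

def make_spd(n, dtype):
    # symmetric positive-definite test matrix of the requested dtype
    M = np.random.normal(size=(n, n)).astype(dtype)
    return M.dot(M.T) + n * np.eye(n, dtype=dtype)

for dtype in ('float32', 'float64'):
    A = make_spd(3, dtype)
    L = np.linalg.cholesky(A)      # CPU reference used by both test classes
    assert L.dtype == np.dtype(dtype)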
...