提交 76e434ad authored 作者: abergeron's avatar abergeron 提交者: GitHub

Merge pull request #6182 from juancamilog/cublas_triangular_solve

Added GpuCublasTriangularSolve and the associated lifter optimizations
...@@ -32,6 +32,13 @@ try: ...@@ -32,6 +32,13 @@ try:
except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound): except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound):
pass pass
cublas_available = False
try:
from skcuda import cublas
cublas_available = True
except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound):
pass
if cusolver_available: if cusolver_available:
# Add cusolver call as it is missing in skcuda # Add cusolver call as it is missing in skcuda
# SPOTRS # SPOTRS
...@@ -67,10 +74,20 @@ def attach_cusolver_handle_to_context(ctx): ...@@ -67,10 +74,20 @@ def attach_cusolver_handle_to_context(ctx):
with ctx: with ctx:
ctx.cusolver_handle = cusolver.cusolverDnCreate() ctx.cusolver_handle = cusolver.cusolverDnCreate()
def attach_cublas_handle_to_context(ctx):
handle = getattr(ctx, 'cublas_handle', None)
if handle is None:
with ctx:
ctx.cublas_handle = cublas.cublasCreate()
# it is a subset of all cases available in slinalg's MATRIX_STRUCTURE # it is a subset of all cases available in slinalg's MATRIX_STRUCTURE
MATRIX_STRUCTURES_SOLVE = ( MATRIX_STRUCTURES_SOLVE = (
'general', 'general',
'symmetric') 'symmetric',
'lower_triangular',
'upper_triangular')
class GpuCusolverSolve(Op): class GpuCusolverSolve(Op):
...@@ -229,7 +246,126 @@ class GpuCusolverSolve(Op): ...@@ -229,7 +246,126 @@ class GpuCusolverSolve(Op):
z[0] = b z[0] = b
class GpuCublasTriangularSolve(Op):
"""
CUBLAS GPU Triangular Solve Op.
Parameters
----------
lower
Whether system is lower-triangular (True) or upper-triangular (False).
trans
Whether to take the transpose of the input matrix or not.
"""
__props__ = ('trans', 'lower')
def __init__(self, lower=True, trans='N'):
self.trans = trans
self.lower = lower
super(GpuCublasTriangularSolve, self).__init__()
def make_node(self, inp1, inp2):
if not cublas_available:
raise RuntimeError('CUBLAS is not available and '
'GpuCublasTriangularSolve Op can not be constructed.')
context_name = infer_context_name(inp1, inp2)
inp1 = as_gpuarray_variable(inp1, context_name)
inp2 = as_gpuarray_variable(inp2, context_name)
inp1 = gpu_contiguous(inp1)
inp2 = gpu_contiguous(inp2)
# this op can only operate on float32 matrices
assert inp1.ndim == 2
assert inp2.ndim in [1, 2]
assert inp1.dtype == 'float32'
assert inp2.dtype == 'float32'
return theano.Apply(self, [inp1, inp2],
[GpuArrayType('float32',
broadcastable=inp2.broadcastable,
context_name=context_name)()])
def prepare_node(self, node, storage_map, compute_map, impl):
ctx = node.inputs[0].type.context
attach_cublas_handle_to_context(ctx)
def perform(self, node, inputs, outputs):
ctx = node.inputs[0].type.context
# Solution set
x = outputs[0]
# Matrix.
A = inputs[0]
# right hand side
b = inputs[1]
assert(len(A.shape) == 2)
assert(len(b.shape) in [1, 2])
# implicitly deal with the difference between C order
# and fortran order by flipping the trans and lower flags
lower = not self.lower
trans = self.trans
if trans in ['T', 'C']:
trans = 'N'
l, n = A.shape
elif trans == 'N':
trans = 'T'
n, l = A.shape
else:
raise ValueError('Invalid value for trans')
if b.ndim == 2:
k, m = b.shape
else:
k, = b.shape
m = 1
if l != n:
raise ValueError('A must be a square matrix')
if n != k:
raise ValueError('A and b must be aligned.')
lda = max(1, n)
ldb = max(1, k)
# solution overwrites right hand side on exit
b = pygpu.array(b, copy=True, order='F')
A_ptr = A.gpudata
b_ptr = b.gpudata
# unit scalar used for multiplication
alpha = 1.0
# indicates matrix A is on left of B
side = 'l'
# set whether upper or lower part of matrix A stored
uplo = 'l' if lower else 'u'
# indicates elements on diagonal of matrix A may not be unity
diag = 'n'
with ctx:
if b.ndim == 1:
# matrix vector solve
cublas.cublasStrsv(ctx.cublas_handle, uplo, trans, diag, n,
A_ptr, lda, b_ptr, 1)
else:
cublas.cublasStrsm(ctx.cublas_handle, side, uplo, trans, diag,
n, m, alpha, A_ptr, lda, b_ptr, ldb)
x[0] = b
def gpu_solve(A, b, A_structure='general', trans='N'): def gpu_solve(A, b, A_structure='general', trans='N'):
if A_structure == 'lower':
return GpuCublasTriangularSolve(True, trans)(A, b)
elif A_structure == 'upper':
return GpuCublasTriangularSolve(False, trans)(A, b)
return GpuCusolverSolve(A_structure, trans)(A, b) return GpuCusolverSolve(A_structure, trans)(A, b)
......
...@@ -77,7 +77,8 @@ from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims ...@@ -77,7 +77,8 @@ from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
from .reduction import GpuMaxAndArgmax from .reduction import GpuMaxAndArgmax
from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky, from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky,
cusolver_available, GpuMagmaMatrixInverse, gpu_svd, cusolver_available, GpuMagmaMatrixInverse, gpu_svd,
GpuMagmaCholesky, gpu_qr, GpuMagmaEigh) GpuMagmaCholesky, gpu_qr, GpuMagmaEigh,
GpuCublasTriangularSolve, cublas_available)
from .neighbours import GpuImages2Neibs from .neighbours import GpuImages2Neibs
_logger = logging.getLogger("theano.gpuarray.opt") _logger = logging.getLogger("theano.gpuarray.opt")
...@@ -2122,13 +2123,21 @@ def local_gpua_images2neibs(op, context_name, inputs, outputs): ...@@ -2122,13 +2123,21 @@ def local_gpua_images2neibs(op, context_name, inputs, outputs):
@op_lifter([slinalg.Solve]) @op_lifter([slinalg.Solve])
@register_opt2([theano.tensor.slinalg.Solve], 'fast_compile') @register_opt2([theano.tensor.slinalg.Solve], 'fast_compile')
def local_gpu_solve(op, context_name, inputs, outputs): def local_gpu_solve(op, context_name, inputs, outputs):
if not cusolver_available:
return
if inputs[0].dtype not in ['float16', 'float32']: if inputs[0].dtype not in ['float16', 'float32']:
return return
if op.A_structure not in MATRIX_STRUCTURES_SOLVE: if op.A_structure not in MATRIX_STRUCTURES_SOLVE:
return return
op = GpuCusolverSolve(A_structure=op.A_structure)
if op.A_structure in ['lower_triangular', 'upper_triangular']:
if not cublas_available:
return
lower = op.A_structure == 'lower_triangular'
op = GpuCublasTriangularSolve(lower)
else:
if not cusolver_available:
return
op = GpuCusolverSolve(A_structure=op.A_structure)
if inputs[0].dtype == 'float16': if inputs[0].dtype == 'float16':
return op(inputs[0].astype('float32'), return op(inputs[0].astype('float32'),
inputs[1].astype('float32')).astype('float16') inputs[1].astype('float32')).astype('float16')
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论