提交 5622fc2a 作者: Arnaud Bergeron

Fix remaining problems.

上级 86be5809
......@@ -18,8 +18,6 @@ try:
except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound):
pass
cusolver_handle = None
class GpuCusolverSolve(Op):
"""
......@@ -32,7 +30,7 @@ class GpuCusolverSolve(Op):
"""
__props__ = ('trans',)
__props__ = ('trans', 'inplace')
def __init__(self, trans='N', inplace=False):
self.trans = trans
......@@ -42,10 +40,13 @@ class GpuCusolverSolve(Op):
super(GpuCusolverSolve, self).__init__()
def make_node(self, inp1, inp2):
self.context = basic_ops.infer_context_name(inp1, inp2)
if not cusolver_available:
raise RuntimeError('CUSOLVER is not available and '
'GpuCusolverSolve Op can not be constructed.')
context_name = basic_ops.infer_context_name(inp1, inp2)
inp1 = basic_ops.as_gpuarray_variable(inp1, self.context)
inp2 = basic_ops.as_gpuarray_variable(inp2, self.context)
inp1 = basic_ops.as_gpuarray_variable(inp1, context_name)
inp2 = basic_ops.as_gpuarray_variable(inp2, context_name)
inp1 = basic_ops.gpu_contiguous(inp1)
inp2 = basic_ops.gpu_contiguous(inp2)
......@@ -62,91 +63,75 @@ class GpuCusolverSolve(Op):
broadcastable=inp1.broadcastable,
context_name=self.context)()])
def make_thunk(self,
node,
storage_map, _,
no_recycling=[],
impl=None):
if not cusolver_available:
raise RuntimeError('CUSOLVER is not available and '
'GpuCusolverSolve Op can not be constructed.')
inputs = [storage_map[v] for v in node.inputs]
outputs = [storage_map[v] for v in node.outputs]
global cusolver_handle
if cusolver_handle is None:
cusolver_handle = cusolver.cusolverDnCreate()
def thunk():
context = inputs[0][0].context
# Size of the matrices to invert.
z = outputs[0]
# Matrix.
A = inputs[0][0]
# Solution vectors.
b = inputs[1][0]
assert(len(A.shape) == 2)
assert(len(b.shape) == 2)
if self.trans in ['T', 'C']:
trans = 1
l, n = A.shape
k, m = b.shape
elif self.trans == 'N':
trans = 0
n, l = A.shape
k, m = b.shape
else:
raise ValueError('Invalid value for trans')
if l != n:
raise ValueError('A must be a square matrix')
if n != k:
raise ValueError('A and b must be aligned.')
lda = max(1, n)
ldb = max(1, k)
# We copy A and b as cusolver operates inplace
b = pygpu.array(b, copy=True, order='F')
if not self.inplace:
A = pygpu.array(A, copy=True)
A_ptr = A.gpudata
b_ptr = b.gpudata
# cusolver expects a F ordered matrix, but A is not explicitly
# converted between C and F order, instead we switch the
# "transpose" flag.
if A.flags['C_CONTIGUOUS']:
trans = 1 - trans
def prepare_node(self, node, storage_map, compute_map, impl):
ctx = node.inputs[0].type.context
handle = getattr(ctx, 'cusolver_handle', None)
if handle is None:
with ctx:
ctx.cusolver_handle = cusolver.cusolverDnCreate()
def perform(self, node, inputs, outputs):
context = inputs[0][0].context
# Size of the matrices to invert.
z = outputs[0]
# Matrix.
A = inputs[0]
# Solution vectors.
b = inputs[1]
assert(len(A.shape) == 2)
assert(len(b.shape) == 2)
if self.trans in ['T', 'C']:
trans = 1
l, n = A.shape
k, m = b.shape
elif self.trans == 'N':
trans = 0
n, l = A.shape
k, m = b.shape
else:
raise ValueError('Invalid value for trans')
if l != n:
raise ValueError('A must be a square matrix')
if n != k:
raise ValueError('A and b must be aligned.')
lda = max(1, n)
ldb = max(1, k)
# We copy A and b as cusolver operates inplace
b = pygpu.array(b, copy=True, order='F')
if not self.inplace:
A = pygpu.array(A, copy=True)
A_ptr = A.gpudata
b_ptr = b.gpudata
# cusolver expects a F ordered matrix, but A is not explicitly
# converted between C and F order, instead we switch the
# "transpose" flag.
if A.flags['C_CONTIGUOUS']:
trans = 1 - trans
with context:
workspace_size = cusolver.cusolverDnSgetrf_bufferSize(
cusolver_handle, n, n, A_ptr, lda)
if (thunk.workspace is None or
thunk.workspace.size != workspace_size):
thunk.workspace = pygpu.zeros(workspace_size,
dtype='float32',
context=context)
workspace = pygpu.zeros(workspace_size, dtype='float32',
context=context)
if thunk.pivots is None or thunk.pivots.size != min(n, n):
thunk.pivots = pygpu.zeros(n,
dtype='int32',
context=context)
pivots = pygpu.zeros(n, dtype='int32', context=context)
if thunk.dev_info is None:
thunk.dev_info = pygpu.zeros((1,),
dtype='int32',
context=context)
dev_info = pygpu.zeros((1,), dtype='int32', context=context)
workspace_ptr = thunk.workspace.gpudata
pivots_ptr = thunk.pivots.gpudata
dev_info_ptr = thunk.dev_info.gpudata
workspace_ptr = thunk.workspace.gpudata
pivots_ptr = thunk.pivots.gpudata
dev_info_ptr = thunk.dev_info.gpudata
with context:
cusolver.cusolverDnSgetrf(
cusolver_handle, n, n, A_ptr, lda, workspace_ptr,
pivots_ptr, dev_info_ptr)
......@@ -155,17 +140,7 @@ class GpuCusolverSolve(Op):
cusolver_handle, trans, n, m, A_ptr, lda,
pivots_ptr, b_ptr, ldb, dev_info_ptr)
z[0] = b
thunk.inputs = inputs
thunk.outputs = outputs
thunk.lazy = False
thunk.workspace = None
thunk.pivots = None
thunk.dev_info = None
return thunk
z[0] = b
def gpu_solve(A, b, trans='N'):
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
请注册或者登录后发表评论