Commit ab5904a4 authored by Thomas George

Added an optimization for inplace gpu_solve and tests

Parent b96a3223
@@ -60,6 +60,11 @@ def attach_cusolver_handle_to_context(ctx):
     with ctx:
         ctx.cusolver_handle = cusolver.cusolverDnCreate()
+
+# it is a subset of all cases available in slinalg's MATRIX_STRUCTURE
+MATRIX_STRUCTURES_SOLVE = (
+    'general',
+    'symmetric')

 class GpuCusolverSolve(Op):
     """
@@ -79,7 +84,8 @@ class GpuCusolverSolve(Op):
         self.inplace = inplace
         self.A_structure = A_structure
         if self.inplace:
-            self.destroy_map = {0: [0, 1]}
+            self.destroy_map = {0: [0]}
+        assert A_structure in MATRIX_STRUCTURES_SOLVE
         super(GpuCusolverSolve, self).__init__()

     def make_node(self, inp1, inp2):
...
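Note (not part of the commit): destroy_map = {0: [0]} declares that output 0 may reuse and overwrite the memory of input 0 only, rather than of both inputs as the old {0: [0, 1]} claimed. Below is a minimal, hypothetical sketch of how such a declaration looks on a Theano Op; it is a toy op for illustration, not the cuSolver code.

# Minimal sketch (hypothetical toy op, not part of this commit) of how
# destroy_map declares in-place behaviour: when inplace=True, output 0 is
# allowed to overwrite (destroy) input 0.
import theano.tensor as T
from theano.gof import Op, Apply


class AddOneInplace(Op):
    __props__ = ('inplace',)

    def __init__(self, inplace=False):
        self.inplace = inplace
        if self.inplace:
            # output 0 destroys (reuses the memory of) input 0
            self.destroy_map = {0: [0]}

    def make_node(self, x):
        x = T.as_tensor_variable(x)
        return Apply(self, [x], [x.type()])

    def perform(self, node, inputs, output_storage):
        x, = inputs
        if not self.inplace:
            x = x.copy()
        x += 1
        output_storage[0][0] = x

Theano's destroy handler uses this declaration to reject the in-place variant whenever the destroyed input is still needed elsewhere in the graph, which is exactly what the new tests below exercise.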
@@ -70,7 +70,8 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
                         GpuAdvancedIncSubtensor1_dev20)
 from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
 from .reduction import GpuMaxAndArgmax
-from .linalg import (GpuCusolverSolve, GpuCholesky, cusolver_available)
+from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky,
+                     cusolver_available)

 _logger = logging.getLogger("theano.gpuarray.opt")
@@ -1974,7 +1975,17 @@ def local_gpu_maxandargmax(op, context_name, inputs, outputs):
 def local_gpu_solve(op, context_name, inputs, outputs):
     if not cusolver_available:
         return
-    return GpuCusolverSolve()
+    if op.A_structure not in MATRIX_STRUCTURES_SOLVE:
+        return
+    return GpuCusolverSolve(A_structure=op.A_structure)
+
+
+@register_inplace()
+@local_optimizer([GpuCusolverSolve], inplace=True)
+def local_inplace_gpu_solve(node):
+    if isinstance(node.op, GpuCusolverSolve) and not node.op.inplace:
+        return [GpuCusolverSolve(A_structure=node.op.A_structure, trans=node.op.trans,
+                                 inplace=True)(*node.inputs)]

 # Cholesky decomposition
...
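Note (not part of the commit): with this change, local_gpu_solve only lifts Solve ops whose A_structure is 'general' or 'symmetric', and a separate inplace pass swaps in the destructive variant afterwards. An illustrative usage sketch follows; it assumes libgpuarray/cuSolver are installed and a GPU context is configured, and builds a GPU-enabled mode the same way Theano's gpuarray tests do.

# Illustrative only (not from the commit); assumes a working GPU context.
import theano
import theano.tensor as T
from theano.tensor import slinalg

# Mirrors the mode_with_gpu helper used by Theano's gpuarray test suite.
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray')

A = T.fmatrix('A')
b = T.fmatrix('b')

# 'symmetric' is in MATRIX_STRUCTURES_SOLVE, so this Solve is lifted to
# GpuCusolverSolve by local_gpu_solve.
f = theano.function([A, b], slinalg.Solve(A_structure='symmetric')(A, b),
                    mode=mode_with_gpu)

# 'lower_triangular' is not in MATRIX_STRUCTURES_SOLVE, so local_gpu_solve
# returns None and the CPU Solve op is kept.
g = theano.function([A, b], slinalg.Solve(A_structure='lower_triangular')(A, b),
                    mode=mode_with_gpu)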
@@ -587,13 +587,30 @@ def test_local_lift_solve():
     f_gpu = theano.function([A, b], o, mode=mode_with_gpu)
     assert not any(isinstance(n.op, slinalg.Solve)
                    for n in f_gpu.maker.fgraph.apply_nodes)
-    assert any(isinstance(n.op, GpuCusolverSolve)
+    assert any(isinstance(n.op, GpuCusolverSolve) and n.op.inplace
               for n in f_gpu.maker.fgraph.apply_nodes)
     A_val = np.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
     b_val = np.random.uniform(-0.4, 0.4, (5, 3)).astype("float32")
     utt.assert_allclose(f_cpu(A_val, b_val), f_gpu(A_val, b_val))
+
+
+def test_gpu_solve_not_inplace():
+    if not cusolver_available:
+        raise SkipTest('No cuSolver')
+    A = tensor.fmatrix()
+    b = tensor.fmatrix()
+    s = slinalg.solve(A, b)
+    o = tensor.dot(A, s)
+    f_cpu = theano.function([A, b], o, mode_without_gpu)
+    f_gpu = theano.function([A, b], o, mode=mode_with_gpu)
+    count_not_inplace = len([n.op for n in f_gpu.maker.fgraph.apply_nodes
+                             if isinstance(n.op, GpuCusolverSolve) and not n.op.inplace])
+    assert count_not_inplace == 1, count_not_inplace
+    A_val = np.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
+    b_val = np.random.uniform(-0.4, 0.4, (5, 3)).astype("float32")
+    utt.assert_allclose(f_cpu(A_val, b_val), f_gpu(A_val, b_val))


 def test_local_lift_cholesky():
     if not cusolver_available:
         raise SkipTest('No cuSolver')
...
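Note (not part of the commit): test_gpu_solve_not_inplace builds o = tensor.dot(A, s), so A is still needed after the solve; because the in-place op declares destroy_map = {0: [0]} (it may overwrite A), the inplace swap is rejected for that node and exactly one non-inplace GpuCusolverSolve remains. A standalone, NumPy-only sanity check mirroring the tests' numeric assertion is sketched below (no GPU needed; tolerances are illustrative).

# Standalone sanity check, mirroring the tests' numeric comparison:
# dot(A, solve(A, b)) should recover b for a well-conditioned A.
import numpy as np

rng = np.random.RandomState(0)
A_val = rng.uniform(-0.4, 0.4, (5, 5)).astype("float32")
b_val = rng.uniform(-0.4, 0.4, (5, 3)).astype("float32")
x = np.linalg.solve(A_val, b_val)
np.testing.assert_allclose(np.dot(A_val, x), b_val, rtol=1e-3, atol=1e-4)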