Commit 276b9e51 authored by Frédéric Bastien, committed by GitHub

Merge pull request #5320 from tfjgeorge/cusolver-ccw

Gpu op for solve using cusolver (CCW #5191)
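For orientation before the diff: a minimal sketch of how the new `gpu_solve` helper introduced by this patch is meant to be called. It mirrors the tests added below; the random data is illustrative, and a configured gpuarray backend plus scikit-cuda (skcuda) are assumed.

import numpy
import theano
from theano.gpuarray.linalg import gpu_solve

# Build a symbolic float32 system A x = b (the op only supports float32).
A = theano.tensor.matrix('A', dtype='float32')
b = theano.tensor.matrix('b', dtype='float32')
fn = theano.function([A, b], gpu_solve(A, b))

A_val = numpy.random.uniform(-1, 1, (5, 5)).astype('float32')
b_val = numpy.random.uniform(-1, 1, (5, 3)).astype('float32')
x_val = fn(A_val, b_val)
# x_val should match numpy.linalg.solve(A_val, b_val) to float32 tolerance.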
from __future__ import absolute_import, division, print_function

import pkg_resources

import theano
from theano import Op
from theano.gpuarray import basic_ops, GpuArrayType

try:
    from pygpu import gpuarray
except ImportError:
    pass

cusolver_available = False
try:
    from skcuda import cusolver
    cusolver_available = True
except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound):
    pass

cusolver_handle = None


class GpuCusolverSolve(Op):
    """
    CUSOLVER GPU solver OP.

    Parameters
    ----------
    trans
        Whether to take the transpose of the input matrix or not.

    """

    # Include 'inplace' in __props__ so that inplace and non-inplace
    # instances do not compare (and get merged) as equal.
    __props__ = ('trans', 'inplace')

    def __init__(self, trans='N', inplace=False):
        self.trans = trans
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0, 1]}
        super(GpuCusolverSolve, self).__init__()

    def make_node(self, inp1, inp2):
        self.context = basic_ops.infer_context_name(inp1, inp2)
        inp1 = basic_ops.as_gpuarray_variable(inp1, self.context)
        inp2 = basic_ops.as_gpuarray_variable(inp2, self.context)
        inp1 = basic_ops.gpu_contiguous(inp1)
        inp2 = basic_ops.gpu_contiguous(inp2)
        # This op can only operate on float32 matrices.
        assert inp1.ndim == 2
        assert inp2.ndim == 2
        assert inp1.dtype == 'float32'
        assert inp2.dtype == 'float32'
        return theano.Apply(
            self, [inp1, inp2],
            [GpuArrayType('float32',
                          broadcastable=inp1.broadcastable,
                          context_name=self.context)()])

    def make_thunk(self,
                   node,
                   storage_map, _,
                   no_recycling=[],
                   impl=None):
        if not cusolver_available:
            raise RuntimeError('CUSOLVER is not available and '
                               'GpuCusolverSolve Op cannot be constructed.')
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

        global cusolver_handle
        if cusolver_handle is None:
            cusolver_handle = cusolver.cusolverDnCreate()

        def thunk():
            context = inputs[0][0].context

            # Output storage (will receive the solution).
            z = outputs[0]

            # Matrix.
            A = inputs[0][0]

            # Solution vectors.
            b = inputs[1][0]

            assert(len(A.shape) == 2)
            assert(len(b.shape) == 2)

            if self.trans in ['T', 'C']:
                trans = 1
                l, n = A.shape
                k, m = b.shape
            elif self.trans == 'N':
                trans = 0
                n, l = A.shape
                k, m = b.shape
            else:
                raise ValueError('Invalid value for trans')
            if l != n:
                raise ValueError('A must be a square matrix')
            if n != k:
                raise ValueError('A and b must be aligned.')

            lda = max(1, n)
            ldb = max(1, k, m)

            # cusolver operates in-place, so work on copies: b always
            # (it receives the solution), A unless inplace was requested.
            b = gpuarray.array(b, copy=True, order='F')
            if not self.inplace:
                A = gpuarray.array(A, copy=True)
            A_ptr = A.gpudata
            b_ptr = b.gpudata

            # cusolver expects an F-ordered matrix. Instead of explicitly
            # converting A between C and F order, we flip the "transpose"
            # flag: the C-ordered A is the transpose of its F-ordered view.
            if A.flags['C_CONTIGUOUS']:
                trans = 1 - trans

            # Query the scratch-space size needed by the LU factorization.
            workspace_size = cusolver.cusolverDnSgetrf_bufferSize(
                cusolver_handle, n, n, A_ptr, lda)

            if (thunk.workspace is None or
                    thunk.workspace.size != workspace_size):
                thunk.workspace = gpuarray.zeros((workspace_size,),
                                                 dtype='float32',
                                                 context=context)

            # getrf/getrs write integer pivot indices and an integer
            # status flag, so these buffers must be int32.
            if thunk.pivots is None or thunk.pivots.size != n:
                thunk.pivots = gpuarray.zeros((n,),
                                              dtype='int32',
                                              context=context)

            if thunk.dev_info is None:
                thunk.dev_info = gpuarray.zeros((1,),
                                                dtype='int32',
                                                context=context)

            workspace_ptr = thunk.workspace.gpudata
            pivots_ptr = thunk.pivots.gpudata
            dev_info_ptr = thunk.dev_info.gpudata

            # LU-factorize A in place (getrf: A = P * L * U).
            cusolver.cusolverDnSgetrf(
                cusolver_handle, n, n, A_ptr, lda, workspace_ptr,
                pivots_ptr, dev_info_ptr)

            # Solve the factorized system (getrs); the solution overwrites b.
            cusolver.cusolverDnSgetrs(
                cusolver_handle, trans, n, m, A_ptr, lda,
                pivots_ptr, b_ptr, ldb, dev_info_ptr)

            z[0] = b

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False
        thunk.workspace = None
        thunk.pivots = None
        thunk.dev_info = None

        return thunk


def gpu_solve(A, b, trans='N'):
    return GpuCusolverSolve(trans)(A, b)
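The thunk above is the cusolver flavour of a standard LU solve: getrf factorizes A into P, L and U, and getrs then solves the two triangular systems against b. For intuition only (not part of the patch), the same two-step pipeline on the CPU with SciPy:

import numpy
from scipy.linalg import lu_factor, lu_solve

A = numpy.random.uniform(-1, 1, (5, 5)).astype('float32')
b = numpy.random.uniform(-1, 1, (5, 3)).astype('float32')

# Factorization step (analogue of cusolverDnSgetrf):
# returns the packed LU factors and the pivot indices.
lu, piv = lu_factor(A)

# Solve step (analogue of cusolverDnSgetrs); trans=1 would solve A.T x = b.
x = lu_solve((lu, piv), b, trans=0)
assert numpy.allclose(A.dot(x), b, atol=1e-4)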
...@@ -32,6 +32,7 @@ from theano.tensor.nnet.abstract_conv import (BaseAbstractConv,
                                              AbstractConv3d_gradWeights,
                                              AbstractConv3d_gradInputs)
import theano.tensor.signal.pool as pool
import theano.tensor.slinalg as slinalg
from theano.tests.breakpoint import PdbBreakpoint
...@@ -69,6 +70,7 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
                        GpuAdvancedIncSubtensor1_dev20)
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
from .reduction import GpuMaxAndArgmax
from .linalg import (GpuCusolverSolve, cusolver_available)
_logger = logging.getLogger("theano.gpuarray.opt")
...@@ -1911,6 +1913,16 @@ def _scan_type_infer(node):
def local_gpu_maxandargmax(op, context_name, inputs, outputs):
    return GpuMaxAndArgmax(op.get_params(None))
# solve
@register_opt('fast_compile')
@op_lifter([slinalg.Solve])
@register_opt2([theano.tensor.slinalg.Solve], 'fast_compile')
def local_gpu_solve(op, context_name, inputs, outputs):
    if not cusolver_available:
        return
    return GpuCusolverSolve()
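Once this lifter is registered, compiling `theano.tensor.slinalg.solve` with the GPU enabled should leave a `GpuCusolverSolve` node where the CPU `Solve` used to be. A quick way to check by eye, assuming a GPU-enabled Theano configuration (e.g. `device=cuda`):

import theano
import theano.tensor.slinalg as slinalg

A = theano.tensor.fmatrix()
b = theano.tensor.fmatrix()
f = theano.function([A, b], slinalg.solve(A, b))
# GpuCusolverSolve should appear in place of Solve in the printout.
theano.printing.debugprint(f)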
# Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled.
optdb.register('gpua_scanOp_make_inplace',
...
from __future__ import absolute_import, division, print_function

import unittest

import numpy

import theano
from theano.tests import unittest_tools as utt

from .config import mode_with_gpu

# Skip the tests if cusolver (from scikit-cuda) is not available.
from nose.plugins.skip import SkipTest
from theano.gpuarray.linalg import (cusolver_available, gpu_solve)

if not cusolver_available:
    raise SkipTest('Optional package scikit-cuda not available')


class TestCusolver(unittest.TestCase):

    def run_gpu_solve(self, A_val, x_val):
        b_val = numpy.dot(A_val, x_val)
        b_val_trans = numpy.dot(A_val.T, x_val)
        A = theano.tensor.matrix("A", dtype="float32")
        b = theano.tensor.matrix("b", dtype="float32")
        b_trans = theano.tensor.matrix("b_trans", dtype="float32")

        solver = gpu_solve(A, b)
        solver_trans = gpu_solve(A, b_trans, trans='T')
        fn = theano.function([A, b, b_trans], [solver, solver_trans],
                             mode=mode_with_gpu)
        res = fn(A_val, b_val, b_val_trans)
        x_res = numpy.array(res[0])
        x_res_trans = numpy.array(res[1])
        utt.assert_allclose(x_res, x_val)
        utt.assert_allclose(x_res_trans, x_val)

    def test_diag_solve(self):
        numpy.random.seed(1)
        A_val = numpy.asarray([[2, 0, 0], [0, 1, 0], [0, 0, 1]],
                              dtype="float32")
        x_val = numpy.random.uniform(-0.4, 0.4,
                                     (A_val.shape[1], 1)).astype("float32")
        self.run_gpu_solve(A_val, x_val)

    def test_sym_solve(self):
        numpy.random.seed(1)
        A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        A_sym = (A_val + A_val.T) / 2.0
        x_val = numpy.random.uniform(-0.4, 0.4,
                                     (A_val.shape[1], 1)).astype("float32")
        self.run_gpu_solve(A_sym, x_val)

    def test_orth_solve(self):
        numpy.random.seed(1)
        A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        A_orth = numpy.linalg.svd(A_val)[0]
        x_val = numpy.random.uniform(-0.4, 0.4,
                                     (A_orth.shape[1], 1)).astype("float32")
        self.run_gpu_solve(A_orth, x_val)

    def test_uni_rand_solve(self):
        numpy.random.seed(1)
        A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        x_val = numpy.random.uniform(-0.4, 0.4,
                                     (A_val.shape[1], 4)).astype("float32")
        self.run_gpu_solve(A_val, x_val)
...@@ -4,6 +4,7 @@ from nose.tools import assert_raises
import theano
from theano import tensor
import theano.tensor.slinalg as slinalg
from theano.tests.breakpoint import PdbBreakpoint
from theano.tests import unittest_tools as utt, test_ifelse
from theano.tensor.tests import test_basic
...@@ -16,6 +17,7 @@ from ..basic_ops import (
from ..blas import GpuGemm
from ..elemwise import GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise
from ..subtensor import GpuSubtensor
from ..linalg import GpuCusolverSolve
from .config import mode_with_gpu, test_ctx_name
...@@ -496,3 +498,18 @@ def test_no_complex():
    stft_out = tensor.exp(width_var * freq_var) * signal_var
    theano.function([width_var, freq_var, signal_var], stft_out,
                    mode=mode_with_gpu)
def test_local_lift_solve():
    A = tensor.fmatrix()
    b = tensor.fmatrix()
    o = slinalg.solve(A, b)
    f_cpu = theano.function([A, b], o)
    f_gpu = theano.function([A, b], o, mode=mode_with_gpu)
    assert not any(isinstance(n.op, slinalg.Solve)
                   for n in f_gpu.maker.fgraph.apply_nodes)
    assert any(isinstance(n.op, GpuCusolverSolve)
               for n in f_gpu.maker.fgraph.apply_nodes)
    A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
    b_val = numpy.random.uniform(-0.4, 0.4, (5, 3)).astype("float32")
    utt.assert_allclose(f_cpu(A_val, b_val), f_gpu(A_val, b_val))
...@@ -49,7 +49,6 @@ from theano.sandbox.cuda.blas import (
    GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights)
from theano.sandbox.cuda.blas import gpu_gemv_inplace
from theano.sandbox.cuda.cula import gpu_solve, cula_available
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace
...@@ -83,7 +82,6 @@ from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.blas import _is_real_vector, _is_real_matrix
from theano.tensor import nlinalg
from theano.tensor import slinalg
from theano.tensor.nnet.Conv3D import Conv3D
from theano.tests.breakpoint import PdbBreakpoint
...@@ -700,38 +698,6 @@ def local_gpu_dot22scalar(node):
    return False
@register_opt()
@local_optimizer([gpu_from_host, slinalg.Solve])
def local_gpu_solve(node):
    """
    gpu_from_host(CpuSolve) -> GpuSolve(gpu_from_host)
    CpuSolve(host_from_gpu) -> host_from_gpu(GpuSolve)
    """
    if not cula_available:
        return
    if node.outputs[0].dtype != 'float32':
        return
    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if (host_input.owner and
                isinstance(host_input.owner.op, slinalg.Solve)):
            x, y = host_input.owner.inputs
            return [gpu_solve(as_cuda_ndarray_variable(x),
                              as_cuda_ndarray_variable(y))]
    if isinstance(node.op, slinalg.Solve):
        if any([i.owner and isinstance(i.owner.op, HostFromGpu)
                for i in node.inputs]):
            x, y = node.inputs
            return [host_from_gpu(
                gpu_solve(as_cuda_ndarray_variable(x),
                          as_cuda_ndarray_variable(y)))]
    return False
@register_opt()
@local_optimizer([gpu_from_host, tensor.blas_c.CGemv, tensor.blas.Gemv])
def local_gpu_gemv(node):
...