Commit 276b9e51 authored by Frédéric Bastien, committed by GitHub

Merge pull request #5320 from tfjgeorge/cusolver-ccw

Gpu op for solve using cusolver (CCW #5191)
from __future__ import absolute_import, division, print_function
import pkg_resources
import theano
from theano import Op
from theano.gpuarray import basic_ops, GpuArrayType
try:
from pygpu import gpuarray
except ImportError:
pass
cusolver_available = False
try:
from skcuda import cusolver
cusolver_available = True
except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound):
pass
cusolver_handle = None
class GpuCusolverSolve(Op):
    """
    CUSOLVER GPU solver OP.

    Solves the linear system ``op(A) x = b`` on the GPU, where ``op`` is
    either the identity or the (conjugate) transpose depending on
    `trans`.  Uses cuSOLVER's LU factorization (``getrf``) followed by
    the triangular solve (``getrs``).  Only float32 matrices are
    supported (asserted in ``make_node``).

    Parameters
    ----------
    trans
        Whether to take the transpose of the input matrix or not.
    """

    # NOTE(review): `inplace` affects behavior (it sets destroy_map) but
    # is not part of __props__, so ops differing only in `inplace`
    # compare equal -- confirm this is intended before relying on merge.
    __props__ = ('trans',)

    def __init__(self, trans='N', inplace=False):
        self.trans = trans
        self.inplace = inplace
        if self.inplace:
            # The output may clobber both inputs: A is factorized in
            # place and b is overwritten with the solution.
            self.destroy_map = {0: [0, 1]}
        super(GpuCusolverSolve, self).__init__()

    def make_node(self, inp1, inp2):
        """Build the Apply node for ``solve(inp1, inp2)`` on the GPU."""
        self.context = basic_ops.infer_context_name(inp1, inp2)

        inp1 = basic_ops.as_gpuarray_variable(inp1, self.context)
        inp2 = basic_ops.as_gpuarray_variable(inp2, self.context)

        inp1 = basic_ops.gpu_contiguous(inp1)
        inp2 = basic_ops.gpu_contiguous(inp2)

        # this op can only operate on float32 matrices
        assert inp1.ndim == 2
        assert inp2.ndim == 2
        assert inp1.dtype == 'float32'
        assert inp2.dtype == 'float32'

        return theano.Apply(
            self, [inp1, inp2],
            [GpuArrayType('float32',
                          broadcastable=inp1.broadcastable,
                          context_name=self.context)()])

    def make_thunk(self,
                   node,
                   storage_map, _,
                   no_recycling=[],
                   impl=None):
        # Fail at compile time (not import time) when the optional
        # scikit-cuda cusolver bindings could not be loaded.
        if not cusolver_available:
            raise RuntimeError('CUSOLVER is not available and '
                               'GpuCusolverSolve Op can not be constructed.')

        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

        # One cusolver handle is lazily created and shared by every
        # instance of this op in the process.
        global cusolver_handle
        if cusolver_handle is None:
            cusolver_handle = cusolver.cusolverDnCreate()

        def thunk():
            context = inputs[0][0].context

            # Output storage cell.
            z = outputs[0]

            # Matrix.
            A = inputs[0][0]

            # Right-hand-side vectors.
            b = inputs[1][0]

            assert(len(A.shape) == 2)
            assert(len(b.shape) == 2)

            if self.trans in ['T', 'C']:
                trans = 1
                l, n = A.shape
                k, m = b.shape
            elif self.trans == 'N':
                trans = 0
                n, l = A.shape
                k, m = b.shape
            else:
                raise ValueError('Invalid value for trans')
            if l != n:
                raise ValueError('A must be a square matrix')
            if n != k:
                raise ValueError('A and b must be aligned.')

            lda = max(1, n)
            ldb = max(1, k, m)

            # We copy A and b as cusolver operates inplace
            b = gpuarray.array(b, copy=True, order='F')
            if not self.inplace:
                A = gpuarray.array(A, copy=True)
            A_ptr = A.gpudata
            b_ptr = b.gpudata

            # cusolver expects a F ordered matrix, but A is not explicitly
            # converted between C and F order, instead we switch the
            # "transpose" flag.
            if A.flags['C_CONTIGUOUS']:
                trans = 1 - trans

            workspace_size = cusolver.cusolverDnSgetrf_bufferSize(
                cusolver_handle, n, n, A_ptr, lda)

            # Workspace, pivot and devInfo buffers are cached on the
            # thunk and reused across calls; reallocate only on resize.
            if (thunk.workspace is None or
                    thunk.workspace.size != workspace_size):
                thunk.workspace = gpuarray.zeros((workspace_size,),
                                                 dtype='float32',
                                                 context=context)

            # getrf writes the pivot indices as 32-bit integers;
            # allocating the buffer as float32 only worked by accident
            # of the matching element size.
            if thunk.pivots is None or thunk.pivots.size != n:
                thunk.pivots = gpuarray.zeros((n,),
                                              dtype='int32',
                                              context=context)

            # devInfo is a single int status value written by cusolver.
            if thunk.dev_info is None:
                thunk.dev_info = gpuarray.zeros((1,),
                                                dtype='int32',
                                                context=context)

            workspace_ptr = thunk.workspace.gpudata
            pivots_ptr = thunk.pivots.gpudata
            dev_info_ptr = thunk.dev_info.gpudata

            # LU factorization of A, then solve with the factors; b is
            # overwritten with the solution.
            cusolver.cusolverDnSgetrf(
                cusolver_handle, n, n, A_ptr, lda, workspace_ptr,
                pivots_ptr, dev_info_ptr)

            cusolver.cusolverDnSgetrs(
                cusolver_handle, trans, n, m, A_ptr, lda,
                pivots_ptr, b_ptr, ldb, dev_info_ptr)

            z[0] = b

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False

        thunk.workspace = None
        thunk.pivots = None
        thunk.dev_info = None

        return thunk
def gpu_solve(A, b, trans='N'):
    """Solve ``op(A) x = b`` on the GPU with cuSOLVER.

    Thin convenience wrapper building a :class:`GpuCusolverSolve` op
    and applying it to the two inputs.
    """
    solve_op = GpuCusolverSolve(trans=trans)
    return solve_op(A, b)
......@@ -32,6 +32,7 @@ from theano.tensor.nnet.abstract_conv import (BaseAbstractConv,
AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs)
import theano.tensor.signal.pool as pool
import theano.tensor.slinalg as slinalg
from theano.tests.breakpoint import PdbBreakpoint
......@@ -69,6 +70,7 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedIncSubtensor1_dev20)
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
from .reduction import GpuMaxAndArgmax
from .linalg import (GpuCusolverSolve, cusolver_available)
_logger = logging.getLogger("theano.gpuarray.opt")
......@@ -1911,6 +1913,16 @@ def _scan_type_infer(node):
def local_gpu_maxandargmax(op, context_name, inputs, outputs):
return GpuMaxAndArgmax(op.get_params(None))
# solve
@register_opt('fast_compile')
@op_lifter([slinalg.Solve])
@register_opt2([theano.tensor.slinalg.Solve], 'fast_compile')
def local_gpu_solve(op, context_name, inputs, outputs):
    """Lift a CPU ``slinalg.Solve`` node to ``GpuCusolverSolve``.

    Returns None (no lifting) when cuSOLVER is unavailable or when the
    inputs are not float32, the only dtype the GPU op supports.
    """
    if not cusolver_available:
        return
    # GpuCusolverSolve.make_node asserts float32 inputs; bailing out
    # here avoids aborting the optimization pass on other dtypes.
    if inputs[0].dtype != 'float32' or inputs[1].dtype != 'float32':
        return
    return GpuCusolverSolve()
# Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled.
optdb.register('gpua_scanOp_make_inplace',
......
from __future__ import absolute_import, division, print_function
import unittest
import numpy
import theano
from theano.tests import unittest_tools as utt
from .config import mode_with_gpu
# Skip tests if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
from theano.gpuarray.linalg import (cusolver_available, gpu_solve)
# Skip this whole test module when the optional scikit-cuda cusolver
# bindings could not be imported -- every test below needs them.
if not cusolver_available:
    raise SkipTest('Optional package scikits.cuda.cusolver not available')
class TestCusolver(unittest.TestCase):
    """Functional tests for the cuSOLVER-backed GPU solve."""

    def run_gpu_solve(self, A_val, x_val):
        # Construct right-hand sides whose exact solution is x_val for
        # both A x = b and A^T x = b.
        rhs = numpy.dot(A_val, x_val)
        rhs_trans = numpy.dot(A_val.T, x_val)

        mat = theano.tensor.matrix("A", dtype="float32")
        vec = theano.tensor.matrix("b", dtype="float32")
        vec_trans = theano.tensor.matrix("b", dtype="float32")

        solution = gpu_solve(mat, vec)
        solution_trans = gpu_solve(mat, vec_trans, trans='T')

        fn = theano.function([mat, vec, vec_trans],
                             [solution, solution_trans],
                             mode=mode_with_gpu)
        got, got_trans = fn(A_val, rhs, rhs_trans)

        utt.assert_allclose(numpy.array(got), x_val)
        utt.assert_allclose(numpy.array(got_trans), x_val)

    def test_diag_solve(self):
        # Simple well-conditioned diagonal system.
        numpy.random.seed(1)
        A_val = numpy.asarray([[2, 0, 0], [0, 1, 0], [0, 0, 1]],
                              dtype="float32")
        x_val = numpy.random.uniform(
            -0.4, 0.4, (A_val.shape[1], 1)).astype("float32")
        self.run_gpu_solve(A_val, x_val)

    def test_sym_solve(self):
        # Symmetric matrix built by symmetrizing a random one.
        numpy.random.seed(1)
        base = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        A_sym = (base + base.T) / 2.0
        x_val = numpy.random.uniform(
            -0.4, 0.4, (base.shape[1], 1)).astype("float32")
        self.run_gpu_solve(A_sym, x_val)

    def test_orth_solve(self):
        # Orthogonal matrix taken from the SVD of a random one.
        numpy.random.seed(1)
        base = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        A_orth = numpy.linalg.svd(base)[0]
        x_val = numpy.random.uniform(
            -0.4, 0.4, (A_orth.shape[1], 1)).astype("float32")
        self.run_gpu_solve(A_orth, x_val)

    def test_uni_rand_solve(self):
        # General random system with several right-hand sides.
        numpy.random.seed(1)
        A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        x_val = numpy.random.uniform(
            -0.4, 0.4, (A_val.shape[1], 4)).astype("float32")
        self.run_gpu_solve(A_val, x_val)
......@@ -4,6 +4,7 @@ from nose.tools import assert_raises
import theano
from theano import tensor
import theano.tensor.slinalg as slinalg
from theano.tests.breakpoint import PdbBreakpoint
from theano.tests import unittest_tools as utt, test_ifelse
from theano.tensor.tests import test_basic
......@@ -16,6 +17,7 @@ from ..basic_ops import (
from ..blas import GpuGemm
from ..elemwise import GpuCAReduceCuda, GpuCAReduceCPY, GpuElemwise
from ..subtensor import GpuSubtensor
from ..linalg import GpuCusolverSolve
from .config import mode_with_gpu, test_ctx_name
......@@ -496,3 +498,18 @@ def test_no_complex():
stft_out = tensor.exp(width_var * freq_var) * signal_var
theano.function([width_var, freq_var, signal_var], stft_out,
mode=mode_with_gpu)
def test_local_lift_solve():
    """Check that ``slinalg.Solve`` is lifted to ``GpuCusolverSolve``."""
    A = tensor.fmatrix()
    b = tensor.fmatrix()
    o = slinalg.solve(A, b)

    f_cpu = theano.function([A, b], o)
    f_gpu = theano.function([A, b], o, mode=mode_with_gpu)

    # The CPU Solve op must be gone from the GPU graph, replaced by
    # the cuSOLVER-backed op.
    gpu_nodes = f_gpu.maker.fgraph.apply_nodes
    assert not any(isinstance(n.op, slinalg.Solve) for n in gpu_nodes)
    assert any(isinstance(n.op, GpuCusolverSolve) for n in gpu_nodes)

    # Both implementations must agree numerically.
    A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
    b_val = numpy.random.uniform(-0.4, 0.4, (5, 3)).astype("float32")
    utt.assert_allclose(f_cpu(A_val, b_val), f_gpu(A_val, b_val))
......@@ -49,7 +49,6 @@ from theano.sandbox.cuda.blas import (
GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights)
from theano.sandbox.cuda.blas import gpu_gemv_inplace
from theano.sandbox.cuda.cula import gpu_solve, cula_available
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace
......@@ -83,7 +82,6 @@ from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.blas import _is_real_vector, _is_real_matrix
from theano.tensor import nlinalg
from theano.tensor import slinalg
from theano.tensor.nnet.Conv3D import Conv3D
from theano.tests.breakpoint import PdbBreakpoint
......@@ -700,38 +698,6 @@ def local_gpu_dot22scalar(node):
return False
@register_opt()
@local_optimizer([gpu_from_host, slinalg.Solve])
def local_gpu_solve(node):
    """
    Move a CPU Solve to the GPU (CULA-based), in either direction:

    gpu_from_host(CpuSolve) -> GpuSolve(gpu_from_host)
    CpuSolve(host_from_gpu) -> host_from_gpu(GpuSolve)
    """
    # The CULA-backed gpu_solve is only usable when the optional CULA
    # bindings loaded successfully.
    if not cula_available:
        return

    # Only float32 outputs are handled by the GPU solve.
    if node.outputs[0].dtype != 'float32':
        return

    # Case 1: a transfer-to-GPU of a CPU Solve result -- replace with a
    # GPU solve fed by GPU transfers of the Solve's inputs.
    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if (host_input.owner and
            isinstance(host_input.owner.op,
                       slinalg.Solve)):
            x, y = host_input.owner.inputs
            return [gpu_solve(as_cuda_ndarray_variable(x),
                              as_cuda_ndarray_variable(y))]

    # Case 2: a CPU Solve whose inputs already live on the GPU --
    # solve on the GPU and transfer the result back to the host.
    if isinstance(node.op, slinalg.Solve):
        if any([i.owner and isinstance(i.owner.op, HostFromGpu)
                for i in node.inputs]):
            x, y = node.inputs
            return [host_from_gpu(
                gpu_solve(as_cuda_ndarray_variable(x),
                          as_cuda_ndarray_variable(y)))]
    # No match: tell the optimizer this rewrite does not apply.
    return False
@register_opt()
@local_optimizer([gpu_from_host, tensor.blas_c.CGemv, tensor.blas.Gemv])
def local_gpu_gemv(node):
......
Markdown format
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to comment