提交 e972e956 authored 作者: Pascal Lamblin's avatar Pascal Lamblin

Merge pull request #2306 from caglar/theano_solve

Theano solve GPU Cula Op
import theano
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable
from theano.sandbox.cuda import cuda_ndarray
dimshuffle = cuda_ndarray.cuda_ndarray.dimshuffle
cula_available = False
try:
from scikits.cuda import cula
cula_available = True
except (ImportError, OSError):
pass
cula_initialized = False
class GpuSolve(GpuOp):
"""
CULA GPU solver OP.
:param trans: Whether to take the transpose of the input matrix
or not.
"""
__props__ = ('trans',)
def __init__(self, trans='N'):
self.trans = trans
super(GpuSolve, self).__init__()
def output_type(self, inp):
return CudaNdarrayType(broadcastable=[False] * inp.type.ndim)
def make_node(self, inp1, inp2):
inp1 = as_cuda_ndarray_variable(inp1)
inp2 = as_cuda_ndarray_variable(inp2)
assert inp1.ndim == 2
assert inp2.ndim == 2
return theano.Apply(self, [inp1, inp2], [self.output_type(inp1)()])
def make_thunk(self,
node,
storage_map, _,
no_recycling=[]):
# Initialize CULA the first time it is needed
global cula_initialized
if cula_available and cula and not cula_initialized:
cula.culaInitialize()
cula_initialized = True
inputs = [storage_map[v] for v in node.inputs]
outputs = [storage_map[v] for v in node.outputs]
def thunk():
# size of the matrices to invert
z = outputs[0]
# Matrix
A = inputs[0][0]
# Solution vectors
b = inputs[1][0]
# A is not explicitly converted between C and F order, instead we
# switch the "transpose" flag
if self.trans in ('T', 'C'):
trans = 'N'
else:
trans = 'T'
# Convert b to F-order from c-order.
b_cpy = dimshuffle(b, (1, 0)).reshape((b.shape[0], b.shape[1]))
# This copy forces allocation of a new C-contiguous buffer
# and returns it.
A_cpy = A.copy()
b_cpy = b_cpy.copy()
def cula_gpu_solve(A_, b_, trans='T'):
A_shape = A_.shape
b_shape = b_.shape
assert(len(A_shape) == 2)
assert(len(b_shape) == 2)
if trans in ['T', 'C']:
l, n = A_shape
k, m = b_shape
if n != k:
raise ValueError('A and b must be aligned.')
elif trans in ['N']:
n, l = A_shape
k, m = b_shape
if l != m:
raise ValueError('A and b must be aligned.')
else:
raise ValueError('Invalid value for trans')
lda = max(1, n)
ldb = max(1, n, l)
# construct pointer arrays needed for culaDeviceSgels
# Cula requires you to pass a pointer for A and b.
A_ptr = A_.gpudata
b_ptr = b_.gpudata
cula.culaDeviceSgels(trans, n, l, m, A_ptr, lda, b_ptr, ldb)
return A_, b_
A_pycuda, b_pycuda = cula_gpu_solve(A_cpy, b_cpy, trans)
# Convert b to F-order from c-order and assign it to output:
b_cpy = b_cpy.reshape(b.shape[::-1])
b_cpy = dimshuffle(b_cpy, (1, 0))
z[0] = b_cpy
thunk.inputs = inputs
thunk.outputs = outputs
thunk.lazy = False
return thunk
gpu_solve = GpuSolve()
...@@ -25,21 +25,27 @@ from theano.sandbox.cuda.basic_ops import ( ...@@ -25,21 +25,27 @@ from theano.sandbox.cuda.basic_ops import (
GpuSubtensor, GpuAdvancedSubtensor1, GpuSubtensor, GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20, GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20,
GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit) GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit)
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar, from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar,
gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv, gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv,
GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights, GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights,
GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights) GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights)
from theano.sandbox.cuda.blas import gpu_gemv_inplace from theano.sandbox.cuda.blas import gpu_gemv_inplace
from theano.sandbox.cuda.cula import gpu_solve
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace from theano.sandbox.cuda.blas import gpu_gemv_no_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace from theano.sandbox.cuda.blas import gpu_ger_inplace
from theano.sandbox.cuda.blas import gpu_ger_no_inplace from theano.sandbox.cuda.blas import gpu_ger_no_inplace
from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax, from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax,
GpuDownsampleFactorMaxGrad, GpuDownsampleFactorMaxGradGrad) GpuDownsampleFactorMaxGrad, GpuDownsampleFactorMaxGradGrad)
from theano.sandbox.cuda.nnet import ( from theano.sandbox.cuda.nnet import (
GpuCrossentropySoftmaxArgmax1HotWithBias, GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx, GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmax, GpuSoftmaxWithBias) GpuSoftmax, GpuSoftmaxWithBias)
from theano.sandbox.cuda.elemwise import SupportCodeError from theano.sandbox.cuda.elemwise import SupportCodeError
from theano.scalar.basic_scipy import Erfinv from theano.scalar.basic_scipy import Erfinv
from theano.sandbox.cuda.elemwise import erfinv_gpu from theano.sandbox.cuda.elemwise import erfinv_gpu
...@@ -47,7 +53,10 @@ from theano.sandbox.cuda.var import CudaNdarrayConstant ...@@ -47,7 +53,10 @@ from theano.sandbox.cuda.var import CudaNdarrayConstant
from theano.sandbox.cuda import gpu_optimizer, register_opt, gpu_seqopt, GpuOp from theano.sandbox.cuda import gpu_optimizer, register_opt, gpu_seqopt, GpuOp
from theano.scan_module import scan_utils, scan_op, scan_opt from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.blas import _is_real_vector, _is_real_matrix from theano.tensor.blas import _is_real_vector, _is_real_matrix
from theano.tensor import nlinalg from theano.tensor import nlinalg
from theano.tensor import slinalg
from theano.tensor.nnet.Conv3D import Conv3D from theano.tensor.nnet.Conv3D import Conv3D
try: try:
...@@ -540,6 +549,31 @@ def local_gpu_dot22scalar(node): ...@@ -540,6 +549,31 @@ def local_gpu_dot22scalar(node):
return False return False
@register_opt()
@local_optimizer([gpu_from_host, slinalg.Solve])
def local_gpu_solve(node):
"""
gpu_from_host(CpuSolve) -> GpuSolve(gpu_from_host)
CpuSolve(host_from_gpu) -> host_from_gpu(GpuSolve)
"""
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
if (host_input.owner and
isinstance(host_input.owner.op,
slinalg.Solve)):
x, y = host_input.owner.inputs
return [gpu_solve(gpu_from_host(x), gpu_from_host(y))]
if isinstance(node.op, slinalg.Solve):
if any([i.owner and isinstance(i.owner.op, HostFromGpu)
for i in node.inputs]):
x, y = node.inputs
return [host_from_gpu(
gpu_solve(gpu_from_host(x),
gpu_from_host(y)))]
return False
@register_opt() @register_opt()
@local_optimizer([gpu_from_host, tensor.blas_c.CGemv, tensor.blas.Gemv]) @local_optimizer([gpu_from_host, tensor.blas_c.CGemv, tensor.blas.Gemv])
def local_gpu_gemv(node): def local_gpu_gemv(node):
......
import unittest
import numpy
import theano
from theano.tests import unittest_tools as utt
# Skip tests if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray
if not cuda_ndarray.cuda_available:
raise SkipTest('Optional package cuda not available')
from theano.misc.pycuda_init import pycuda_available
from theano.sandbox.cuda.cula import cula_available
if not pycuda_available:
raise SkipTest('Optional package pycuda not available')
if not cula_available:
raise SkipTest('Optional package scikits.cuda.cula not available')
from theano.sandbox.cuda import cula
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
class TestCula(unittest.TestCase):
def run_gpu_solve(self, A_val, x_val):
b_val = numpy.dot(A_val, x_val)
A = theano.tensor.matrix("A", dtype="float32")
b = theano.tensor.matrix("b", dtype="float32")
solver = cula.gpu_solve(A, b)
fn = theano.function([A, b], [solver])
res = fn(A_val, b_val)
x_res = numpy.array(res[0])
utt.assert_allclose(x_res, x_val)
def test_diag_solve(self):
numpy.random.seed(1)
A_val = numpy.asarray([[2, 0, 0], [0, 1, 0], [0, 0, 1]],
dtype="float32")
x_val = numpy.random.uniform(-0.4, 0.4, (A_val.shape[1],
1)).astype("float32")
self.run_gpu_solve(A_val, x_val)
def test_sym_solve(self):
numpy.random.seed(1)
A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
A_sym = (A_val + A_val.T) / 2.0
x_val = numpy.random.uniform(-0.4, 0.4, (A_val.shape[1],
1)).astype("float32")
self.run_gpu_solve(A_sym, x_val)
def test_orth_solve(self):
numpy.random.seed(1)
A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
A_orth = numpy.linalg.svd(A_val)[0]
x_val = numpy.random.uniform(-0.4, 0.4, (A_orth.shape[1],
1)).astype("float32")
self.run_gpu_solve(A_orth, x_val)
def test_uni_rand_solve(self):
numpy.random.seed(1)
A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
x_val = numpy.random.uniform(-0.4, 0.4,
(A_val.shape[1], 4)).astype("float32")
self.run_gpu_solve(A_val, x_val)
...@@ -536,12 +536,38 @@ def test_erfinvgpu(): ...@@ -536,12 +536,38 @@ def test_erfinvgpu():
assert numpy.allclose(f(xv), f2(xv)) assert numpy.allclose(f(xv), f2(xv))
def test_local_gpu_solve():
numpy.random.seed(1)
def cmp(a_shp, b_shp):
a0 = numpy.random.uniform(-0.4, 0.4,
a_shp).astype('float32')
a = cuda.shared_constructor(a0, 'a')
b0 = numpy.random.uniform(-0.4, 0.4,
b_shp).astype('float32')
b = cuda.shared_constructor(b0, 'b')
f = pfunc([], tensor.slinalg.solve(a, b), mode=mode_with_gpu)
assert isinstance(f.maker.fgraph.toposort()[1].inputs[0].owner.op,
cuda.cula.GpuSolve)
assert cuda.opt.local_gpu_solve.transform(
tensor.slinalg.solve(a, b).owner)
out = f()
assert numpy.allclose(numpy.dot(a0, out), b0)
cmp((6, 6), (6, 1))
cmp((5, 5), (5, 1))
def test_local_gpu_dot_to_dot22dot(): def test_local_gpu_dot_to_dot22dot():
def cmp(a_shp, b_shp): def cmp(a_shp, b_shp):
a0 = numpy.random.rand(*a_shp).astype('float32') a0 = numpy.random.rand(*a_shp).astype('float32')
a = cuda.shared_constructor(a0, 'a') a = cuda.shared_constructor(a0, 'a')
b0 = numpy.random.rand(*b_shp).astype('float32') b0 = numpy.random.rand(*b_shp).astype('float32')
b = cuda.shared_constructor(b0, 'a') b = cuda.shared_constructor(b0, 'b')
f = pfunc([], tensor.dot(a, b), mode=mode_with_gpu) f = pfunc([], tensor.dot(a, b), mode=mode_with_gpu)
assert cuda.opt.local_gpu_dot_to_dot22.transform( assert cuda.opt.local_gpu_dot_to_dot22.transform(
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论