Commit 375bc8c6 authored by Yaroslav Ganin, committed by Thomas George

Addressed Fred's concerns

Parent 25fd044f
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, division, print_function
import pkg_resources
import pkg_resources
import theano import theano
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import CudaNdarray, GpuOp from theano.sandbox.cuda import CudaNdarray, GpuOp
from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable
from theano.sandbox.cuda.type import CudaNdarrayType
try: try:
from theano.sandbox.cuda import cuda_ndarray from theano.sandbox.cuda import cuda_ndarray
...@@ -20,9 +21,10 @@ try: ...@@ -20,9 +21,10 @@ try:
except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound): except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound):
pass pass
cusolver_handle = [None] cusolver_handle = None
class GpuSolve(GpuOp): class GpuCusolverSolve(GpuOp):
""" """
CUSOLVER GPU solver OP. CUSOLVER GPU solver OP.
...@@ -37,10 +39,7 @@ class GpuSolve(GpuOp): ...@@ -37,10 +39,7 @@ class GpuSolve(GpuOp):
def __init__(self, trans='N'): def __init__(self, trans='N'):
self.trans = trans self.trans = trans
super(GpuSolve, self).__init__() super(GpuCusolverSolve, self).__init__()
def output_type(self, inp):
return CudaNdarrayType(broadcastable=[False] * inp.type.ndim)
def make_node(self, inp1, inp2): def make_node(self, inp1, inp2):
inp1 = as_cuda_ndarray_variable(inp1) inp1 = as_cuda_ndarray_variable(inp1)
...@@ -48,7 +47,9 @@ class GpuSolve(GpuOp): ...@@ -48,7 +47,9 @@ class GpuSolve(GpuOp):
assert inp1.ndim == 2 assert inp1.ndim == 2
assert inp2.ndim == 2 assert inp2.ndim == 2
return theano.Apply(self, [inp1, inp2], [self.output_type(inp1)()]) return theano.Apply(
self, [inp1, inp2],
[CudaNdarrayType(broadcastable=[False] * inp1.type.ndim)()])
def make_thunk(self, def make_thunk(self,
node, node,
...@@ -56,29 +57,31 @@ class GpuSolve(GpuOp): ...@@ -56,29 +57,31 @@ class GpuSolve(GpuOp):
no_recycling=[]): no_recycling=[]):
if not cusolver_available: if not cusolver_available:
raise RuntimeError('CUSOLVER is not available and ' raise RuntimeError('CUSOLVER is not available and '
'GpuSolve Op can not be constructed.') 'GpuCusolverSolve Op can not be constructed.')
inputs = [storage_map[v] for v in node.inputs] inputs = [storage_map[v] for v in node.inputs]
outputs = [storage_map[v] for v in node.outputs] outputs = [storage_map[v] for v in node.outputs]
def thunk(): def thunk():
# size of the matrices to invert global cusolver_handle
# Size of the matrices to invert.
z = outputs[0] z = outputs[0]
# Matrix # Matrix.
A = inputs[0][0] A = inputs[0][0]
# Solution vectors # Solution vectors.
b = inputs[1][0] b = inputs[1][0]
# A is not explicitly converted between C and F order, instead we # A is not explicitly converted between C and F order, instead we
# switch the "transpose" flag # switch the "transpose" flag.
if self.trans in ('T', 'C'): if self.trans in ('T', 'C'):
trans = 'N' trans = 'N'
else: else:
trans = 'T' trans = 'T'
# Convert b to F-order from c-order. # Convert b to F-order from C-order.
b_cpy = dimshuffle(b, (1, 0)).reshape((b.shape[0], b.shape[1])) b_cpy = dimshuffle(b, (1, 0)).reshape((b.shape[0], b.shape[1]))
# This copy forces allocation of a new C-contiguous buffer # This copy forces allocation of a new C-contiguous buffer
...@@ -86,23 +89,19 @@ class GpuSolve(GpuOp): ...@@ -86,23 +89,19 @@ class GpuSolve(GpuOp):
A_cpy = A.copy() A_cpy = A.copy()
b_cpy = b_cpy.copy() b_cpy = b_cpy.copy()
def cusolver_gpu_solve(A_, b_, trans='T'): assert(len(A.shape) == 2)
A_shape = A_.shape assert(len(b.shape) == 2)
b_shape = b_.shape
assert(len(A_shape) == 2)
assert(len(b_shape) == 2)
if trans in ['T', 'C']: if trans in ['T', 'C']:
trans = 1 trans = 1
l, n = A_shape l, n = A.shape
k, m = b_shape k, m = b.shape
if n != k: if n != k:
raise ValueError('A and b must be aligned.') raise ValueError('A and b must be aligned.')
elif trans in ['N']: elif trans in ['N']:
trans = 0 trans = 0
n, l = A_shape n, l = A.shape
k, m = b_shape k, m = b.shape
if l != m: if l != m:
raise ValueError('A and b must be aligned.') raise ValueError('A and b must be aligned.')
else: else:
...@@ -111,14 +110,14 @@ class GpuSolve(GpuOp): ...@@ -111,14 +110,14 @@ class GpuSolve(GpuOp):
lda = max(1, n) lda = max(1, n)
ldb = max(1, n, l) ldb = max(1, n, l)
A_ptr = A_.gpudata A_ptr = A_cpy.gpudata
b_ptr = b_.gpudata b_ptr = b_cpy.gpudata
if cusolver_handle[0] is None: if cusolver_handle is None:
cusolver_handle[0] = cusolver.cusolverDnCreate() cusolver_handle = cusolver.cusolverDnCreate()
workspace_size = cusolver.cusolverDnSgetrf_bufferSize( workspace_size = cusolver.cusolverDnSgetrf_bufferSize(
cusolver_handle[0], m, n, A_ptr, lda) cusolver_handle, m, n, A_ptr, lda)
if (thunk.workspace is None or if (thunk.workspace is None or
thunk.workspace.size != workspace_size): thunk.workspace.size != workspace_size):
...@@ -135,18 +134,14 @@ class GpuSolve(GpuOp): ...@@ -135,18 +134,14 @@ class GpuSolve(GpuOp):
dev_info_ptr = thunk.dev_info.gpudata dev_info_ptr = thunk.dev_info.gpudata
cusolver.cusolverDnSgetrf( cusolver.cusolverDnSgetrf(
cusolver_handle[0], n, l, A_ptr, lda, workspace_ptr, cusolver_handle, n, l, A_ptr, lda, workspace_ptr,
pivots_ptr, dev_info_ptr) pivots_ptr, dev_info_ptr)
cusolver.cusolverDnSgetrs( cusolver.cusolverDnSgetrs(
cusolver_handle[0], trans, n, m, A_ptr, lda, cusolver_handle, trans, n, m, A_ptr, lda,
pivots_ptr, b_ptr, ldb, dev_info_ptr) pivots_ptr, b_ptr, ldb, dev_info_ptr)
return A_, b_ # Convert b to F-order from C-order and assign it to output.
A_pycuda, b_pycuda = cusolver_gpu_solve(A_cpy, b_cpy, trans)
# Convert b to F-order from c-order and assign it to output:
b_cpy = b_cpy.reshape(b.shape[::-1]) b_cpy = b_cpy.reshape(b.shape[::-1])
b_cpy = dimshuffle(b_cpy, (1, 0)) b_cpy = dimshuffle(b_cpy, (1, 0))
z[0] = b_cpy z[0] = b_cpy
...@@ -161,4 +156,4 @@ class GpuSolve(GpuOp): ...@@ -161,4 +156,4 @@ class GpuSolve(GpuOp):
return thunk return thunk
gpu_solve = GpuSolve() gpu_solve = GpuCusolverSolve()
from __future__ import absolute_import, division, print_function
import unittest
import numpy
import theano
from theano.tests import unittest_tools as utt
# Skip tests if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray
from theano.misc.pycuda_init import pycuda_available
from theano.sandbox.cuda.cusolver import cusolver_available
from theano.sandbox.cuda import cusolver
# Hard prerequisites for this test module: a working CUDA backend, the pycuda
# bindings, and the scikits.cuda cusolver wrapper. If any of them is missing
# there is nothing to test, so skip the whole module at import time.
if not cuda_ndarray.cuda_available:
    raise SkipTest('Optional package cuda not available')
if not pycuda_available:
    raise SkipTest('Optional package pycuda not available')
if not cusolver_available:
    raise SkipTest('Optional package scikits.cuda.cusolver not available')

# Choose a Theano compilation mode with GPU optimizations enabled. When the
# configured mode is FAST_COMPILE, base the GPU mode on FAST_RUN instead;
# otherwise extend the default mode with the 'gpu' optimizations.
if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
else:
    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
class TestCula(unittest.TestCase):
    """Tests for the cuSOLVER-backed GPU linear solver ``cusolver.gpu_solve``.

    Each test fabricates a right-hand side ``b = A . x`` from a known
    solution ``x``, solves ``A x = b`` on the GPU, and checks that the
    recovered solution matches the known one.
    """

    def run_gpu_solve(self, A_val, x_val):
        """Solve ``A_val . x = A_val . x_val`` on the GPU; compare to x_val."""
        rhs_val = numpy.dot(A_val, x_val)
        A = theano.tensor.matrix("A", dtype="float32")
        b = theano.tensor.matrix("b", dtype="float32")
        solve_fn = theano.function([A, b], [cusolver.gpu_solve(A, b)])
        solution = numpy.array(solve_fn(A_val, rhs_val)[0])
        utt.assert_allclose(solution, x_val)

    def test_diag_solve(self):
        """Diagonal coefficient matrix."""
        numpy.random.seed(1)
        mat = numpy.asarray([[2, 0, 0], [0, 1, 0], [0, 0, 1]],
                            dtype="float32")
        sol = numpy.random.uniform(-0.4, 0.4,
                                   (mat.shape[1], 1)).astype("float32")
        self.run_gpu_solve(mat, sol)

    def test_sym_solve(self):
        """Symmetrized random coefficient matrix."""
        numpy.random.seed(1)
        base = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        sym = (base + base.T) / 2.0
        sol = numpy.random.uniform(-0.4, 0.4,
                                   (base.shape[1], 1)).astype("float32")
        self.run_gpu_solve(sym, sol)

    def test_orth_solve(self):
        """Orthogonal coefficient matrix (left singular vectors)."""
        numpy.random.seed(1)
        base = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        orth = numpy.linalg.svd(base)[0]
        sol = numpy.random.uniform(-0.4, 0.4,
                                   (orth.shape[1], 1)).astype("float32")
        self.run_gpu_solve(orth, sol)

    def test_uni_rand_solve(self):
        """Dense uniform-random matrix with a multi-column right-hand side."""
        numpy.random.seed(1)
        mat = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        sol = numpy.random.uniform(-0.4, 0.4,
                                   (mat.shape[1], 4)).astype("float32")
        self.run_gpu_solve(mat, sol)
Markdown formatting is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment