提交 90cd2169 authored 作者: Thomas George's avatar Thomas George

implementation cusolver solve general case + tests

上级 b8f3b638
...@@ -10,7 +10,7 @@ from pygpu import gpuarray ...@@ -10,7 +10,7 @@ from pygpu import gpuarray
cusolver_available = False cusolver_available = False
try: try:
from scikits.cuda import cusolver from skcuda import cusolver
cusolver_available = True cusolver_available = True
except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound): except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound):
pass pass
...@@ -30,8 +30,11 @@ class GpuCusolverSolve(Op): ...@@ -30,8 +30,11 @@ class GpuCusolverSolve(Op):
__props__ = ('trans',) __props__ = ('trans',)
def __init__(self, trans='N'): def __init__(self, trans='N', inplace=False):
self.trans = trans self.trans = trans
self.inplace = inplace
if self.inplace:
self.destroy_map = {0: [0, 1]}
super(GpuCusolverSolve, self).__init__() super(GpuCusolverSolve, self).__init__()
def make_node(self, inp1, inp2): def make_node(self, inp1, inp2):
...@@ -52,7 +55,7 @@ class GpuCusolverSolve(Op): ...@@ -52,7 +55,7 @@ class GpuCusolverSolve(Op):
return theano.Apply( return theano.Apply(
self, [inp1, inp2], self, [inp1, inp2],
[GpuArrayType('float32', [GpuArrayType('float32',
broadcastable=[False] * inp1.ndim, broadcastable=inp1.broadcastable,
context_name=self.context)()]) context_name=self.context)()])
def make_thunk(self, def make_thunk(self,
...@@ -86,14 +89,12 @@ class GpuCusolverSolve(Op): ...@@ -86,14 +89,12 @@ class GpuCusolverSolve(Op):
assert(len(A.shape) == 2) assert(len(A.shape) == 2)
assert(len(b.shape) == 2) assert(len(b.shape) == 2)
# A is not explicitly converted between C and F order, instead we
# switch the "transpose" flag.
if self.trans in ['T', 'C']: if self.trans in ['T', 'C']:
trans = 0 trans = 1
l, n = A.shape l, n = A.shape
k, m = b.shape k, m = b.shape
elif self.trans == 'N': elif self.trans == 'N':
trans = 1 trans = 0
n, l = A.shape n, l = A.shape
k, m = b.shape k, m = b.shape
else: else:
...@@ -106,13 +107,18 @@ class GpuCusolverSolve(Op): ...@@ -106,13 +107,18 @@ class GpuCusolverSolve(Op):
lda = max(1, n) lda = max(1, n)
ldb = max(1, k, m) ldb = max(1, k, m)
if trans == 0: # We copy A and b as cusolver operates inplace
A = gpuarray.asfortranarray(A) if not self.inplace:
trans = 1 A = gpuarray.array(A, copy=True)
b = gpuarray.array(b, copy=True, order='F')
A_ptr = A.gpudata A_ptr = A.gpudata
# We copy b as cusolver operates inplace b_ptr = b.gpudata
b_cpy = gpuarray.array(b, copy=True, order='F')
b_ptr = b_cpy.gpudata # cusolver expects a F ordered matrix, but A is not explicitly
# converted between C and F order, instead we switch the
# "transpose" flag.
if A.flags['C_CONTIGUOUS']:
trans = 1 - trans
workspace_size = cusolver.cusolverDnSgetrf_bufferSize( workspace_size = cusolver.cusolverDnSgetrf_bufferSize(
cusolver_handle, n, n, A_ptr, lda) cusolver_handle, n, n, A_ptr, lda)
...@@ -145,7 +151,7 @@ class GpuCusolverSolve(Op): ...@@ -145,7 +151,7 @@ class GpuCusolverSolve(Op):
cusolver_handle, trans, n, m, A_ptr, lda, cusolver_handle, trans, n, m, A_ptr, lda,
pivots_ptr, b_ptr, ldb, dev_info_ptr) pivots_ptr, b_ptr, ldb, dev_info_ptr)
z[0] = b_cpy z[0] = b
thunk.inputs = inputs thunk.inputs = inputs
thunk.outputs = outputs thunk.outputs = outputs
......
...@@ -5,36 +5,27 @@ import numpy ...@@ -5,36 +5,27 @@ import numpy
import theano import theano
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from .config import mode_with_gpu
# Skip tests if cuda_ndarray is not available. # Skip tests if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray from theano.gpuarray.linalg import (cusolver_available, gpu_solve)
from theano.misc.pycuda_init import pycuda_available
from theano.sandbox.cuda.cusolver import cusolver_available
from theano.sandbox.cuda import cusolver
if not cuda_ndarray.cuda_available:
raise SkipTest('Optional package cuda not available')
if not pycuda_available:
raise SkipTest('Optional package pycuda not available')
if not cusolver_available: if not cusolver_available:
raise SkipTest('Optional package scikits.cuda.cusolver not available') raise SkipTest('Optional package scikits.cuda.cusolver not available')
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
class TestCusolver(unittest.TestCase): class TestCusolver(unittest.TestCase):
def run_gpu_solve(self, A_val, x_val): def run_gpu_solve(self, A_val, x_val, trans='N'):
b_val = numpy.dot(A_val, x_val) if trans == 'N':
b_val = numpy.dot(A_val, x_val)
else:
b_val = numpy.dot(A_val.T, x_val)
A = theano.tensor.matrix("A", dtype="float32") A = theano.tensor.matrix("A", dtype="float32")
b = theano.tensor.matrix("b", dtype="float32") b = theano.tensor.matrix("b", dtype="float32")
solver = cusolver.gpu_solve(A, b) solver = gpu_solve(A, b, trans)
fn = theano.function([A, b], [solver]) fn = theano.function([A, b], [solver], mode=mode_with_gpu)
res = fn(A_val, b_val) res = fn(A_val, b_val)
x_res = numpy.array(res[0]) x_res = numpy.array(res[0])
utt.assert_allclose(x_res, x_val) utt.assert_allclose(x_res, x_val)
...@@ -69,3 +60,10 @@ class TestCusolver(unittest.TestCase): ...@@ -69,3 +60,10 @@ class TestCusolver(unittest.TestCase):
x_val = numpy.random.uniform(-0.4, 0.4, x_val = numpy.random.uniform(-0.4, 0.4,
(A_val.shape[1], 4)).astype("float32") (A_val.shape[1], 4)).astype("float32")
self.run_gpu_solve(A_val, x_val) self.run_gpu_solve(A_val, x_val)
def test_uni_rand_solve_transpose(self):
numpy.random.seed(1)
A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
x_val = numpy.random.uniform(-0.4, 0.4,
(A_val.shape[1], 4)).astype("float32")
self.run_gpu_solve(A_val, x_val, trans='T')
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论