提交 ce066d1d authored 作者: Caglar's avatar Caglar

Added the test.

上级 477b54a4
import theano
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda import GpuOp, CudaNdarray
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
gpu_contiguous)
from theano.tensor import as_tensor_variable
from scikits.cuda import cula
try:
from scikits.cuda import cula
scikits_cuda_available = True
except ImportError:
scikits_cuda_available = False
def cula_gpu_solve(A, b, trans='N'):
cula.culaInitialize()
A_shape = A.shape
b_shape = b.shape
assert(len(A_shape) == 2)
assert(len(b_shape) == 2)
import string
if trans in ['T', 'C']:
l, n = A_shape
m, k = b_shape
elif trans in ['N']:
n, l = A_shape
k, m = b_shape
else:
raise ValueError('Invalid value for trans')
if n != k:
raise ValueError('A and b must be aligned.')
if trans == 'n':
lda = max(1, n)
else:
lda = max(1, l)
ldb = max(1, k)
# construct pointer arrays needed for culaDeviceSgels
# Cula requires you to pass a pointer for A and b.
A_ptr = A.gpudata
b_ptr = b.gpudata
cula.culaDeviceSgels(trans, n, l, m, A_ptr, lda, b_ptr, ldb)
return A, b
import numpy
class GpuSolve(GpuOp):
"""
Cula Gpu solver OP.
CULA GPU solver OP.
"""
def __init__(self, trans='N'):
self.trans = trans
......@@ -59,7 +29,7 @@ class GpuSolve(GpuOp):
return hash(type(self))
def output_type(self, inp):
return cuda.CudaNdarrayType(broadcastable=[False] * inp.type.ndim)
return CudaNdarrayType(broadcastable=[False] * inp.type.ndim)
def make_node(self, inp1, inp2):
inp1 = gpu_contiguous(as_cuda_ndarray_variable(inp1))
......@@ -71,32 +41,58 @@ class GpuSolve(GpuOp):
assert inp2.ndim == 2
return theano.Apply(self, [inp1, inp2], [self.output_type(inp1)()])
def make_thunk(self, node, storage_map, _, _2):
def make_thunk(self, node, storage_map, _, no_recycling=[]):
from theano.misc.pycuda_utils import to_gpuarray
inputs = [storage_map[v] for v in node.inputs]
outputs = [storage_map[v] for v in node.outputs]
def thunk():
input_shape = inputs[0][0].shape
input_shape = inputs[1][0].shape
#size of the matrices to invert
size = input_shape[1]
z = outputs[0]
if z[0] is None or z[0].shape != input_shape:
z[0] = cuda.CudaNdarray.zeros(input_shape)
#Matrix
A = inputs[0][0]
#Solution vectors
b = inputs[0][1]
b = inputs[1][0]
A_pycuda = to_gpuarray(A)
b_pycuda = to_gpuarray(b)
cula_gpu_solve(A_pycuda, b_pycuda, self.trans)
def cula_gpu_solve(A, b):
cula.culaInitialize()
A_shape = A.shape
b_shape = b.shape
assert(len(A_shape) == 2)
assert(len(b_shape) == 2)
if A_shape[0] != A_shape[1]:
raise ValueError('Coefficient matrix should be a square matrix.')
n = A_shape[0]
nrhs = b_shape[1]
#Create the integer pivot vector to store the indices for
#permutation matrix.
ipiv = CudaNdarray.zeros((n,))
ipiv = to_gpuarray(ipiv)
import string
lda = max(1, n)
ldb = max(1, n)
# construct pointer arrays needed for culaDeviceSgels
# Cula requires you to pass a pointer for A and b.
A_ptr = A.gpudata
b_ptr = b.gpudata
ipiv_ptr = ipiv.gpudata
cula.culaDeviceSgesv(n, nrhs, A_ptr, lda, ipiv_ptr, b_ptr, ldb)
return A, b
A, b = cula_gpu_solve(A_pycuda, b_pycuda)
z[0] = b
thunk.inputs = inputs
thunk.outputs = outputs
......
import unittest
import numpy
import theano
from theano.tests import unittest_tools as utt
# Skip tests if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray
if not cuda_ndarray.cuda_available:
raise SkipTest('Optional package cuda not available')
from theano.misc.pycuda_init import pycuda_available
from theano.sandbox.cuda.cula import scikits_cuda_available
if not pycuda_available:
raise SkipTest('Optional package pycuda not available')
if not scikits_cuda_available:
raise SkipTest('Optional package scikits.cuda not available')
from theano.sandbox.cuda import cula
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
class TestCula(unittest.TestCase):
def run_gpu_solve(self, A_val, x_val):
b_val = numpy.dot(A_val, x_val)
x_res = numpy.zeros((x_val.shape[0], x_val.shape[1])).astype("float32")
A = theano.tensor.matrix("A", dtype="float32")
b = theano.tensor.matrix("b", dtype="float32")
solver = cula.gpu_solve(A, b)
fn = theano.function([A, b], [solver])
res = fn(A_val, b_val)
res[0].get(x_res)
utt.assert_allclose(x_res, x_val)
def test_diag_solve(self):
A_val = numpy.asarray([[2, 0, 0], [0, 1, 0], [0, 0, 1]], dtype="float32")
x_val = numpy.random.uniform(-0.4, 0.4, (A_val.shape[1], 1)).astype("float32")
self.run_gpu_solve(A_val, x_val)
def test_sym_solve(self):
A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
A_sym = (A_val + A_val.T) / 2.0
x_val = numpy.random.uniform(-0.4, 0.4, (A_val.shape[1], 1)).astype("float32")
self.run_gpu_solve(A_sym, x_val)
def test_orth_solve(self):
A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
A_orth = numpy.linalg.svd(A_val)[0]
x_val = numpy.random.uniform(-0.4, 0.4, (A_orth.shape[1], A_orth.shape[0])).astype("float32")
self.run_gpu_solve(A_orth, x_val)
def test_uni_rand_solve(self):
A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
x_val = numpy.random.uniform(-0.4, 0.4, (A_val.shape[1], 1)).astype("float32")
self.run_gpu_solve(A_val, x_val)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论