提交 c4bcd45e 作者: Caglar

pascal's changes.

上级 8fbc2d0a
import theano import theano
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp, CudaNdarray from theano.sandbox.cuda import GpuOp, CudaNdarray
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable, from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
gpu_contiguous) gpu_contiguous)
from theano.tensor import as_tensor_variable from theano.tensor import as_tensor_variable
from scikits.cuda import cula from scikits.cuda import cula
try: try:
from scikits.cuda import cula from scikits.cuda import cula
scikits_cuda_available = True scikits_cuda_available = True
except ImportError: except ImportError:
scikits_cuda_available = False scikits_cuda_available = False
if cula is not None:
cula.culaInitialize()
import numpy import numpy
class GpuSolve(GpuOp): class GpuSolve(GpuOp):
...@@ -32,8 +38,8 @@ class GpuSolve(GpuOp): ...@@ -32,8 +38,8 @@ class GpuSolve(GpuOp):
return CudaNdarrayType(broadcastable=[False] * inp.type.ndim) return CudaNdarrayType(broadcastable=[False] * inp.type.ndim)
def make_node(self, inp1, inp2): def make_node(self, inp1, inp2):
inp1 = gpu_contiguous(as_cuda_ndarray_variable(inp1)) inp1 = as_cuda_ndarray_variable(inp1)
inp2 = gpu_contiguous(as_cuda_ndarray_variable(inp2)) inp2 = as_cuda_ndarray_variable(inp2)
assert inp1.dtype == "float32" assert inp1.dtype == "float32"
assert inp2.dtype == "float32" assert inp2.dtype == "float32"
...@@ -49,20 +55,23 @@ class GpuSolve(GpuOp): ...@@ -49,20 +55,23 @@ class GpuSolve(GpuOp):
def thunk(): def thunk():
input_shape = inputs[1][0].shape input_shape = inputs[1][0].shape
#size of the matrices to invert #size of the matrices to invert
z = outputs[0] z = outputs[0]
#Matrix #Matrix
A = inputs[0][0] A = inputs[0][0]
#Solution vectors #Solution vectors
b = inputs[1][0] b = inputs[1][0]
A_cpy = A.copy()
b_cpy = b.copy()
A_pycuda = to_gpuarray(A) A_pycuda = to_gpuarray(A_cpy)
b_pycuda = to_gpuarray(b) b_pycuda = to_gpuarray(b_cpy)
def cula_gpu_solve(A, b): def cula_gpu_solve(A, b):
cula.culaInitialize()
A_shape = A.shape A_shape = A.shape
b_shape = b.shape b_shape = b.shape
assert(len(A_shape) == 2) assert(len(A_shape) == 2)
...@@ -73,6 +82,7 @@ class GpuSolve(GpuOp): ...@@ -73,6 +82,7 @@ class GpuSolve(GpuOp):
n = A_shape[0] n = A_shape[0]
nrhs = b_shape[1] nrhs = b_shape[1]
#Create the integer pivot vector to store the indices for #Create the integer pivot vector to store the indices for
#permutation matrix. #permutation matrix.
ipiv = CudaNdarray.zeros((n,)) ipiv = CudaNdarray.zeros((n,))
...@@ -84,8 +94,8 @@ class GpuSolve(GpuOp): ...@@ -84,8 +94,8 @@ class GpuSolve(GpuOp):
# construct pointer arrays needed for culaDeviceSgels # construct pointer arrays needed for culaDeviceSgels
# Cula requires you to pass a pointer for A and b. # Cula requires you to pass a pointer for A and b.
A_ptr = A.gpudata A_ptr = A_cpy.gpudata
b_ptr = b.gpudata b_ptr = b_cpy.gpudata
ipiv_ptr = ipiv.gpudata ipiv_ptr = ipiv.gpudata
cula.culaDeviceSgesv(n, nrhs, A_ptr, lda, ipiv_ptr, b_ptr, ldb) cula.culaDeviceSgesv(n, nrhs, A_ptr, lda, ipiv_ptr, b_ptr, ldb)
......
Markdown 格式
0%
您即将添加 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
请 注册 或者 登录 后发表评论