Commit 7c95b025 authored by Caglar

Added the opt code.

Parent: a2fe5c5d
......@@ -7,6 +7,7 @@ from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
from theano.tensor import as_tensor_variable
from scikits.cuda import cula
from theano.sandbox.cuda import cuda_ndarray
try:
from scikits.cuda import cula
......@@ -19,13 +20,15 @@ if cula is not None:
import numpy
class GpuSolve(GpuOp):
"""
CULA GPU solver OP.
trans: Whether to take the transpose of the input matrix or not. By default,
we will take the transpose of the input matrix, before feeding it into the Op.
That is mainly, because that CULA requires inputs to be in Fortran order.
"""
def __init__(self, trans='N'):
def __init__(self, trans='T'):
self.trans = trans
super(GpuSolve, self).__init__()
......@@ -48,7 +51,11 @@ class GpuSolve(GpuOp):
assert inp2.ndim == 2
return theano.Apply(self, [inp1, inp2], [self.output_type(inp1)()])
def make_thunk(self, node, storage_map, _, no_recycling=[]):
def make_thunk(self,
node,
storage_map, _,
no_recycling=[]):
from theano.misc.pycuda_utils import to_gpuarray
inputs = [storage_map[v] for v in node.inputs]
......@@ -56,53 +63,51 @@ class GpuSolve(GpuOp):
def thunk():
input_shape = inputs[1][0].shape
#size of the matrices to invert
z = outputs[0]
#Matrix
A = inputs[0][0]
#Solution vectors
b = inputs[1][0]
#A_cpy = A.copy()
#b_cpy = b.copy()
b = cuda_ndarray.dimshuffle(b, 1, 0)
b_cpy = b.copy()
A_pycuda = to_gpuarray(A)
b_pycuda = to_gpuarray(b)
def cula_gpu_solve(A, b, trans='N'):
def cula_gpu_solve(A_, b_, trans='T'):
A_shape = A.shape
b_shape = b.shape
A_shape = A_.shape
b_shape = b_.shape
assert(len(A_shape) == 2)
assert(len(b_shape) == 2)
import string
if trans in ['T', 'C']:
l, n = A_shape
k, m = b_shape
if n != m:
raise ValueError('A and b must be aligned.')
elif trans in ['N']:
n, l = A_shape
k, m = b_shape
if l != m:
raise ValueError('A and b must be aligned.')
else:
raise ValueError('Invalid value for trans')
if n != k:
raise ValueError('A and b must be aligned.')
if trans == 'N':
lda = max(1, n)
else:
lda = max(1, l)
ldb = max(1, k)
lda = max(1, n)
ldb = max(1, n, l)
# construct pointer arrays needed for culaDeviceSgels
# Cula requires you to pass a pointer for A and b.
A_ptr = A.gpudata
b_ptr = b.gpudata
A_ptr = A_.gpudata
b_ptr = b_.gpudata
cula.culaDeviceSgels(trans, n, l, m, A_ptr, lda, b_ptr, ldb)
return A, b
......@@ -116,4 +121,4 @@ class GpuSolve(GpuOp):
return thunk
gpu_solve = GpuSolve(trans="T")
gpu_solve = GpuSolve()
......@@ -25,21 +25,27 @@ from theano.sandbox.cuda.basic_ops import (
GpuSubtensor, GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20,
GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit)
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar,
gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv,
GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights,
GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights)
from theano.sandbox.cuda.blas import gpu_gemv_inplace
from theano.sandbox.cuda.cula import gpu_solve
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace
from theano.sandbox.cuda.blas import gpu_ger_no_inplace
from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax,
GpuDownsampleFactorMaxGrad, GpuDownsampleFactorMaxGradGrad)
from theano.sandbox.cuda.nnet import (
GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmax, GpuSoftmaxWithBias)
from theano.sandbox.cuda.elemwise import SupportCodeError
from theano.scalar.basic_scipy import Erfinv
from theano.sandbox.cuda.elemwise import erfinv_gpu
......@@ -47,7 +53,10 @@ from theano.sandbox.cuda.var import CudaNdarrayConstant
from theano.sandbox.cuda import gpu_optimizer, register_opt, gpu_seqopt, GpuOp
from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.blas import _is_real_vector, _is_real_matrix
from theano.tensor import nlinalg
from theano.tensor import slinalg
from theano.tensor.nnet.Conv3D import Conv3D
try:
......@@ -540,6 +549,31 @@ def local_gpu_dot22scalar(node):
return False
@register_opt()
@local_optimizer([gpu_from_host, slinalg.Solve])
def local_gpu_solve(node):
    """Move a CPU ``slinalg.Solve`` onto the GPU.

    Rewrites either of the two host/device transfer patterns:

        gpu_from_host(CpuSolve)  -> GpuSolve(gpu_from_host)
        CpuSolve(host_from_gpu)  -> host_from_gpu(GpuSolve)

    Returns the replacement node list, or False when the pattern
    does not match.
    """
    op = node.op
    if isinstance(op, GpuFromHost):
        producer = node.inputs[0].owner
        if producer is not None and isinstance(producer.op, slinalg.Solve):
            a, b = producer.inputs
            return [gpu_solve(gpu_from_host(a), gpu_from_host(b))]
    if isinstance(op, slinalg.Solve):
        # Only lift the op when at least one operand already lives on
        # the GPU (comes through a host_from_gpu transfer).
        touches_gpu = any(inp.owner and isinstance(inp.owner.op, HostFromGpu)
                          for inp in node.inputs)
        if touches_gpu:
            a, b = node.inputs
            return [host_from_gpu(gpu_solve(gpu_from_host(a),
                                            gpu_from_host(b)))]
    return False
@register_opt()
@local_optimizer([gpu_from_host, tensor.blas_c.CGemv, tensor.blas.Gemv])
def local_gpu_gemv(node):
......
......@@ -28,14 +28,15 @@ class TestCula(unittest.TestCase):
def run_gpu_solve(self, A_val, x_val):
b_val = numpy.dot(A_val, x_val)
x_res = numpy.zeros((x_val.shape[0], x_val.shape[1])).astype("float32")
b_val = b_val.T.reshape((b_val.shape[0], b_val.shape[1]))
A = theano.tensor.matrix("A", dtype="float32")
b = theano.tensor.matrix("b", dtype="float32")
solver = cula.gpu_solve(A, b)
fn = theano.function([A, b], [solver])
res = fn(A_val, b_val)
x_res = numpy.array(res[0])
x_res = x_res.reshape((x_res.shape[1], x_res.shape[0])).T
utt.assert_allclose(x_res, x_val)
def test_diag_solve(self):
......@@ -52,11 +53,10 @@ class TestCula(unittest.TestCase):
def test_orth_solve(self):
A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
A_orth = numpy.linalg.svd(A_val)[0]
#import ipdb; ipdb.set_trace()
x_val = numpy.random.uniform(-0.4, 0.4, (A_orth.shape[1], 1)).astype("float32")
self.run_gpu_solve(A_orth, x_val)
def test_uni_rand_solve(self):
A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
x_val = numpy.random.uniform(-0.4, 0.4, (A_val.shape[1], 1)).astype("float32")
x_val = numpy.random.uniform(-0.4, 0.4, (A_val.shape[1], 4)).astype("float32")
self.run_gpu_solve(A_val, x_val)
......@@ -536,12 +536,34 @@ def test_erfinvgpu():
assert numpy.allclose(f(xv), f2(xv))
def test_local_gpu_solve():
    """Check that ``slinalg.solve`` graphs are rewritten to use GpuSolve."""

    def check(a_shape, b_shape):
        # Build a random system A x = b with both operands on the GPU.
        a_np = numpy.random.uniform(-0.4, 0.4, a_shape).astype('float32')
        b_np = numpy.random.uniform(-0.4, 0.4, b_shape).astype('float32')
        a = cuda.shared_constructor(a_np, 'a')
        b = cuda.shared_constructor(b_np, 'b')
        f = pfunc([], tensor.slinalg.solve(a, b), mode=mode_with_gpu)
        # The compiled graph must contain the GPU solver op...
        node_op = f.maker.fgraph.toposort()[1].inputs[0].owner.op
        assert isinstance(node_op, cuda.cula.GpuSolve)
        # ...and the local optimizer must fire on a fresh CPU graph.
        assert cuda.opt.local_gpu_solve.transform(
            tensor.slinalg.solve(a, b).owner)
        out = f()
        # The solution must actually satisfy A x = b numerically.
        assert numpy.allclose(numpy.dot(a_np, out), b_np)

    check((6, 6), (6, 1))
    check((5, 5), (5, 3))
def test_local_gpu_dot_to_dot22dot():
def cmp(a_shp, b_shp):
a0 = numpy.random.rand(*a_shp).astype('float32')
a = cuda.shared_constructor(a0, 'a')
b0 = numpy.random.rand(*b_shp).astype('float32')
b = cuda.shared_constructor(b0, 'a')
b = cuda.shared_constructor(b0, 'b')
f = pfunc([], tensor.dot(a, b), mode=mode_with_gpu)
assert cuda.opt.local_gpu_dot_to_dot22.transform(
......
Markdown format supported
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to comment