提交 7c95b025 authored 作者: Caglar's avatar Caglar

Added the opt code.

上级 a2fe5c5d
...@@ -7,6 +7,7 @@ from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable, ...@@ -7,6 +7,7 @@ from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
from theano.tensor import as_tensor_variable from theano.tensor import as_tensor_variable
from scikits.cuda import cula from scikits.cuda import cula
from theano.sandbox.cuda import cuda_ndarray
try: try:
from scikits.cuda import cula from scikits.cuda import cula
...@@ -19,13 +20,15 @@ if cula is not None: ...@@ -19,13 +20,15 @@ if cula is not None:
import numpy import numpy
class GpuSolve(GpuOp): class GpuSolve(GpuOp):
""" """
CULA GPU solver OP. CULA GPU solver OP.
trans: Whether to take the transpose of the input matrix or not. By default
('T'), the input matrix is transposed before it is fed into the Op, mainly
because CULA requires its inputs to be in Fortran (column-major) order.
""" """
def __init__(self, trans='N'): def __init__(self, trans='T'):
self.trans = trans self.trans = trans
super(GpuSolve, self).__init__() super(GpuSolve, self).__init__()
...@@ -48,7 +51,11 @@ class GpuSolve(GpuOp): ...@@ -48,7 +51,11 @@ class GpuSolve(GpuOp):
assert inp2.ndim == 2 assert inp2.ndim == 2
return theano.Apply(self, [inp1, inp2], [self.output_type(inp1)()]) return theano.Apply(self, [inp1, inp2], [self.output_type(inp1)()])
def make_thunk(self, node, storage_map, _, no_recycling=[]): def make_thunk(self,
node,
storage_map, _,
no_recycling=[]):
from theano.misc.pycuda_utils import to_gpuarray from theano.misc.pycuda_utils import to_gpuarray
inputs = [storage_map[v] for v in node.inputs] inputs = [storage_map[v] for v in node.inputs]
...@@ -56,53 +63,51 @@ class GpuSolve(GpuOp): ...@@ -56,53 +63,51 @@ class GpuSolve(GpuOp):
def thunk(): def thunk():
input_shape = inputs[1][0].shape input_shape = inputs[1][0].shape
#size of the matrices to invert #size of the matrices to invert
z = outputs[0] z = outputs[0]
#Matrix #Matrix
A = inputs[0][0] A = inputs[0][0]
#Solution vectors #Solution vectors
b = inputs[1][0] b = inputs[1][0]
#A_cpy = A.copy() b = cuda_ndarray.dimshuffle(b, 1, 0)
#b_cpy = b.copy() b_cpy = b.copy()
A_pycuda = to_gpuarray(A) A_pycuda = to_gpuarray(A)
b_pycuda = to_gpuarray(b) b_pycuda = to_gpuarray(b)
def cula_gpu_solve(A, b, trans='N'): def cula_gpu_solve(A_, b_, trans='T'):
A_shape = A.shape A_shape = A_.shape
b_shape = b.shape b_shape = b_.shape
assert(len(A_shape) == 2) assert(len(A_shape) == 2)
assert(len(b_shape) == 2) assert(len(b_shape) == 2)
import string
if trans in ['T', 'C']: if trans in ['T', 'C']:
l, n = A_shape l, n = A_shape
k, m = b_shape k, m = b_shape
if n != m:
raise ValueError('A and b must be aligned.')
elif trans in ['N']: elif trans in ['N']:
n, l = A_shape n, l = A_shape
k, m = b_shape k, m = b_shape
if l != m:
raise ValueError('A and b must be aligned.')
else: else:
raise ValueError('Invalid value for trans') raise ValueError('Invalid value for trans')
if n != k:
raise ValueError('A and b must be aligned.')
if trans == 'N':
lda = max(1, n)
else:
lda = max(1, l)
ldb = max(1, k) lda = max(1, n)
ldb = max(1, n, l)
# construct pointer arrays needed for culaDeviceSgels # construct pointer arrays needed for culaDeviceSgels
# Cula requires you to pass a pointer for A and b. # Cula requires you to pass a pointer for A and b.
A_ptr = A.gpudata A_ptr = A_.gpudata
b_ptr = b.gpudata b_ptr = b_.gpudata
cula.culaDeviceSgels(trans, n, l, m, A_ptr, lda, b_ptr, ldb) cula.culaDeviceSgels(trans, n, l, m, A_ptr, lda, b_ptr, ldb)
return A, b return A, b
...@@ -116,4 +121,4 @@ class GpuSolve(GpuOp): ...@@ -116,4 +121,4 @@ class GpuSolve(GpuOp):
return thunk return thunk
gpu_solve = GpuSolve(trans="T") gpu_solve = GpuSolve()
...@@ -25,21 +25,27 @@ from theano.sandbox.cuda.basic_ops import ( ...@@ -25,21 +25,27 @@ from theano.sandbox.cuda.basic_ops import (
GpuSubtensor, GpuAdvancedSubtensor1, GpuSubtensor, GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20, GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20,
GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit) GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit)
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar, from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar,
gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv, gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv,
GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights, GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights,
GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights) GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights)
from theano.sandbox.cuda.blas import gpu_gemv_inplace from theano.sandbox.cuda.blas import gpu_gemv_inplace
from theano.sandbox.cuda.cula import gpu_solve
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace from theano.sandbox.cuda.blas import gpu_gemv_no_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace from theano.sandbox.cuda.blas import gpu_ger_inplace
from theano.sandbox.cuda.blas import gpu_ger_no_inplace from theano.sandbox.cuda.blas import gpu_ger_no_inplace
from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax, from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax,
GpuDownsampleFactorMaxGrad, GpuDownsampleFactorMaxGradGrad) GpuDownsampleFactorMaxGrad, GpuDownsampleFactorMaxGradGrad)
from theano.sandbox.cuda.nnet import ( from theano.sandbox.cuda.nnet import (
GpuCrossentropySoftmaxArgmax1HotWithBias, GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx, GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmax, GpuSoftmaxWithBias) GpuSoftmax, GpuSoftmaxWithBias)
from theano.sandbox.cuda.elemwise import SupportCodeError from theano.sandbox.cuda.elemwise import SupportCodeError
from theano.scalar.basic_scipy import Erfinv from theano.scalar.basic_scipy import Erfinv
from theano.sandbox.cuda.elemwise import erfinv_gpu from theano.sandbox.cuda.elemwise import erfinv_gpu
...@@ -47,7 +53,10 @@ from theano.sandbox.cuda.var import CudaNdarrayConstant ...@@ -47,7 +53,10 @@ from theano.sandbox.cuda.var import CudaNdarrayConstant
from theano.sandbox.cuda import gpu_optimizer, register_opt, gpu_seqopt, GpuOp from theano.sandbox.cuda import gpu_optimizer, register_opt, gpu_seqopt, GpuOp
from theano.scan_module import scan_utils, scan_op, scan_opt from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.blas import _is_real_vector, _is_real_matrix from theano.tensor.blas import _is_real_vector, _is_real_matrix
from theano.tensor import nlinalg from theano.tensor import nlinalg
from theano.tensor import slinalg
from theano.tensor.nnet.Conv3D import Conv3D from theano.tensor.nnet.Conv3D import Conv3D
try: try:
...@@ -540,6 +549,31 @@ def local_gpu_dot22scalar(node): ...@@ -540,6 +549,31 @@ def local_gpu_dot22scalar(node):
return False return False
@register_opt()
@local_optimizer([gpu_from_host, slinalg.Solve])
def local_gpu_solve(node):
    """
    Rewrite a CPU ``slinalg.Solve`` node so that it uses ``gpu_solve``.

    Two graph patterns are handled:

    gpu_from_host(CpuSolve) -> GpuSolve(gpu_from_host)

    CpuSolve(host_from_gpu) -> host_from_gpu(GpuSolve)

    Returns a one-element list holding the replacement variable, or
    ``False`` when the node matches neither pattern.
    """
    op = node.op

    if isinstance(op, GpuFromHost):
        # Pattern 1: the value being transferred to the GPU was produced
        # by a CPU Solve, so transfer the operands and solve there.
        producer = node.inputs[0].owner
        if producer and isinstance(producer.op, slinalg.Solve):
            lhs, rhs = producer.inputs
            return [gpu_solve(gpu_from_host(lhs), gpu_from_host(rhs))]

    if isinstance(op, slinalg.Solve):
        # Pattern 2: at least one operand of the CPU Solve comes straight
        # from a host_from_gpu transfer.
        fed_from_gpu = any(
            inp.owner and isinstance(inp.owner.op, HostFromGpu)
            for inp in node.inputs
        )
        if fed_from_gpu:
            lhs, rhs = node.inputs
            gpu_result = gpu_solve(gpu_from_host(lhs), gpu_from_host(rhs))
            return [host_from_gpu(gpu_result)]

    return False
@register_opt() @register_opt()
@local_optimizer([gpu_from_host, tensor.blas_c.CGemv, tensor.blas.Gemv]) @local_optimizer([gpu_from_host, tensor.blas_c.CGemv, tensor.blas.Gemv])
def local_gpu_gemv(node): def local_gpu_gemv(node):
......
...@@ -28,14 +28,15 @@ class TestCula(unittest.TestCase): ...@@ -28,14 +28,15 @@ class TestCula(unittest.TestCase):
def run_gpu_solve(self, A_val, x_val): def run_gpu_solve(self, A_val, x_val):
b_val = numpy.dot(A_val, x_val) b_val = numpy.dot(A_val, x_val)
x_res = numpy.zeros((x_val.shape[0], x_val.shape[1])).astype("float32") b_val = b_val.T.reshape((b_val.shape[0], b_val.shape[1]))
A = theano.tensor.matrix("A", dtype="float32") A = theano.tensor.matrix("A", dtype="float32")
b = theano.tensor.matrix("b", dtype="float32") b = theano.tensor.matrix("b", dtype="float32")
solver = cula.gpu_solve(A, b) solver = cula.gpu_solve(A, b)
fn = theano.function([A, b], [solver]) fn = theano.function([A, b], [solver])
res = fn(A_val, b_val) res = fn(A_val, b_val)
x_res = numpy.array(res[0]) x_res = numpy.array(res[0])
x_res = x_res.reshape((x_res.shape[1], x_res.shape[0])).T
utt.assert_allclose(x_res, x_val) utt.assert_allclose(x_res, x_val)
def test_diag_solve(self): def test_diag_solve(self):
...@@ -52,11 +53,10 @@ class TestCula(unittest.TestCase): ...@@ -52,11 +53,10 @@ class TestCula(unittest.TestCase):
def test_orth_solve(self): def test_orth_solve(self):
A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32") A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
A_orth = numpy.linalg.svd(A_val)[0] A_orth = numpy.linalg.svd(A_val)[0]
#import ipdb; ipdb.set_trace()
x_val = numpy.random.uniform(-0.4, 0.4, (A_orth.shape[1], 1)).astype("float32") x_val = numpy.random.uniform(-0.4, 0.4, (A_orth.shape[1], 1)).astype("float32")
self.run_gpu_solve(A_orth, x_val) self.run_gpu_solve(A_orth, x_val)
def test_uni_rand_solve(self): def test_uni_rand_solve(self):
A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32") A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
x_val = numpy.random.uniform(-0.4, 0.4, (A_val.shape[1], 1)).astype("float32") x_val = numpy.random.uniform(-0.4, 0.4, (A_val.shape[1], 4)).astype("float32")
self.run_gpu_solve(A_val, x_val) self.run_gpu_solve(A_val, x_val)
...@@ -536,12 +536,34 @@ def test_erfinvgpu(): ...@@ -536,12 +536,34 @@ def test_erfinvgpu():
assert numpy.allclose(f(xv), f2(xv)) assert numpy.allclose(f(xv), f2(xv))
def test_local_gpu_solve():
    """
    Check that ``slinalg.solve`` on GPU shared variables compiles to a graph
    using ``GpuSolve`` and that the result satisfies ``dot(a, out) == b``.
    """

    def check(a_shp, b_shp):
        # Random float32 operands, wrapped as GPU shared variables.
        a_val = numpy.random.uniform(-0.4, 0.4, a_shp).astype('float32')
        a_shared = cuda.shared_constructor(a_val, 'a')
        b_val = numpy.random.uniform(-0.4, 0.4, b_shp).astype('float32')
        b_shared = cuda.shared_constructor(b_val, 'b')

        f = pfunc([],
                  tensor.slinalg.solve(a_shared, b_shared),
                  mode=mode_with_gpu)

        # The first input of the second node in the compiled graph must
        # have been produced by the GPU solver.
        second_node = f.maker.fgraph.toposort()[1]
        assert isinstance(second_node.inputs[0].owner.op,
                          cuda.cula.GpuSolve)

        # The local optimizer must also fire on a freshly built Solve node.
        fresh_node = tensor.slinalg.solve(a_shared, b_shared).owner
        assert cuda.opt.local_gpu_solve.transform(fresh_node)

        out = f()
        assert numpy.allclose(numpy.dot(a_val, out), b_val)

    check((6, 6), (6, 1))
    check((5, 5), (5, 3))
def test_local_gpu_dot_to_dot22dot(): def test_local_gpu_dot_to_dot22dot():
def cmp(a_shp, b_shp): def cmp(a_shp, b_shp):
a0 = numpy.random.rand(*a_shp).astype('float32') a0 = numpy.random.rand(*a_shp).astype('float32')
a = cuda.shared_constructor(a0, 'a') a = cuda.shared_constructor(a0, 'a')
b0 = numpy.random.rand(*b_shp).astype('float32') b0 = numpy.random.rand(*b_shp).astype('float32')
b = cuda.shared_constructor(b0, 'a') b = cuda.shared_constructor(b0, 'b')
f = pfunc([], tensor.dot(a, b), mode=mode_with_gpu) f = pfunc([], tensor.dot(a, b), mode=mode_with_gpu)
assert cuda.opt.local_gpu_dot_to_dot22.transform( assert cuda.opt.local_gpu_dot_to_dot22.transform(
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论