提交 615c255c · 作者：wonghang

Add GpuTri so that L_op of GpuCublasTriangularSolve and GpuCholesky would fully run on GPU

上级 4bb986f9
......@@ -1712,3 +1712,113 @@ KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
def c_code_cache_version(self):
    # Version tag for Theano's C-code cache; bump whenever the generated
    # C code or kernel source changes so stale compiled modules are rebuilt.
    return (10,)
class GpuTri(GpuKernelBase, Op):
    """
    Tri for GPU.

    Builds an (n, m) matrix with ones at and below the k-th diagonal and
    zeros elsewhere -- the GPU counterpart of ``numpy.tri`` /
    ``theano.tensor.tri``.
    """
    __props__ = ('dtype', 'context_name')
    _f16_ok = True

    def __init__(self, dtype=None, context_name=None):
        # Default to the configured float type, matching tensor.Tri.
        if dtype is None:
            dtype = config.floatX
        self.dtype = dtype
        self.context_name = context_name

    def get_params(self, node):
        return get_context(self.context_name)

    def make_node(self, n, m, k):
        """n, m: output dimensions; k: diagonal offset. All 0-d scalars."""
        n = tensor.as_tensor_variable(n)
        m = tensor.as_tensor_variable(m)
        k = tensor.as_tensor_variable(k)
        assert n.ndim == 0
        assert m.ndim == 0
        assert k.ndim == 0
        otype = GpuArrayType(dtype=self.dtype,
                             broadcastable=(False, False),
                             context_name=self.context_name)
        return Apply(self, [n, m, k], [otype()])

    def infer_shape(self, node, in_shapes):
        # Output shape is (n, m), i.e. the first two inputs.
        out_shape = [node.inputs[0], node.inputs[1]]
        return [out_shape]

    def grad(self, inp, grads):
        # The output is a constant mask: the gradient w.r.t. the integer
        # shape/offset inputs is undefined.
        return [grad_undefined(self, i, inp[i])
                for i in xrange(3)]

    def gpu_kernels(self, node, name):
        # A single workgroup strides over the rows (i += LDIM_0); each
        # thread writes the leading ones of its rows up to the k-th
        # diagonal.  The output buffer is pre-zeroed by the caller.
        code = """#include "cluda.h"

KERNEL void tri(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
                ga_size n, ga_size m, ga_ssize k) {
    a = (GLOBAL_MEM %(ctype)s *)(((GLOBAL_MEM char *)a) + a_off);
    ga_ssize coff = max(k, (ga_ssize) 0);
    ga_ssize roff = -min(k, (ga_ssize) 0);
    for (ga_size i = LID_0; i < min(n - roff,n); i += LDIM_0) {
        for (ga_size j = 0; j <= min(i + coff,m-1); j++) {
            a[(i + roff)*m + j] = %(write_a)s(1);
        }
    }
}""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype),
            name=name, write_a=write_w(self.dtype))
        return [Kernel(
            code=code, name="tri",
            params=[gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE,
                    gpuarray.SIZE, gpuarray.SSIZE],
            flags=Kernel.get_flags(self.dtype),
            objvar='k_tri_' + name)]

    def c_code(self, node, name, inp, out, sub):
        # make_node always builds the Apply with exactly three inputs, so
        # unpack unconditionally.  (A previous two-input branch substituted
        # the Python int 0 for the k input name, which would have generated
        # invalid C such as ``(dtype_0*)PyArray_DATA(0)`` if ever taken.)
        n, m, k = inp
        z, = out
        fail = sub['fail']
        ctx = sub['params']
        typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype)
        kname = self.gpu_kernels(node, name)[0].objvar
        s = """
        size_t dims[2] = {0, 0};
        size_t ls, gs;
        ssize_t k;
        int err;

        dims[0] = ((dtype_%(n)s*)PyArray_DATA(%(n)s))[0];
        dims[1] = ((dtype_%(m)s*)PyArray_DATA(%(m)s))[0];
        k = ((dtype_%(k)s*)PyArray_DATA(%(k)s))[0];
        Py_CLEAR(%(z)s);

        %(z)s = pygpu_zeros(2, dims,
                            %(typecode)s,
                            GA_C_ORDER,
                            %(ctx)s, Py_None);
        if (%(z)s == NULL) {
            %(fail)s
        }

        ls = 1;
        gs = 256;
        err = tri_call(1, &gs, &ls, 0, %(z)s->ga.data, %(z)s->ga.offset,
                       dims[0], dims[1], k);
        if (err != GA_NO_ERROR) {
            PyErr_Format(PyExc_RuntimeError,
                         "gpuarray error: kTri: %%s. n=%%lu, m=%%lu.",
                         GpuKernel_error(&%(kname)s, err),
                         (unsigned long)dims[0], (unsigned long)dims[1]);
            %(fail)s;
        }
        """ % locals()
        return s

    def c_code_cache_version(self):
        # Bumped from (10,) because the generated C (error message, removed
        # dead branch) changed; stale cached modules must be rebuilt.
        return (11,)
......@@ -420,13 +420,8 @@ class GpuCublasTriangularSolve(Op):
trans_solve_op = GpuCublasTriangularSolve(not self.lower)
b_bar = trans_solve_op(A.T, c_bar)
# FIXME: tensor.outer does not appear to use GPU
def gpu_outer(x,y):
return tensor.dot(x.dimshuffle(0,'x'),y.dimshuffle('x',0))
A_bar = -gpu_outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)
A_bar = -tensor.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)
# FIXME: tensor.tril / tensor.triu has no GPU implementation
if self.lower:
A_bar = tensor.tril(A_bar)
else:
......@@ -584,9 +579,6 @@ class GpuCholesky(Op):
chol_x = chol_x.T
dz = dz.T
# FIXME: tensor.tril / tensor.triu / tensor.diagonal / tensor.diag
# has no GPU implementation
def tril_and_halve_diagonal(mtx):
    """Extracts lower triangle of square matrix and halves diagonal.

    Returns tril(mtx) with the main diagonal scaled by 1/2.
    """
    # NOTE(review): halving the diagonal appears to compensate for the
    # diagonal being counted once (not twice) when symmetrizing the
    # Cholesky gradient -- confirm against the surrounding L_op.
    return tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.)
......
......@@ -52,7 +52,7 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
HostFromGpu, GpuFromHost,
GpuSplit, GpuContiguous, gpu_contiguous,
GpuAlloc, GpuAllocEmpty, GpuReshape,
GpuEye, gpu_join, GpuJoin)
GpuEye, GpuTri, gpu_join, GpuJoin)
from .blas import (gpu_dot22, GpuGemm, GpuGer, GpuGemmBatch,
gpugemm_no_inplace, gpugemm_inplace,
gpugemmbatch_no_inplace,
......@@ -389,7 +389,8 @@ class GraphToGPU(Optimizer):
if (not move_to_GPU and
isinstance(node.op, (theano.tensor.Alloc,
theano.tensor.AllocEmpty,
theano.tensor.basic.Eye))):
theano.tensor.basic.Eye,
theano.tensor.basic.Tri))):
# If the Alloc[Empty] have a client that will be moved
# to the GPU, we should move the Alloc* on the GPU.
......@@ -1411,6 +1412,11 @@ def local_gpua_dot22scalar(op, context_name, inputs, outputs):
def local_gpua_eye(op, context_name, inputs, outputs):
    # Lift a host-side Eye node to its GPU counterpart on the given context,
    # preserving the op's dtype.
    return GpuEye(dtype=op.dtype, context_name=context_name)
@register_opt('fast_compile')
@op_lifter([tensor.basic.Tri])
@register_opt2([tensor.basic.Tri], 'fast_compile')
def local_gpua_tri(op, context_name, inputs, outputs):
    # Lift a host-side Tri node to GpuTri on the given context, preserving
    # the op's dtype.
    return GpuTri(dtype=op.dtype, context_name=context_name)
@register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias])
......
......@@ -20,7 +20,8 @@ from ..type import (GpuArrayType, get_context,
from ..basic_ops import (
host_from_gpu, HostFromGpu, GpuFromHost, GpuReshape, GpuToGpu,
GpuAlloc, GpuAllocEmpty, GpuContiguous,
gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous)
gpu_join, GpuJoin, GpuSplit, GpuEye, GpuTri,
gpu_contiguous)
from ..elemwise import GpuDimShuffle, GpuElemwise
from ..subtensor import GpuSubtensor
......@@ -443,7 +444,6 @@ def test_gpueye():
yield check, dtype, 5, 3, 6
yield check, dtype, 3, 5, -6
def test_hostfromgpu_shape_i():
# Test that the shape is lifted over hostfromgpu
......@@ -497,3 +497,111 @@ def test_Gpujoin_inplace():
if not isinstance(mode_with_gpu, theano.compile.DebugMode):
assert x.get_value(borrow=True, return_internal_type=True) is f(0)
assert np.allclose(f(0), [3, 4, 5])
def test_gpu_tril_triu():
    """T.tril/T.triu should lift to GpuTri on the GPU and agree with numpy."""
    def check_l(m, k=0):
        m_symb = T.matrix(dtype=m.dtype)
        k_symb = T.iscalar()
        f = theano.function([m_symb, k_symb],
                            T.tril(m_symb, k_symb),
                            mode=mode_with_gpu)
        result = f(m, k)
        assert np.allclose(result, np.tril(m, k))
        # Compare against the input's dtype.  The enclosing-loop variable
        # `dtype` is a late-binding closure: by the time the yielded test
        # actually runs, the loop may have advanced to another dtype.
        assert result.dtype == m.dtype
        assert any(isinstance(node.op, GpuTri)
                   for node in f.maker.fgraph.toposort())

    def check_u(m, k=0):
        m_symb = T.matrix(dtype=m.dtype)
        k_symb = T.iscalar()
        f = theano.function([m_symb, k_symb],
                            T.triu(m_symb, k_symb),
                            mode=mode_with_gpu)
        result = f(m, k)
        assert np.allclose(result, np.triu(m, k))
        # See the note in check_l about the late-binding `dtype` closure.
        assert result.dtype == m.dtype
        assert any(isinstance(node.op, GpuTri)
                   for node in f.maker.fgraph.toposort())

    utt.seed_rng()
    test_rng = np.random.RandomState(seed=utt.fetch_seed())

    for dtype in ['float64', 'float32', 'float16']:
        # try a big one
        m = np.asarray(test_rng.rand(5000, 5000) * 2 - 1, dtype=dtype)
        for k in (0, 1, -1):
            yield check_l, m, k
        for k in (0, 1, -1):
            yield check_u, m, k

        # a small square matrix
        m = np.asarray(test_rng.rand(10, 10) * 2 - 1, dtype=dtype)
        for k in (0, 1, -1):
            yield check_l, m, k
        for k in (0, 1, -1):
            yield check_u, m, k

        # a non-square matrix
        m = np.asarray(test_rng.rand(10, 5) * 2 - 1, dtype=dtype)
        for k in (0, 1, -1):
            yield check_l, m, k
        for k in (0, 1, -1):
            yield check_u, m, k
def test_gputri():
    """T.tri should lift to GpuTri on the GPU and agree with numpy.tri."""
    def check(dtype, N, M_=None, k=0):
        # Theano does not accept None as a tensor input, and DebugMode does
        # not support None inputs even where they are otherwise allowed, so
        # substitute a concrete value for M.
        M = N if M_ is None else M_
        N_symb = T.iscalar()
        M_symb = T.iscalar()
        k_symb = T.iscalar()
        # NOTE(review): the +1/-1 round-trip presumably forces extra
        # computation around the Tri node so the lifter is exercised in a
        # non-trivial graph -- confirm intent.
        out = T.tri(N_symb, M_symb, k_symb, dtype=dtype) + np.array(1).astype(dtype)
        f = theano.function([N_symb, M_symb, k_symb],
                            out,
                            mode=mode_with_gpu)
        result = np.asarray(f(N, M, k)) - np.array(1).astype(dtype)
        # (debug print of the full result removed: it dumped up to a
        # 1000x1000 array per test case)
        assert np.allclose(result, np.tri(N, M_, k, dtype=dtype))
        assert result.dtype == np.dtype(dtype)
        assert any(isinstance(node.op, GpuTri)
                   for node in f.maker.fgraph.toposort())

    for dtype in ['float64', 'float32', 'int32', 'float16']:
        # try a big one
        yield check, dtype, 1000, 1000, 0
        yield check, dtype, 1000, 1000, -400
        yield check, dtype, 1000, 1000, 400

        yield check, dtype, 5
        # M != N, k = 0
        yield check, dtype, 3, 5
        yield check, dtype, 5, 3
        # N == M, k != 0
        yield check, dtype, 3, 3, 1
        yield check, dtype, 3, 3, -1
        # N < M, k != 0
        yield check, dtype, 3, 5, 1
        yield check, dtype, 3, 5, -1
        # N > M, k != 0
        yield check, dtype, 5, 3, 1
        yield check, dtype, 5, 3, -1
        # -k > N, k > M, |k| beyond the matrix
        yield check, dtype, 5, 3, 3
        yield check, dtype, 3, 5, 3
        yield check, dtype, 5, 3, -3
        yield check, dtype, 3, 5, -3
        yield check, dtype, 5, 3, 6
        yield check, dtype, 3, 5, -6
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论