Unverified commit 813faf6a authored by abergeron, committed by GitHub

Merge pull request #6653 from wonghang/potrf64_and_Lop

float64 support for GpuCholesky, GpuCusolverSolve, GpuCublasTriangularSolve and their L_op
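For reference, a minimal usage sketch of what this change enables, assuming a configured gpuarray backend (shapes and values are illustrative, not from the patch; gpu_solve and gpu_cholesky are the helpers the tests below import):

import numpy as np
import theano
import theano.tensor as T
from theano.gpuarray.linalg import gpu_cholesky, gpu_solve

# float64 inputs: previously only float16/float32 reached these GPU ops.
A = T.matrix('A', dtype='float64')
b = T.matrix('b', dtype='float64')

x = gpu_solve(A, b)            # GpuCusolverSolve under the hood
L = gpu_cholesky(A)            # GpuCholesky under the hood
g = theano.grad(x.sum(), A)    # exercises the newly added L_op

f = theano.function([A, b], [x, L, g])
M = np.random.randn(4, 4)
A_val = M.dot(M.T) + 4 * np.eye(4)   # symmetric positive definite
b_val = np.random.randn(4, 1)
x_val, L_val, g_val = f(A_val, b_val)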
@@ -1712,3 +1712,114 @@ KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
def c_code_cache_version(self):
return (10,)
class GpuTri(GpuKernelBase, Op):
"""
Tri for GPU.
"""
__props__ = ('dtype', 'context_name')
_f16_ok = True
def __init__(self, dtype=None, context_name=None):
if dtype is None:
dtype = config.floatX
self.dtype = dtype
self.context_name = context_name
def get_params(self, node):
return get_context(self.context_name)
def make_node(self, n, m, k):
n = tensor.as_tensor_variable(n)
m = tensor.as_tensor_variable(m)
k = tensor.as_tensor_variable(k)
assert n.ndim == 0
assert m.ndim == 0
assert k.ndim == 0
otype = GpuArrayType(dtype=self.dtype,
broadcastable=(False, False),
context_name=self.context_name)
return Apply(self, [n, m, k], [otype()])
def infer_shape(self, node, in_shapes):
out_shape = [node.inputs[0], node.inputs[1]]
return [out_shape]
def grad(self, inp, grads):
return [grad_undefined(self, i, inp[i])
for i in xrange(3)]
def gpu_kernels(self, node, name):
code = """#include "cluda.h"
KERNEL void tri(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
ga_size n, ga_size m, ga_ssize k) {
a = (GLOBAL_MEM %(ctype)s *)(((GLOBAL_MEM char *)a) + a_off);
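/* np.tri fill rule: element (i, j) is set iff j <= i + k. coff shifts
   the filled region right for k > 0; roff shifts it down for k < 0. */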
ga_ssize coff = max(k, (ga_ssize) 0);
ga_ssize roff = -min(k, (ga_ssize) 0);
for (ga_size i = LID_0; i < min(n - roff,n); i += LDIM_0) {
for (ga_size j = 0; j <= min(i + coff,m-1); j++) {
a[(i + roff)*m + j] = %(write_a)s(1);
}
}
}""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype),
name=name, write_a=write_w(self.dtype))
return [Kernel(
code=code, name="tri",
params=[gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE,
gpuarray.SIZE, gpuarray.SSIZE],
flags=Kernel.get_flags(self.dtype),
objvar='k_tri_' + name)]
def c_code(self, node, name, inp, out, sub):
if len(inp) == 2:
n, m = inp
k = 0
elif len(inp) == 3:
n, m, k = inp
z, = out
fail = sub['fail']
ctx = sub['params']
typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype)
kname = self.gpu_kernels(node, name)[0].objvar
s = """
size_t dims[2] = {0, 0};
size_t ls, gs;
ssize_t k;
int err;
dims[0] = ((dtype_%(n)s*)PyArray_DATA(%(n)s))[0];
dims[1] = ((dtype_%(m)s*)PyArray_DATA(%(m)s))[0];
k = ((dtype_%(k)s*)PyArray_DATA(%(k)s))[0];
Py_CLEAR(%(z)s);
%(z)s = pygpu_zeros(2, dims,
%(typecode)s,
GA_C_ORDER,
%(ctx)s, Py_None);
if (%(z)s == NULL) {
%(fail)s
}
ls = 1;
gs = 256;
err = tri_call(1, &gs, &ls, 0, %(z)s->ga.data, %(z)s->ga.offset,
dims[0], dims[1], k);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: kTri: %%s. n%%lu, m=%%lu.",
GpuKernel_error(&%(kname)s, err),
(unsigned long)dims[0], (unsigned long)dims[1]);
%(fail)s;
}
""" % locals()
return s
def c_code_cache_version(self):
return (1,)
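For orientation, the kernel above reproduces np.tri semantics: element (i, j) of the n-by-m output is 1 exactly when j <= i + k. A minimal NumPy reference sketch (not part of the diff; tri_reference is a hypothetical name):

import numpy as np

def tri_reference(n, m, k=0):
    # Same fill rule as the GPU kernel: (i, j) is 1 iff j <= i + k.
    out = np.zeros((n, m))
    for i in range(n):
        for j in range(m):
            if j <= i + k:
                out[i, j] = 1.0
    return out

assert np.array_equal(tri_reference(4, 5, 1), np.tri(4, 5, 1))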
Diff collapsed.
@@ -52,7 +52,7 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
HostFromGpu, GpuFromHost,
GpuSplit, GpuContiguous, gpu_contiguous,
GpuAlloc, GpuAllocEmpty, GpuReshape,
GpuEye, gpu_join, GpuJoin)
GpuEye, GpuTri, gpu_join, GpuJoin)
from .blas import (gpu_dot22, GpuGemm, GpuGer, GpuGemmBatch,
gpugemm_no_inplace, gpugemm_inplace,
gpugemmbatch_no_inplace,
@@ -389,7 +389,8 @@ class GraphToGPU(Optimizer):
if (not move_to_GPU and
isinstance(node.op, (theano.tensor.Alloc,
theano.tensor.AllocEmpty,
theano.tensor.basic.Eye))):
theano.tensor.basic.Eye,
theano.tensor.basic.Tri))):
# If the Alloc[Empty] has a client that will be moved
# to the GPU, we should move the Alloc* to the GPU as well.
@@ -1412,6 +1413,13 @@ def local_gpua_eye(op, context_name, inputs, outputs):
return GpuEye(dtype=op.dtype, context_name=context_name)
@register_opt('fast_compile')
@op_lifter([tensor.basic.Tri])
@register_opt2([tensor.basic.Tri], 'fast_compile')
def local_gpua_tri(op, context_name, inputs, outputs):
return GpuTri(dtype=op.dtype, context_name=context_name)
@register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias])
@register_opt2([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], 'fast_compile')
@@ -2583,7 +2591,7 @@ def local_gpua_images2neibs(op, context_name, inputs, outputs):
@op_lifter([slinalg.Solve])
@register_opt2([theano.tensor.slinalg.Solve], 'fast_compile')
def local_gpu_solve(op, context_name, inputs, outputs):
if inputs[0].dtype not in ['float16', 'float32']:
if inputs[0].dtype not in ['float16', 'float32', 'float64']:
return
if op.A_structure not in MATRIX_STRUCTURES_SOLVE:
return
@@ -2609,7 +2617,8 @@ def local_gpu_solve(op, context_name, inputs, outputs):
def local_inplace_gpu_solve(node):
if isinstance(node.op, GpuCusolverSolve) and not node.op.inplace:
with inherit_stack_trace(node.outputs):
return [GpuCusolverSolve(A_structure=node.op.A_structure, trans=node.op.trans,
return [GpuCusolverSolve(A_structure=node.op.A_structure,
trans=node.op.trans,
inplace=True)(*node.inputs)]
@@ -2617,7 +2626,7 @@ def local_inplace_gpu_solve(node):
def local_gpu_cholesky(op, context_name, inputs, outputs):
if not cusolver_available:
return
if inputs[0].dtype not in ['float16', 'float32']:
if inputs[0].dtype not in ['float16', 'float32', 'float64']:
return
op = GpuCholesky(lower=op.lower, inplace=op.destructive)
if inputs[0].dtype == 'float16':
......
@@ -20,7 +20,8 @@ from ..type import (GpuArrayType, get_context,
from ..basic_ops import (
host_from_gpu, HostFromGpu, GpuFromHost, GpuReshape, GpuToGpu,
GpuAlloc, GpuAllocEmpty, GpuContiguous,
gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous)
gpu_join, GpuJoin, GpuSplit, GpuEye, GpuTri,
gpu_contiguous)
from ..elemwise import GpuDimShuffle, GpuElemwise
from ..subtensor import GpuSubtensor
@@ -497,3 +498,112 @@ def test_Gpujoin_inplace():
if not isinstance(mode_with_gpu, theano.compile.DebugMode):
assert x.get_value(borrow=True, return_internal_type=True) is f(0)
assert np.allclose(f(0), [3, 4, 5])
def test_gpu_tril_triu():
def check_l(m, k=0):
m_symb = T.matrix(dtype=m.dtype)
k_symb = T.iscalar()
f = theano.function([m_symb, k_symb],
T.tril(m_symb, k_symb),
mode=mode_with_gpu)
result = f(m, k)
assert np.allclose(result, np.tril(m, k))
assert result.dtype == np.dtype(dtype)
assert any([isinstance(node.op, GpuTri)
for node in f.maker.fgraph.toposort()])
def check_u(m, k=0):
m_symb = T.matrix(dtype=m.dtype)
k_symb = T.iscalar()
f = theano.function([m_symb, k_symb],
T.triu(m_symb, k_symb),
mode=mode_with_gpu)
result = f(m, k)
assert np.allclose(result, np.triu(m, k))
assert result.dtype == np.dtype(dtype)
assert any([isinstance(node.op, GpuTri)
for node in f.maker.fgraph.toposort()])
utt.seed_rng()
test_rng = np.random.RandomState(seed=utt.fetch_seed())
for dtype in ['float64', 'float32', 'float16']:
# try a big one
m = np.asarray(test_rng.rand(5000, 5000) * 2 - 1, dtype=dtype)
yield check_l, m, 0
yield check_l, m, 1
yield check_l, m, -1
yield check_u, m, 0
yield check_u, m, 1
yield check_u, m, -1
m = np.asarray(test_rng.rand(10, 10) * 2 - 1, dtype=dtype)
yield check_l, m, 0
yield check_l, m, 1
yield check_l, m, -1
yield check_u, m, 0
yield check_u, m, 1
yield check_u, m, -1
m = np.asarray(test_rng.rand(10, 5) * 2 - 1, dtype=dtype)
yield check_l, m, 0
yield check_l, m, 1
yield check_l, m, -1
yield check_u, m, 0
yield check_u, m, 1
yield check_u, m, -1
def test_gputri():
def check(dtype, N, M_=None, k=0):
# Theano does not accept None as a tensor, so use a concrete value.
M = M_
# DebugMode currently rejects None inputs even where they would
# otherwise be allowed.
if M is None:
M = N
N_symb = T.iscalar()
M_symb = T.iscalar()
k_symb = T.iscalar()
out = T.tri(N_symb, M_symb, k_symb, dtype=dtype) + np.array(1).astype(dtype)
f = theano.function([N_symb, M_symb, k_symb],
out,
mode=mode_with_gpu)
result = np.asarray(f(N, M, k)) - np.array(1).astype(dtype)
assert np.allclose(result, np.tri(N, M_, k, dtype=dtype))
assert result.dtype == np.dtype(dtype)
assert any([isinstance(node.op, GpuTri)
for node in f.maker.fgraph.toposort()])
for dtype in ['float64', 'float32', 'int32', 'float16']:
# try a big one
yield check, dtype, 1000, 1000, 0
yield check, dtype, 1000, 1000, -400
yield check, dtype, 1000, 1000, 400
yield check, dtype, 5
# M != N, k = 0
yield check, dtype, 3, 5
yield check, dtype, 5, 3
# N == M, k != 0
yield check, dtype, 3, 3, 1
yield check, dtype, 3, 3, -1
# N < M, k != 0
yield check, dtype, 3, 5, 1
yield check, dtype, 3, 5, -1
# N > M, k != 0
yield check, dtype, 5, 3, 1
yield check, dtype, 5, 3, -1
# |k| at or beyond the matrix bounds: k >= M, k >= N, -k >= M, -k >= N, and larger
yield check, dtype, 5, 3, 3
yield check, dtype, 3, 5, 3
yield check, dtype, 5, 3, -3
yield check, dtype, 3, 5, -3
yield check, dtype, 5, 3, 6
yield check, dtype, 3, 5, -6
@@ -7,11 +7,14 @@ from numpy.linalg.linalg import LinAlgError
import theano
from theano import config
from theano.gpuarray.linalg import (GpuCholesky, GpuMagmaCholesky,
from theano.gpuarray.linalg import (GpuCusolverSolve, GpuCublasTriangularSolve,
GpuCholesky, GpuMagmaCholesky,
GpuMagmaEigh, GpuMagmaMatrixInverse,
GpuMagmaQR, GpuMagmaSVD,
cusolver_available, gpu_matrix_inverse,
gpu_solve, gpu_svd, gpu_qr)
gpu_cholesky,
gpu_solve, gpu_solve_lower_triangular,
gpu_svd, gpu_qr)
from theano.tensor.nlinalg import (SVD, MatrixInverse, QRFull,
QRIncomplete, eigh, matrix_inverse, qr)
from theano.tensor.slinalg import Cholesky, cholesky, imported_scipy
@@ -20,6 +23,7 @@ from theano.tests import unittest_tools as utt
from .. import gpuarray_shared_constructor
from .config import mode_with_gpu, mode_without_gpu
from .test_basic_ops import rand
from nose.tools import assert_raises
class TestCusolver(unittest.TestCase):
@@ -122,6 +126,41 @@ class TestCusolver(unittest.TestCase):
fn = theano.function([A, b], [solver], mode=mode_with_gpu)
self.assertRaises(LinAlgError, fn, A_val, x_val)
def verify_solve_grad(self, m, n, A_structure, lower, rng):
# ensure diagonal elements of A are relatively large to avoid numerical
# precision issues
A_val = (rng.normal(size=(m, m)) * 0.5 +
np.eye(m)).astype(config.floatX)
if A_structure == 'lower_triangular':
A_val = np.tril(A_val)
elif A_structure == 'upper_triangular':
A_val = np.triu(A_val)
if n is None:
b_val = rng.normal(size=m).astype(config.floatX)
else:
b_val = rng.normal(size=(m, n)).astype(config.floatX)
eps = None
if config.floatX == "float64":
eps = 2e-8
if A_structure in ('lower_triangular', 'upper_triangular'):
solve_op = GpuCublasTriangularSolve(lower=lower)
else:
solve_op = GpuCusolverSolve(A_structure="general")
utt.verify_grad(solve_op, [A_val, b_val], 3, rng, eps=eps)
def test_solve_grad(self):
rng = np.random.RandomState(utt.fetch_seed())
structures = ['general', 'lower_triangular', 'upper_triangular']
for A_structure in structures:
lower = (A_structure == 'lower_triangular')
# self.verify_solve_grad(5, None, A_structure, lower, rng)
self.verify_solve_grad(6, 1, A_structure, lower, rng)
self.verify_solve_grad(4, 3, A_structure, lower, rng)
# lower should have no effect for A_structure == 'general' so also
# check lower=True case
self.verify_solve_grad(4, 3, 'general', lower=True, rng=rng)
class TestGpuCholesky(unittest.TestCase):
@@ -215,6 +254,98 @@ class TestGpuCholesky(unittest.TestCase):
self.assertRaises(LinAlgError, fn, A_val)
class TestGpuCholesky64(unittest.TestCase):
def setUp(self):
if not cusolver_available:
self.skipTest('Optional package scikits.cuda.cusolver not available')
utt.seed_rng()
def get_gpu_cholesky_func(self, lower=True, inplace=False):
# Helper function to compile function from GPU Cholesky op.
A = theano.tensor.matrix("A", dtype="float64")
cholesky_op = GpuCholesky(lower=lower, inplace=inplace)
chol_A = cholesky_op(A)
return theano.function([A], chol_A, accept_inplace=inplace,
mode=mode_with_gpu)
def compare_gpu_cholesky_to_np(self, A_val, lower=True, inplace=False):
# Helper function to compare op output to np.linalg.cholesky output.
chol_A_val = np.linalg.cholesky(A_val)
if not lower:
chol_A_val = chol_A_val.T
fn = self.get_gpu_cholesky_func(lower, inplace)
res = fn(A_val)
chol_A_res = np.array(res)
utt.assert_allclose(chol_A_res, chol_A_val)
def test_gpu_cholesky_opt(self):
if not imported_scipy:
self.skipTest('SciPy is not enabled, skipping test')
A = theano.tensor.matrix("A", dtype="float64")
fn = theano.function([A], cholesky(A), mode=mode_with_gpu)
assert any([isinstance(node.op, GpuCholesky)
for node in fn.maker.fgraph.toposort()])
def test_invalid_input_fail_non_square(self):
# Invalid Cholesky input test with non-square matrix as input.
A_val = np.random.normal(size=(3, 2)).astype("float64")
fn = self.get_gpu_cholesky_func(True, False)
self.assertRaises(ValueError, fn, A_val)
def test_invalid_input_fail_vector(self):
# Invalid Cholesky input test with vector as input.
def invalid_input_func():
A = theano.tensor.vector("A", dtype="float64")
GpuCholesky(lower=True, inplace=False)(A)
self.assertRaises(AssertionError, invalid_input_func)
def test_invalid_input_fail_tensor3(self):
# Invalid Cholesky input test with 3D tensor as input.
def invalid_input_func():
A = theano.tensor.tensor3("A", dtype="float64")
GpuCholesky(lower=True, inplace=False)(A)
self.assertRaises(AssertionError, invalid_input_func)
@utt.assertFailure_fast
def test_diag_chol(self):
# Diagonal matrix input Cholesky test.
for lower in [True, False]:
for inplace in [True, False]:
# make sure all diagonal elements are positive so the matrix is positive definite
A_val = np.diag(np.random.uniform(size=5).astype("float64") + 1)
self.compare_gpu_cholesky_to_np(A_val, lower=lower, inplace=inplace)
@utt.assertFailure_fast
def test_dense_chol_lower(self):
# Dense matrix input lower-triangular Cholesky test.
for lower in [True, False]:
for inplace in [True, False]:
M_val = np.random.normal(size=(3, 3)).astype("float64")
# A = M.dot(M.T) will be positive definite for all non-singular M
A_val = M_val.dot(M_val.T)
self.compare_gpu_cholesky_to_np(A_val, lower=lower, inplace=inplace)
def test_invalid_input_fail_non_symmetric(self):
# Invalid Cholesky input test with non-symmetric input.
# (A non-symmetric real matrix cannot be symmetric positive definite.)
A_val = None
while True:
A_val = np.random.normal(size=(3, 3)).astype("float64")
if not np.allclose(A_val, A_val.T):
break
fn = self.get_gpu_cholesky_func(True, False)
self.assertRaises(LinAlgError, fn, A_val)
def test_invalid_input_fail_negative_definite(self):
# Invalid Cholesky input test with negative-definite input.
M_val = np.random.normal(size=(3, 3)).astype("float64")
# A = -M.dot(M.T) will be negative definite for all non-singular M
A_val = -M_val.dot(M_val.T)
fn = self.get_gpu_cholesky_func(True, False)
self.assertRaises(LinAlgError, fn, A_val)
class TestMagma(unittest.TestCase):
def setUp(self):
@@ -467,3 +598,61 @@ class TestMagma(unittest.TestCase):
isinstance(node.op, GpuMagmaEigh)
for node in fn.maker.fgraph.toposort()
])
# mostly copied from theano/tensor/tests/test_slinalg.py
def test_cholesky_grad():
rng = np.random.RandomState(utt.fetch_seed())
r = rng.randn(5, 5).astype(config.floatX)
# The dots are inside the graph since Cholesky needs symmetric positive-definite matrices
# Check the default.
yield (lambda: utt.verify_grad(lambda r: gpu_cholesky(r.dot(r.T)),
[r], 3, rng))
# Explicit lower-triangular.
yield (lambda: utt.verify_grad(lambda r: GpuCholesky(lower=True)(r.dot(r.T)),
[r], 3, rng))
# Explicit upper-triangular.
yield (lambda: utt.verify_grad(lambda r: GpuCholesky(lower=False)(r.dot(r.T)),
[r], 3, rng))
def test_cholesky_grad_indef():
x = theano.tensor.matrix()
matrix = np.array([[1, 0.2], [0.2, -2]]).astype(config.floatX)
cholesky = GpuCholesky(lower=True)
chol_f = theano.function([x], theano.tensor.grad(cholesky(x).sum(), [x]))
with assert_raises(LinAlgError):
chol_f(matrix)
# cholesky = GpuCholesky(lower=True, on_error='nan')
# chol_f = function([x], grad(gpu_cholesky(x).sum(), [x]))
# assert np.all(np.isnan(chol_f(matrix)))
def test_lower_triangular_and_cholesky_grad():
# A random lower-triangular system is ill-conditioned.
#
# Reference
# -----------
# Viswanath, Divakar, and L. N. Trefethen. "Condition numbers of random triangular matrices."
# SIAM Journal on Matrix Analysis and Applications 19.2 (1998): 564-581.
#
# Use a smaller N when using float32.
if config.floatX == 'float64':
N = 100
else:
N = 5
rng = np.random.RandomState(utt.fetch_seed())
r = rng.randn(N, N).astype(config.floatX)
y = rng.rand(N, 1).astype(config.floatX)
def f(r, y):
PD = r.dot(r.T)
L = gpu_cholesky(PD)
A = gpu_solve_lower_triangular(L, y)
AAT = theano.tensor.dot(A, A.T)
B = AAT + theano.tensor.eye(N)
LB = gpu_cholesky(B)
return theano.tensor.sum(theano.tensor.log(theano.tensor.diag(LB)))
yield (lambda: utt.verify_grad(f, [r, y], 3, rng))
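A quick numerical illustration of the ill-conditioning cited above (a sketch assuming only NumPy; exact magnitudes depend on the seed):

import numpy as np

rng = np.random.RandomState(0)
for N in [5, 25, 50, 100]:
    L = np.tril(rng.randn(N, N))
    # Viswanath & Trefethen (1998): the condition number of a random
    # triangular matrix grows roughly exponentially with N, which is
    # why the test above caps N at 5 for float32.
    print(N, np.linalg.cond(L))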