Unverified commit 813faf6a authored by abergeron, committed by GitHub

Merge pull request #6653 from wonghang/potrf64_and_Lop

float64 support for GpuCholesky, GpuCusolverSolve, GpuCublasTriangularSolve and their L_op
...@@ -1712,3 +1712,114 @@ KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off, ...@@ -1712,3 +1712,114 @@ KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
def c_code_cache_version(self): def c_code_cache_version(self):
return (10,) return (10,)
class GpuTri(GpuKernelBase, Op):
    """
    Tri (matrix of ones at and below the k-th diagonal, as numpy.tri)
    for GPU.

    Takes three scalar inputs (n, m, k) and produces an (n, m) matrix of
    ``self.dtype`` whose entries at and below diagonal ``k`` are one and
    whose remaining entries are zero.
    """
    __props__ = ('dtype', 'context_name')
    # The CUDA kernel writes through write_w(), which handles float16.
    _f16_ok = True

    def __init__(self, dtype=None, context_name=None):
        # Default to the configured float type, like tensor.tri does.
        if dtype is None:
            dtype = config.floatX
        self.dtype = dtype
        self.context_name = context_name

    def get_params(self, node):
        # The GPU context is the only runtime parameter needed.
        return get_context(self.context_name)

    def make_node(self, n, m, k):
        n = tensor.as_tensor_variable(n)
        m = tensor.as_tensor_variable(m)
        k = tensor.as_tensor_variable(k)
        # All three inputs must be scalars (rows, columns, diagonal offset).
        assert n.ndim == 0
        assert m.ndim == 0
        assert k.ndim == 0
        otype = GpuArrayType(dtype=self.dtype,
                             broadcastable=(False, False),
                             context_name=self.context_name)
        return Apply(self, [n, m, k], [otype()])

    def infer_shape(self, node, in_shapes):
        # The output shape is (n, m): the first two scalar inputs.
        out_shape = [node.inputs[0], node.inputs[1]]
        return [out_shape]

    def grad(self, inp, grads):
        # Output does not depend continuously on the integer inputs.
        return [grad_undefined(self, i, inp[i])
                for i in xrange(3)]

    def gpu_kernels(self, node, name):
        # Threads stride over rows (LID_0/LDIM_0); each thread writes ones
        # into its row up to (and including) the k-th diagonal.  The output
        # buffer is pre-zeroed by c_code, so only ones need writing here.
        code = """#include "cluda.h"

KERNEL void tri(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
                ga_size n, ga_size m, ga_ssize k) {
    a = (GLOBAL_MEM %(ctype)s *)(((GLOBAL_MEM char *)a) + a_off);
    ga_ssize coff = max(k, (ga_ssize) 0);
    ga_ssize roff = -min(k, (ga_ssize) 0);
    for (ga_size i = LID_0; i < min(n - roff,n); i += LDIM_0) {
        for (ga_size j = 0; j <= min(i + coff,m-1); j++) {
            a[(i + roff)*m + j] = %(write_a)s(1);
        }
    }
}""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype),
            name=name, write_a=write_w(self.dtype))
        return [Kernel(
                code=code, name="tri",
                params=[gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE,
                        gpuarray.SIZE, gpuarray.SSIZE],
                flags=Kernel.get_flags(self.dtype),
                objvar='k_tri_' + name)]

    def c_code(self, node, name, inp, out, sub):
        if len(inp) == 2:
            # No k input: the diagonal offset defaults to 0.  Emit a
            # literal C assignment here; the previous code %-formatted the
            # Python int 0 into `PyArray_DATA(%(k)s)`, which produced
            # invalid C (`PyArray_DATA(0)`).
            n, m = inp
            set_k = "k = 0;"
        elif len(inp) == 3:
            n, m, k = inp
            set_k = "k = ((dtype_%(k)s*)PyArray_DATA(%(k)s))[0];" % locals()
        z, = out
        fail = sub['fail']
        ctx = sub['params']
        typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype)
        kname = self.gpu_kernels(node, name)[0].objvar
        s = """
        size_t dims[2] = {0, 0};
        size_t ls, gs;
        ssize_t k;
        int err;

        dims[0] = ((dtype_%(n)s*)PyArray_DATA(%(n)s))[0];
        dims[1] = ((dtype_%(m)s*)PyArray_DATA(%(m)s))[0];
        %(set_k)s

        Py_CLEAR(%(z)s);

        %(z)s = pygpu_zeros(2, dims,
                            %(typecode)s,
                            GA_C_ORDER,
                            %(ctx)s, Py_None);
        if (%(z)s == NULL) {
            %(fail)s
        }

        ls = 1;
        gs = 256;
        err = tri_call(1, &gs, &ls, 0, %(z)s->ga.data, %(z)s->ga.offset,
                       dims[0], dims[1], k);
        if (err != GA_NO_ERROR) {
            PyErr_Format(PyExc_RuntimeError,
                         "gpuarray error: kTri: %%s. n=%%lu, m=%%lu.",
                         GpuKernel_error(&%(kname)s, err),
                         (unsigned long)dims[0], (unsigned long)dims[1]);
            %(fail)s;
        }
        """ % locals()
        return s

    def c_code_cache_version(self):
        # Bumped: fixed the error-message format and the k-default path.
        return (2,)
...@@ -66,6 +66,26 @@ if cusolver_available: ...@@ -66,6 +66,26 @@ if cusolver_available:
ldb, int(devInfo)) ldb, int(devInfo))
cusolver.cusolverCheckStatus(status) cusolver.cusolverCheckStatus(status)
# DPOTRS
# Manual ctypes binding for the float64 Cholesky solve.
# TODO: is cusolverDnDpotrs still missing from skcuda?  If it gets added
# upstream, this binding can be removed.
cusolver._libcusolver.cusolverDnDpotrs.restype = int
cusolver._libcusolver.cusolverDnDpotrs.argtypes = [cusolver.ctypes.c_void_p,
                                                   cusolver.ctypes.c_int,
                                                   cusolver.ctypes.c_int,
                                                   cusolver.ctypes.c_int,
                                                   cusolver.ctypes.c_void_p,
                                                   cusolver.ctypes.c_int,
                                                   cusolver.ctypes.c_void_p,
                                                   cusolver.ctypes.c_int,
                                                   cusolver.ctypes.c_void_p]


def cusolverDnDpotrs(handle, uplo, n, nrhs, A, lda,
                     B, ldb, devInfo):
    """
    Thin wrapper around cuSOLVER's ``cusolverDnDpotrs`` (float64).

    ``A``, ``B`` and ``devInfo`` are device pointers (coerced to int);
    any non-success status is raised by ``cusolverCheckStatus``.
    """
    status = cusolver._libcusolver.cusolverDnDpotrs(handle, uplo, n, nrhs,
                                                    int(A), lda, int(B),
                                                    ldb, int(devInfo))
    cusolver.cusolverCheckStatus(status)
def attach_cusolver_handle_to_context(ctx): def attach_cusolver_handle_to_context(ctx):
handle = getattr(ctx, 'cusolver_handle', None) handle = getattr(ctx, 'cusolver_handle', None)
...@@ -125,15 +145,13 @@ class GpuCusolverSolve(Op): ...@@ -125,15 +145,13 @@ class GpuCusolverSolve(Op):
inp1 = gpu_contiguous(inp1) inp1 = gpu_contiguous(inp1)
inp2 = gpu_contiguous(inp2) inp2 = gpu_contiguous(inp2)
# this op can only operate on float32 matrices
assert inp1.ndim == 2 assert inp1.ndim == 2
assert inp2.ndim == 2 assert inp2.ndim == 2
assert inp1.dtype == 'float32' assert inp1.dtype == inp2.dtype
assert inp2.dtype == 'float32'
return theano.Apply( return theano.Apply(
self, [inp1, inp2], self, [inp1, inp2],
[GpuArrayType('float32', [GpuArrayType(inp1.dtype,
broadcastable=inp1.broadcastable, broadcastable=inp1.broadcastable,
context_name=context_name)()]) context_name=context_name)()])
...@@ -192,12 +210,29 @@ class GpuCusolverSolve(Op): ...@@ -192,12 +210,29 @@ class GpuCusolverSolve(Op):
if A.flags['C_CONTIGUOUS']: if A.flags['C_CONTIGUOUS']:
trans = 1 - trans trans = 1 - trans
if A.dtype == 'float32':
potrf_bufferSize = cusolver.cusolverDnSpotrf_bufferSize
potrf = cusolver.cusolverDnSpotrf
potrs = cusolverDnSpotrs
getrf_bufferSize = cusolver.cusolverDnSgetrf_bufferSize
getrf = cusolver.cusolverDnSgetrf
getrs = cusolver.cusolverDnSgetrs
elif A.dtype == 'float64':
potrf_bufferSize = cusolver.cusolverDnDpotrf_bufferSize
potrf = cusolver.cusolverDnDpotrf
potrs = cusolverDnDpotrs
getrf_bufferSize = cusolver.cusolverDnDgetrf_bufferSize
getrf = cusolver.cusolverDnDgetrf
getrs = cusolver.cusolverDnDgetrs
else:
raise ValueError("Unsupported dtype")
if self.A_structure == 'symmetric': if self.A_structure == 'symmetric':
with context: with context:
workspace_size = cusolver.cusolverDnSpotrf_bufferSize( workspace_size = potrf_bufferSize(
context.cusolver_handle, 0, n, A_ptr, lda) context.cusolver_handle, 0, n, A_ptr, lda)
workspace = pygpu.zeros(workspace_size, dtype='float32', workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
context=context) context=context)
dev_info = pygpu.zeros((1,), dtype='int32', context=context) dev_info = pygpu.zeros((1,), dtype='int32', context=context)
...@@ -206,22 +241,22 @@ class GpuCusolverSolve(Op): ...@@ -206,22 +241,22 @@ class GpuCusolverSolve(Op):
dev_info_ptr = dev_info.gpudata dev_info_ptr = dev_info.gpudata
with context: with context:
cusolver.cusolverDnSpotrf( potrf(
context.cusolver_handle, 0, n, A_ptr, lda, workspace_ptr, context.cusolver_handle, 0, n, A_ptr, lda, workspace_ptr,
workspace_size, dev_info_ptr) workspace_size, dev_info_ptr)
self.check_dev_info(dev_info) self.check_dev_info(dev_info)
cusolverDnSpotrs( potrs(
context.cusolver_handle, 0, n, m, A_ptr, lda, context.cusolver_handle, 0, n, m, A_ptr, lda,
b_ptr, ldb, dev_info_ptr) b_ptr, ldb, dev_info_ptr)
else: else:
# general case for A # general case for A
with context: with context:
workspace_size = cusolver.cusolverDnSgetrf_bufferSize( workspace_size = getrf_bufferSize(
context.cusolver_handle, n, n, A_ptr, lda) context.cusolver_handle, n, n, A_ptr, lda)
workspace = pygpu.zeros(workspace_size, dtype='float32', workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
context=context) context=context)
pivots = pygpu.zeros(n, dtype='int32', context=context) pivots = pygpu.zeros(n, dtype='int32', context=context)
...@@ -233,17 +268,29 @@ class GpuCusolverSolve(Op): ...@@ -233,17 +268,29 @@ class GpuCusolverSolve(Op):
dev_info_ptr = dev_info.gpudata dev_info_ptr = dev_info.gpudata
with context: with context:
cusolver.cusolverDnSgetrf( getrf(
context.cusolver_handle, n, n, A_ptr, lda, workspace_ptr, context.cusolver_handle, n, n, A_ptr, lda, workspace_ptr,
pivots_ptr, dev_info_ptr) pivots_ptr, dev_info_ptr)
self.check_dev_info(dev_info) self.check_dev_info(dev_info)
cusolver.cusolverDnSgetrs( getrs(
context.cusolver_handle, trans, n, m, A_ptr, lda, context.cusolver_handle, trans, n, m, A_ptr, lda,
pivots_ptr, b_ptr, ldb, dev_info_ptr) pivots_ptr, b_ptr, ldb, dev_info_ptr)
z[0] = b z[0] = b
def L_op(self, inputs, outputs, output_gradients):
# Modified from theano/tensor/slinalg.py
A, b = inputs
c = outputs[0]
c_bar = output_gradients[0]
# FIXME: triangular structure would use GpuCublasTriangularsolve?
# no need to handle A_structure like slinalg.py?
trans_solve_op = GpuCusolverSolve('general')
b_bar = trans_solve_op(A.T, c_bar)
A_bar = -tensor.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)
return [A_bar, b_bar]
class GpuCublasTriangularSolve(Op): class GpuCublasTriangularSolve(Op):
""" """
...@@ -266,7 +313,8 @@ class GpuCublasTriangularSolve(Op): ...@@ -266,7 +313,8 @@ class GpuCublasTriangularSolve(Op):
def make_node(self, inp1, inp2): def make_node(self, inp1, inp2):
if not cublas_available: if not cublas_available:
raise RuntimeError('CUBLAS is not available and ' raise RuntimeError('CUBLAS is not available and '
'GpuCublasTriangularSolve Op can not be constructed.') 'GpuCublasTriangularSolve Op '
'can not be constructed.')
context_name = infer_context_name(inp1, inp2) context_name = infer_context_name(inp1, inp2)
inp1 = as_gpuarray_variable(inp1, context_name) inp1 = as_gpuarray_variable(inp1, context_name)
...@@ -275,14 +323,12 @@ class GpuCublasTriangularSolve(Op): ...@@ -275,14 +323,12 @@ class GpuCublasTriangularSolve(Op):
inp1 = gpu_contiguous(inp1) inp1 = gpu_contiguous(inp1)
inp2 = gpu_contiguous(inp2) inp2 = gpu_contiguous(inp2)
# this op can only operate on float32 matrices
assert inp1.ndim == 2 assert inp1.ndim == 2
assert inp2.ndim in [1, 2] assert inp2.ndim in [1, 2]
assert inp1.dtype == 'float32' assert inp1.dtype == inp2.dtype
assert inp2.dtype == 'float32'
return theano.Apply(self, [inp1, inp2], return theano.Apply(self, [inp1, inp2],
[GpuArrayType('float32', [GpuArrayType(inp1.dtype,
broadcastable=inp2.broadcastable, broadcastable=inp2.broadcastable,
context_name=context_name)()]) context_name=context_name)()])
...@@ -347,17 +393,43 @@ class GpuCublasTriangularSolve(Op): ...@@ -347,17 +393,43 @@ class GpuCublasTriangularSolve(Op):
# indicates elements on diagonal of matrix A may not be unity # indicates elements on diagonal of matrix A may not be unity
diag = 'n' diag = 'n'
if A.dtype == 'float32':
trsv = cublas.cublasStrsv
trsm = cublas.cublasStrsm
elif A.dtype == 'float64':
trsv = cublas.cublasDtrsv
trsm = cublas.cublasDtrsm
else:
raise ValueError("Unsupported dtype")
with ctx: with ctx:
if b.ndim == 1: if b.ndim == 1:
# matrix vector solve # matrix vector solve
cublas.cublasStrsv(ctx.cublas_handle, uplo, trans, diag, n, trsv(ctx.cublas_handle, uplo, trans, diag, n,
A_ptr, lda, b_ptr, 1) A_ptr, lda, b_ptr, 1)
else: else:
cublas.cublasStrsm(ctx.cublas_handle, side, uplo, trans, diag, trsm(ctx.cublas_handle, side, uplo, trans, diag,
n, m, alpha, A_ptr, lda, b_ptr, ldb) n, m, alpha, A_ptr, lda, b_ptr, ldb)
x[0] = b x[0] = b
def L_op(self, inputs, outputs, output_gradients):
# Modified from theano/tensor/slinalg.py
A, b = inputs
c = outputs[0]
c_bar = output_gradients[0]
trans_solve_op = GpuCublasTriangularSolve(not self.lower)
b_bar = trans_solve_op(A.T, c_bar)
A_bar = -tensor.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)
if self.lower:
A_bar = tensor.tril(A_bar)
else:
A_bar = tensor.triu(A_bar)
return [A_bar, b_bar]
def gpu_solve(A, b, A_structure='general', trans='N'): def gpu_solve(A, b, A_structure='general', trans='N'):
if A_structure == 'lower': if A_structure == 'lower':
...@@ -368,6 +440,14 @@ def gpu_solve(A, b, A_structure='general', trans='N'): ...@@ -368,6 +440,14 @@ def gpu_solve(A, b, A_structure='general', trans='N'):
return GpuCusolverSolve(A_structure, trans)(A, b) return GpuCusolverSolve(A_structure, trans)(A, b)
def gpu_solve_lower_triangular(A, b, trans='N'):
    """Solve the lower-triangular system ``A x = b`` on the GPU via
    GpuCublasTriangularSolve."""
    return GpuCublasTriangularSolve(True, trans)(A, b)
def gpu_solve_upper_triangular(A, b, trans='N'):
    """Solve the upper-triangular system ``A x = b`` on the GPU via
    GpuCublasTriangularSolve."""
    return GpuCublasTriangularSolve(False, trans)(A, b)
class GpuCholesky(Op): class GpuCholesky(Op):
""" """
CUSOLVER GPU Cholesky Op. CUSOLVER GPU Cholesky Op.
...@@ -401,7 +481,8 @@ class GpuCholesky(Op): ...@@ -401,7 +481,8 @@ class GpuCholesky(Op):
raise RuntimeError('CUSOLVER is not available and ' raise RuntimeError('CUSOLVER is not available and '
'GpuCholesky Op can not be constructed.') 'GpuCholesky Op can not be constructed.')
if skcuda.__version__ <= '0.5.1': if skcuda.__version__ <= '0.5.1':
warnings.warn('The GpuCholesky op requires scikit-cuda > 0.5.1 to work with CUDA 8') warnings.warn('The GpuCholesky op requires scikit-cuda > '
'0.5.1 to work with CUDA 8')
if not pygpu_available: if not pygpu_available:
raise RuntimeError('Missing pygpu or triu/tril functions.' raise RuntimeError('Missing pygpu or triu/tril functions.'
'Install or update libgpuarray.') 'Install or update libgpuarray.')
...@@ -411,11 +492,7 @@ class GpuCholesky(Op): ...@@ -411,11 +492,7 @@ class GpuCholesky(Op):
inp = gpu_contiguous(inp) inp = gpu_contiguous(inp)
# this op can only operate on float32 matrices
# because of current implementation of triu/tril.
# TODO: support float64 for triu/tril in GpuArray and for GpuCholesky/GpuCusolverSolve in Theano.
assert inp.ndim == 2 assert inp.ndim == 2
assert inp.dtype == 'float32'
return theano.Apply(self, [inp], [inp.type()]) return theano.Apply(self, [inp], [inp.type()])
...@@ -453,11 +530,20 @@ class GpuCholesky(Op): ...@@ -453,11 +530,20 @@ class GpuCholesky(Op):
L_ptr = L.gpudata L_ptr = L.gpudata
if A.dtype == 'float32':
potrf_bufferSize = cusolver.cusolverDnSpotrf_bufferSize
potrf = cusolver.cusolverDnSpotrf
elif A.dtype == 'float64':
potrf_bufferSize = cusolver.cusolverDnDpotrf_bufferSize
potrf = cusolver.cusolverDnDpotrf
else:
raise ValueError("Unsupported dtype")
with context: with context:
workspace_size = cusolver.cusolverDnSpotrf_bufferSize( workspace_size = potrf_bufferSize(
context.cusolver_handle, l_parameter, n, L_ptr, lda) context.cusolver_handle, l_parameter, n, L_ptr, lda)
workspace = pygpu.zeros(workspace_size, dtype='float32', workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
context=context) context=context)
dev_info = pygpu.zeros((1,), dtype='int32', context=context) dev_info = pygpu.zeros((1,), dtype='int32', context=context)
...@@ -465,9 +551,8 @@ class GpuCholesky(Op): ...@@ -465,9 +551,8 @@ class GpuCholesky(Op):
workspace_ptr = workspace.gpudata workspace_ptr = workspace.gpudata
dev_info_ptr = dev_info.gpudata dev_info_ptr = dev_info.gpudata
cusolver.cusolverDnSpotrf( potrf(context.cusolver_handle, l_parameter, n, L_ptr,
context.cusolver_handle, l_parameter, n, L_ptr, lda, workspace_ptr, lda, workspace_ptr, workspace_size, dev_info_ptr)
workspace_size, dev_info_ptr)
val_dev_info = np.asarray(dev_info)[0] val_dev_info = np.asarray(dev_info)[0]
if val_dev_info > 0: if val_dev_info > 0:
...@@ -483,6 +568,42 @@ class GpuCholesky(Op): ...@@ -483,6 +568,42 @@ class GpuCholesky(Op):
outputs[0][0] = L outputs[0][0] = L
    def L_op(self, inputs, outputs, gradients):
        """
        Gradient of the Cholesky factorization.

        Modified from theano/tensor/slinalg.py.
        No handling for on_error = 'nan'.
        """
        dz = gradients[0]
        chol_x = outputs[0]

        # this is for nan mode (not implemented here)
        #
        # ok = ~tensor.any(tensor.isnan(chol_x))
        # chol_x = tensor.switch(ok, chol_x, 1)
        # dz = tensor.switch(ok, dz, 1)

        # deal with upper triangular by converting to lower triangular
        if not self.lower:
            chol_x = chol_x.T
            dz = dz.T

        def tril_and_halve_diagonal(mtx):
            """Extracts lower triangle of square matrix and halves diagonal."""
            return tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.)

        def conjugate_solve_triangular(outer, inner):
            """Computes L^{-T} P L^{-1} for lower-triangular L."""
            # Two upper-triangular solves against outer.T implement the
            # two-sided triangular inverse application on the GPU.
            return gpu_solve_upper_triangular(
                outer.T, gpu_solve_upper_triangular(outer.T, inner.T).T)

        s = conjugate_solve_triangular(
            chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz)))

        # Symmetrize, then subtract the doubled diagonal contribution;
        # keep only the triangle matching self.lower.
        if self.lower:
            grad = tensor.tril(s + s.T) - tensor.diag(tensor.diagonal(s))
        else:
            grad = tensor.triu(s + s.T) - tensor.diag(tensor.diagonal(s))
        return [grad]
def gpu_cholesky(A, lower=True): def gpu_cholesky(A, lower=True):
return GpuCholesky(lower)(A) return GpuCholesky(lower)(A)
...@@ -498,7 +619,8 @@ class GpuMagmaBase(COp): ...@@ -498,7 +619,8 @@ class GpuMagmaBase(COp):
'gpuarray_helper.h', 'magma.h'] 'gpuarray_helper.h', 'magma.h']
def c_header_dirs(self): def c_header_dirs(self):
dirs = [gpuarray_helper_inc_dir(), pygpu.get_include(), config.cuda.include_path] dirs = [gpuarray_helper_inc_dir(), pygpu.get_include(),
config.cuda.include_path]
if config.magma.include_path: if config.magma.include_path:
dirs.append(config.magma.include_path) dirs.append(config.magma.include_path)
return dirs return dirs
......
...@@ -52,7 +52,7 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name, ...@@ -52,7 +52,7 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
HostFromGpu, GpuFromHost, HostFromGpu, GpuFromHost,
GpuSplit, GpuContiguous, gpu_contiguous, GpuSplit, GpuContiguous, gpu_contiguous,
GpuAlloc, GpuAllocEmpty, GpuReshape, GpuAlloc, GpuAllocEmpty, GpuReshape,
GpuEye, gpu_join, GpuJoin) GpuEye, GpuTri, gpu_join, GpuJoin)
from .blas import (gpu_dot22, GpuGemm, GpuGer, GpuGemmBatch, from .blas import (gpu_dot22, GpuGemm, GpuGer, GpuGemmBatch,
gpugemm_no_inplace, gpugemm_inplace, gpugemm_no_inplace, gpugemm_inplace,
gpugemmbatch_no_inplace, gpugemmbatch_no_inplace,
...@@ -389,7 +389,8 @@ class GraphToGPU(Optimizer): ...@@ -389,7 +389,8 @@ class GraphToGPU(Optimizer):
if (not move_to_GPU and if (not move_to_GPU and
isinstance(node.op, (theano.tensor.Alloc, isinstance(node.op, (theano.tensor.Alloc,
theano.tensor.AllocEmpty, theano.tensor.AllocEmpty,
theano.tensor.basic.Eye))): theano.tensor.basic.Eye,
theano.tensor.basic.Tri))):
# If the Alloc[Empty] have a client that will be moved # If the Alloc[Empty] have a client that will be moved
# to the GPU, we should move the Alloc* on the GPU. # to the GPU, we should move the Alloc* on the GPU.
...@@ -1412,6 +1413,13 @@ def local_gpua_eye(op, context_name, inputs, outputs): ...@@ -1412,6 +1413,13 @@ def local_gpua_eye(op, context_name, inputs, outputs):
return GpuEye(dtype=op.dtype, context_name=context_name) return GpuEye(dtype=op.dtype, context_name=context_name)
@register_opt('fast_compile')
@op_lifter([tensor.basic.Tri])
@register_opt2([tensor.basic.Tri], 'fast_compile')
def local_gpua_tri(op, context_name, inputs, outputs):
return GpuTri(dtype=op.dtype, context_name=context_name)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias]) @op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias])
@register_opt2([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], 'fast_compile') @register_opt2([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], 'fast_compile')
...@@ -2583,7 +2591,7 @@ def local_gpua_images2neibs(op, context_name, inputs, outputs): ...@@ -2583,7 +2591,7 @@ def local_gpua_images2neibs(op, context_name, inputs, outputs):
@op_lifter([slinalg.Solve]) @op_lifter([slinalg.Solve])
@register_opt2([theano.tensor.slinalg.Solve], 'fast_compile') @register_opt2([theano.tensor.slinalg.Solve], 'fast_compile')
def local_gpu_solve(op, context_name, inputs, outputs): def local_gpu_solve(op, context_name, inputs, outputs):
if inputs[0].dtype not in ['float16', 'float32']: if inputs[0].dtype not in ['float16', 'float32', 'float64']:
return return
if op.A_structure not in MATRIX_STRUCTURES_SOLVE: if op.A_structure not in MATRIX_STRUCTURES_SOLVE:
return return
...@@ -2609,7 +2617,8 @@ def local_gpu_solve(op, context_name, inputs, outputs): ...@@ -2609,7 +2617,8 @@ def local_gpu_solve(op, context_name, inputs, outputs):
def local_inplace_gpu_solve(node): def local_inplace_gpu_solve(node):
if isinstance(node.op, GpuCusolverSolve) and not node.op.inplace: if isinstance(node.op, GpuCusolverSolve) and not node.op.inplace:
with inherit_stack_trace(node.outputs): with inherit_stack_trace(node.outputs):
return [GpuCusolverSolve(A_structure=node.op.A_structure, trans=node.op.trans, return [GpuCusolverSolve(A_structure=node.op.A_structure,
trans=node.op.trans,
inplace=True)(*node.inputs)] inplace=True)(*node.inputs)]
...@@ -2617,7 +2626,7 @@ def local_inplace_gpu_solve(node): ...@@ -2617,7 +2626,7 @@ def local_inplace_gpu_solve(node):
def local_gpu_cholesky(op, context_name, inputs, outputs): def local_gpu_cholesky(op, context_name, inputs, outputs):
if not cusolver_available: if not cusolver_available:
return return
if inputs[0].dtype not in ['float16', 'float32']: if inputs[0].dtype not in ['float16', 'float32', 'float64']:
return return
op = GpuCholesky(lower=op.lower, inplace=op.destructive) op = GpuCholesky(lower=op.lower, inplace=op.destructive)
if inputs[0].dtype == 'float16': if inputs[0].dtype == 'float16':
......
...@@ -20,7 +20,8 @@ from ..type import (GpuArrayType, get_context, ...@@ -20,7 +20,8 @@ from ..type import (GpuArrayType, get_context,
from ..basic_ops import ( from ..basic_ops import (
host_from_gpu, HostFromGpu, GpuFromHost, GpuReshape, GpuToGpu, host_from_gpu, HostFromGpu, GpuFromHost, GpuReshape, GpuToGpu,
GpuAlloc, GpuAllocEmpty, GpuContiguous, GpuAlloc, GpuAllocEmpty, GpuContiguous,
gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous) gpu_join, GpuJoin, GpuSplit, GpuEye, GpuTri,
gpu_contiguous)
from ..elemwise import GpuDimShuffle, GpuElemwise from ..elemwise import GpuDimShuffle, GpuElemwise
from ..subtensor import GpuSubtensor from ..subtensor import GpuSubtensor
...@@ -497,3 +498,112 @@ def test_Gpujoin_inplace(): ...@@ -497,3 +498,112 @@ def test_Gpujoin_inplace():
if not isinstance(mode_with_gpu, theano.compile.DebugMode): if not isinstance(mode_with_gpu, theano.compile.DebugMode):
assert x.get_value(borrow=True, return_internal_type=True) is f(0) assert x.get_value(borrow=True, return_internal_type=True) is f(0)
assert np.allclose(f(0), [3, 4, 5]) assert np.allclose(f(0), [3, 4, 5])
def test_gpu_tril_triu():
    """Check that T.tril/T.triu are lifted to GpuTri and match numpy."""
    def check_l(m, k=0):
        m_symb = T.matrix(dtype=m.dtype)
        k_symb = T.iscalar()
        f = theano.function([m_symb, k_symb],
                            T.tril(m_symb, k_symb),
                            mode=mode_with_gpu)
        result = f(m, k)
        assert np.allclose(result, np.tril(m, k))
        # Compare against the input's dtype rather than closing over the
        # enclosing loop variable `dtype`: a late-binding closure would
        # silently check the wrong dtype if the generator were exhausted
        # before the checks run.
        assert result.dtype == m.dtype
        assert any(isinstance(node.op, GpuTri)
                   for node in f.maker.fgraph.toposort())

    def check_u(m, k=0):
        m_symb = T.matrix(dtype=m.dtype)
        k_symb = T.iscalar()
        f = theano.function([m_symb, k_symb],
                            T.triu(m_symb, k_symb),
                            mode=mode_with_gpu)
        result = f(m, k)
        assert np.allclose(result, np.triu(m, k))
        # See the dtype note in check_l.
        assert result.dtype == m.dtype
        assert any(isinstance(node.op, GpuTri)
                   for node in f.maker.fgraph.toposort())

    utt.seed_rng()
    test_rng = np.random.RandomState(seed=utt.fetch_seed())

    for dtype in ['float64', 'float32', 'float16']:
        # One big matrix, then small square and rectangular ones.
        for shape in [(5000, 5000), (10, 10), (10, 5)]:
            m = np.asarray(test_rng.rand(*shape) * 2 - 1, dtype=dtype)
            for check in (check_l, check_u):
                for k in (0, 1, -1):
                    yield check, m, k
def test_gputri():
    """Check that T.tri is computed by GpuTri and matches numpy.tri."""
    def check(dtype, N, M_=None, k=0):
        # Theano does not accept None as a tensor, so substitute a real
        # value; DebugMode also rejects None inputs even where allowed.
        M = N if M_ is None else M_
        rows = T.iscalar()
        cols = T.iscalar()
        offset = T.iscalar()
        # Adding/subtracting 1 forces the computation through the graph.
        out = T.tri(rows, cols, offset, dtype=dtype) + np.array(1).astype(dtype)
        f = theano.function([rows, cols, offset],
                            out,
                            mode=mode_with_gpu)
        result = np.asarray(f(N, M, k)) - np.array(1).astype(dtype)
        assert np.allclose(result, np.tri(N, M_, k, dtype=dtype))
        assert result.dtype == np.dtype(dtype)
        assert any(isinstance(node.op, GpuTri)
                   for node in f.maker.fgraph.toposort())

    cases = [
        # a big one
        (1000, 1000, 0),
        (1000, 1000, -400),
        (1000, 1000, 400),
        (5, None, 0),
        # M != N, k = 0
        (3, 5, 0),
        (5, 3, 0),
        # N == M, k != 0
        (3, 3, 1),
        (3, 3, -1),
        # N < M, k != 0
        (3, 5, 1),
        (3, 5, -1),
        # N > M, k != 0
        (5, 3, 1),
        (5, 3, -1),
        # |k| at or beyond the matrix bounds
        (5, 3, 3),
        (3, 5, 3),
        (5, 3, -3),
        (3, 5, -3),
        (5, 3, 6),
        (3, 5, -6),
    ]
    for dtype in ['float64', 'float32', 'int32', 'float16']:
        for N, M_, k in cases:
            yield check, dtype, N, M_, k
...@@ -7,11 +7,14 @@ from numpy.linalg.linalg import LinAlgError ...@@ -7,11 +7,14 @@ from numpy.linalg.linalg import LinAlgError
import theano import theano
from theano import config from theano import config
from theano.gpuarray.linalg import (GpuCholesky, GpuMagmaCholesky, from theano.gpuarray.linalg import (GpuCusolverSolve, GpuCublasTriangularSolve,
GpuCholesky, GpuMagmaCholesky,
GpuMagmaEigh, GpuMagmaMatrixInverse, GpuMagmaEigh, GpuMagmaMatrixInverse,
GpuMagmaQR, GpuMagmaSVD, GpuMagmaQR, GpuMagmaSVD,
cusolver_available, gpu_matrix_inverse, cusolver_available, gpu_matrix_inverse,
gpu_solve, gpu_svd, gpu_qr) gpu_cholesky,
gpu_solve, gpu_solve_lower_triangular,
gpu_svd, gpu_qr)
from theano.tensor.nlinalg import (SVD, MatrixInverse, QRFull, from theano.tensor.nlinalg import (SVD, MatrixInverse, QRFull,
QRIncomplete, eigh, matrix_inverse, qr) QRIncomplete, eigh, matrix_inverse, qr)
from theano.tensor.slinalg import Cholesky, cholesky, imported_scipy from theano.tensor.slinalg import Cholesky, cholesky, imported_scipy
...@@ -20,6 +23,7 @@ from theano.tests import unittest_tools as utt ...@@ -20,6 +23,7 @@ from theano.tests import unittest_tools as utt
from .. import gpuarray_shared_constructor from .. import gpuarray_shared_constructor
from .config import mode_with_gpu, mode_without_gpu from .config import mode_with_gpu, mode_without_gpu
from .test_basic_ops import rand from .test_basic_ops import rand
from nose.tools import assert_raises
class TestCusolver(unittest.TestCase): class TestCusolver(unittest.TestCase):
...@@ -122,6 +126,41 @@ class TestCusolver(unittest.TestCase): ...@@ -122,6 +126,41 @@ class TestCusolver(unittest.TestCase):
fn = theano.function([A, b], [solver], mode=mode_with_gpu) fn = theano.function([A, b], [solver], mode=mode_with_gpu)
self.assertRaises(LinAlgError, fn, A_val, x_val) self.assertRaises(LinAlgError, fn, A_val, x_val)
    def verify_solve_grad(self, m, n, A_structure, lower, rng):
        """
        Run utt.verify_grad on the solve op selected by A_structure.

        m: size of the square system matrix A.
        n: number of right-hand-side columns, or None for a vector b.
        A_structure: 'general', 'lower_triangular' or 'upper_triangular'.
        lower: passed to GpuCublasTriangularSolve for triangular cases.
        """
        # ensure diagonal elements of A relatively large to avoid numerical
        # precision issues
        A_val = (rng.normal(size=(m, m)) * 0.5 +
                 np.eye(m)).astype(config.floatX)
        if A_structure == 'lower_triangular':
            A_val = np.tril(A_val)
        elif A_structure == 'upper_triangular':
            A_val = np.triu(A_val)
        if n is None:
            b_val = rng.normal(size=m).astype(config.floatX)
        else:
            b_val = rng.normal(size=(m, n)).astype(config.floatX)
        eps = None
        if config.floatX == "float64":
            # tighter finite-difference step under double precision
            eps = 2e-8
        if A_structure in ('lower_triangular', 'upper_triangular'):
            solve_op = GpuCublasTriangularSolve(lower=lower)
        else:
            solve_op = GpuCusolverSolve(A_structure="general")
        utt.verify_grad(solve_op, [A_val, b_val], 3, rng, eps=eps)
def test_solve_grad(self):
rng = np.random.RandomState(utt.fetch_seed())
structures = ['general', 'lower_triangular', 'upper_triangular']
for A_structure in structures:
lower = (A_structure == 'lower_triangular')
# self.verify_solve_grad(5, None, A_structure, lower, rng)
self.verify_solve_grad(6, 1, A_structure, lower, rng)
self.verify_solve_grad(4, 3, A_structure, lower, rng)
# lower should have no effect for A_structure == 'general' so also
# check lower=True case
self.verify_solve_grad(4, 3, 'general', lower=True, rng=rng)
class TestGpuCholesky(unittest.TestCase): class TestGpuCholesky(unittest.TestCase):
...@@ -215,6 +254,98 @@ class TestGpuCholesky(unittest.TestCase): ...@@ -215,6 +254,98 @@ class TestGpuCholesky(unittest.TestCase):
self.assertRaises(LinAlgError, fn, A_val) self.assertRaises(LinAlgError, fn, A_val)
class TestGpuCholesky64(unittest.TestCase):
    """float64 variant of the GpuCholesky tests (mirrors TestGpuCholesky)."""

    def setUp(self):
        if not cusolver_available:
            self.skipTest('Optional package scikits.cuda.cusolver not available')
        utt.seed_rng()

    def get_gpu_cholesky_func(self, lower=True, inplace=False):
        # Helper function to compile function from GPU Cholesky op.
        A = theano.tensor.matrix("A", dtype="float64")
        cholesky_op = GpuCholesky(lower=lower, inplace=inplace)
        chol_A = cholesky_op(A)
        return theano.function([A], chol_A, accept_inplace=inplace,
                               mode=mode_with_gpu)

    def compare_gpu_cholesky_to_np(self, A_val, lower=True, inplace=False):
        # Helper function to compare op output to np.cholesky output.
        chol_A_val = np.linalg.cholesky(A_val)
        if not lower:
            # numpy always returns the lower factor; transpose for upper.
            chol_A_val = chol_A_val.T
        fn = self.get_gpu_cholesky_func(lower, inplace)
        res = fn(A_val)
        chol_A_res = np.array(res)
        utt.assert_allclose(chol_A_res, chol_A_val)

    def test_gpu_cholesky_opt(self):
        # The CPU cholesky op should be lifted to GpuCholesky by the
        # optimizer for float64 inputs.
        if not imported_scipy:
            self.skipTest('SciPy is not enabled, skipping test')
        A = theano.tensor.matrix("A", dtype="float64")
        fn = theano.function([A], cholesky(A), mode=mode_with_gpu)
        assert any([isinstance(node.op, GpuCholesky)
                    for node in fn.maker.fgraph.toposort()])

    def test_invalid_input_fail_non_square(self):
        # Invalid Cholesky input test with non-square matrix as input.
        A_val = np.random.normal(size=(3, 2)).astype("float64")
        fn = self.get_gpu_cholesky_func(True, False)
        self.assertRaises(ValueError, fn, A_val)

    def test_invalid_input_fail_vector(self):
        # Invalid Cholesky input test with vector as input.
        def invalid_input_func():
            A = theano.tensor.vector("A", dtype="float64")
            GpuCholesky(lower=True, inplace=False)(A)
        self.assertRaises(AssertionError, invalid_input_func)

    def test_invalid_input_fail_tensor3(self):
        # Invalid Cholesky input test with 3D tensor as input.
        def invalid_input_func():
            A = theano.tensor.tensor3("A", dtype="float64")
            GpuCholesky(lower=True, inplace=False)(A)
        self.assertRaises(AssertionError, invalid_input_func)

    @utt.assertFailure_fast
    def test_diag_chol(self):
        # Diagonal matrix input Cholesky test.
        for lower in [True, False]:
            for inplace in [True, False]:
                # make sure all diagonal elements are positive so
                # positive-definite
                A_val = np.diag(np.random.uniform(size=5).astype("float64") + 1)
                self.compare_gpu_cholesky_to_np(A_val, lower=lower, inplace=inplace)

    @utt.assertFailure_fast
    def test_dense_chol_lower(self):
        # Dense matrix input lower-triangular Cholesky test.
        for lower in [True, False]:
            for inplace in [True, False]:
                M_val = np.random.normal(size=(3, 3)).astype("float64")
                # A = M.dot(M) will be positive definite for all
                # non-singular M
                A_val = M_val.dot(M_val.T)
                self.compare_gpu_cholesky_to_np(A_val, lower=lower, inplace=inplace)

    def test_invalid_input_fail_non_symmetric(self):
        # Invalid Cholesky input test with non-symmetric input.
        # (Non-symmetric real input must also be non-positive definite).
        A_val = None
        while True:
            # Retry until a genuinely non-symmetric sample is drawn.
            A_val = np.random.normal(size=(3, 3)).astype("float64")
            if not np.allclose(A_val, A_val.T):
                break
        fn = self.get_gpu_cholesky_func(True, False)
        self.assertRaises(LinAlgError, fn, A_val)

    def test_invalid_input_fail_negative_definite(self):
        # Invalid Cholesky input test with negative-definite input.
        M_val = np.random.normal(size=(3, 3)).astype("float64")
        # A = -M.dot(M) will be negative definite for all non-singular M
        A_val = -M_val.dot(M_val.T)
        fn = self.get_gpu_cholesky_func(True, False)
        self.assertRaises(LinAlgError, fn, A_val)
class TestMagma(unittest.TestCase): class TestMagma(unittest.TestCase):
def setUp(self): def setUp(self):
...@@ -467,3 +598,61 @@ class TestMagma(unittest.TestCase): ...@@ -467,3 +598,61 @@ class TestMagma(unittest.TestCase):
isinstance(node.op, GpuMagmaEigh) isinstance(node.op, GpuMagmaEigh)
for node in fn.maker.fgraph.toposort() for node in fn.maker.fgraph.toposort()
]) ])
# mostly copied from theano/tensor/tests/test_slinalg.py
def test_cholesky_grad():
    """Gradient checks for GpuCholesky (mostly copied from
    theano/tensor/tests/test_slinalg.py)."""
    rng = np.random.RandomState(utt.fetch_seed())
    r = rng.randn(5, 5).astype(config.floatX)

    # r.dot(r.T) is built inside the graph since Cholesky needs
    # symmetric positive (semi-)definite matrices.
    def grad_check(chol):
        return lambda: utt.verify_grad(lambda r: chol(r.dot(r.T)),
                                       [r], 3, rng)

    # Check the default.
    yield grad_check(gpu_cholesky)
    # Explicit lower-triangular.
    yield grad_check(GpuCholesky(lower=True))
    # Explicit upper-triangular.
    yield grad_check(GpuCholesky(lower=False))
def test_cholesky_grad_indef():
    """The Cholesky gradient on an indefinite matrix must raise
    LinAlgError."""
    x = theano.tensor.matrix()
    matrix = np.array([[1, 0.2], [0.2, -2]]).astype(config.floatX)
    # Named chol_op (not `cholesky`) to avoid shadowing the
    # theano.tensor.slinalg.cholesky imported at module level.
    chol_op = GpuCholesky(lower=True)
    chol_f = theano.function([x], theano.tensor.grad(chol_op(x).sum(), [x]))
    with assert_raises(LinAlgError):
        chol_f(matrix)
    # on_error='nan' is not implemented for GpuCholesky yet:
    # chol_op = GpuCholesky(lower=True, on_error='nan')
    # chol_f = function([x], grad(chol_op(x).sum(), [x]))
    # assert np.all(np.isnan(chol_f(matrix)))
def test_lower_triangular_and_cholesky_grad():
    """Gradient check for a graph mixing gpu_cholesky and
    gpu_solve_lower_triangular.

    Random lower triangular systems are ill-conditioned, so a smaller N
    is used under float32.

    Reference
    ---------
    Viswanath, Divakar, and L. N. Trefethen. "Condition numbers of
    random triangular matrices." SIAM Journal on Matrix Analysis and
    Applications 19.2 (1998): 564-581.
    """
    N = 100 if config.floatX == 'float64' else 5
    rng = np.random.RandomState(utt.fetch_seed())
    r = rng.randn(N, N).astype(config.floatX)
    y = rng.rand(N, 1).astype(config.floatX)

    def f(r, y):
        pos_def = r.dot(r.T)
        chol = gpu_cholesky(pos_def)
        solved = gpu_solve_lower_triangular(chol, y)
        outer = theano.tensor.dot(solved, solved.T)
        shifted = outer + theano.tensor.eye(N)
        chol_shifted = gpu_cholesky(shifted)
        log_diag = theano.tensor.log(theano.tensor.diag(chol_shifted))
        return theano.tensor.sum(log_diag)

    yield (lambda: utt.verify_grad(f, [r, y], 3, rng))
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment