Unverified commit 813faf6a authored by abergeron, committed by GitHub

Merge pull request #6653 from wonghang/potrf64_and_Lop

float64 support for GpuCholesky, GpuCusolverSolve, GpuCublasTriangularSolve and their L_op
...@@ -1712,3 +1712,114 @@ KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off, ...@@ -1712,3 +1712,114 @@ KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
def c_code_cache_version(self): def c_code_cache_version(self):
return (10,) return (10,)
class GpuTri(GpuKernelBase, Op):
    """
    Tri (matrix of ones at and below the k-th diagonal, as numpy.tri)
    for GPU.

    Takes three scalar inputs (n, m, k) and produces an (n, m) matrix of
    ``self.dtype`` whose entries at and below diagonal ``k`` are one and
    whose remaining entries are zero.
    """
    __props__ = ('dtype', 'context_name')
    # The CUDA kernel writes through write_w(), which handles float16.
    _f16_ok = True

    def __init__(self, dtype=None, context_name=None):
        # Default to the configured float type, like tensor.tri does.
        if dtype is None:
            dtype = config.floatX
        self.dtype = dtype
        self.context_name = context_name

    def get_params(self, node):
        # The GPU context is the only runtime parameter needed.
        return get_context(self.context_name)

    def make_node(self, n, m, k):
        n = tensor.as_tensor_variable(n)
        m = tensor.as_tensor_variable(m)
        k = tensor.as_tensor_variable(k)
        # All three inputs must be scalars (rows, columns, diagonal offset).
        assert n.ndim == 0
        assert m.ndim == 0
        assert k.ndim == 0
        otype = GpuArrayType(dtype=self.dtype,
                             broadcastable=(False, False),
                             context_name=self.context_name)
        return Apply(self, [n, m, k], [otype()])

    def infer_shape(self, node, in_shapes):
        # The output shape is (n, m): the first two scalar inputs.
        out_shape = [node.inputs[0], node.inputs[1]]
        return [out_shape]

    def grad(self, inp, grads):
        # Output does not depend continuously on the integer inputs.
        return [grad_undefined(self, i, inp[i])
                for i in xrange(3)]

    def gpu_kernels(self, node, name):
        # Threads stride over rows (LID_0/LDIM_0); each thread writes ones
        # into its row up to (and including) the k-th diagonal.  The output
        # buffer is pre-zeroed by c_code, so only ones need writing here.
        code = """#include "cluda.h"

KERNEL void tri(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
                ga_size n, ga_size m, ga_ssize k) {
    a = (GLOBAL_MEM %(ctype)s *)(((GLOBAL_MEM char *)a) + a_off);
    ga_ssize coff = max(k, (ga_ssize) 0);
    ga_ssize roff = -min(k, (ga_ssize) 0);
    for (ga_size i = LID_0; i < min(n - roff,n); i += LDIM_0) {
        for (ga_size j = 0; j <= min(i + coff,m-1); j++) {
            a[(i + roff)*m + j] = %(write_a)s(1);
        }
    }
}""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype),
            name=name, write_a=write_w(self.dtype))
        return [Kernel(
                code=code, name="tri",
                params=[gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE,
                        gpuarray.SIZE, gpuarray.SSIZE],
                flags=Kernel.get_flags(self.dtype),
                objvar='k_tri_' + name)]

    def c_code(self, node, name, inp, out, sub):
        if len(inp) == 2:
            # No k input: the diagonal offset defaults to 0.  Emit a
            # literal C assignment here; the previous code %-formatted the
            # Python int 0 into `PyArray_DATA(%(k)s)`, which produced
            # invalid C (`PyArray_DATA(0)`).
            n, m = inp
            set_k = "k = 0;"
        elif len(inp) == 3:
            n, m, k = inp
            set_k = "k = ((dtype_%(k)s*)PyArray_DATA(%(k)s))[0];" % locals()
        z, = out
        fail = sub['fail']
        ctx = sub['params']
        typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype)
        kname = self.gpu_kernels(node, name)[0].objvar
        s = """
        size_t dims[2] = {0, 0};
        size_t ls, gs;
        ssize_t k;
        int err;

        dims[0] = ((dtype_%(n)s*)PyArray_DATA(%(n)s))[0];
        dims[1] = ((dtype_%(m)s*)PyArray_DATA(%(m)s))[0];
        %(set_k)s

        Py_CLEAR(%(z)s);

        %(z)s = pygpu_zeros(2, dims,
                            %(typecode)s,
                            GA_C_ORDER,
                            %(ctx)s, Py_None);
        if (%(z)s == NULL) {
            %(fail)s
        }

        ls = 1;
        gs = 256;
        err = tri_call(1, &gs, &ls, 0, %(z)s->ga.data, %(z)s->ga.offset,
                       dims[0], dims[1], k);
        if (err != GA_NO_ERROR) {
            PyErr_Format(PyExc_RuntimeError,
                         "gpuarray error: kTri: %%s. n=%%lu, m=%%lu.",
                         GpuKernel_error(&%(kname)s, err),
                         (unsigned long)dims[0], (unsigned long)dims[1]);
            %(fail)s;
        }
        """ % locals()
        return s

    def c_code_cache_version(self):
        # Bumped: fixed the error-message format and the k-default path.
        return (2,)
...@@ -66,6 +66,26 @@ if cusolver_available: ...@@ -66,6 +66,26 @@ if cusolver_available:
ldb, int(devInfo)) ldb, int(devInfo))
cusolver.cusolverCheckStatus(status) cusolver.cusolverCheckStatus(status)
# DPOTRS
# Manual ctypes binding for the float64 Cholesky solve.
# TODO: is cusolverDnDpotrs still missing from skcuda?  If it gets added
# upstream, this binding can be removed.
cusolver._libcusolver.cusolverDnDpotrs.restype = int
cusolver._libcusolver.cusolverDnDpotrs.argtypes = [cusolver.ctypes.c_void_p,
                                                   cusolver.ctypes.c_int,
                                                   cusolver.ctypes.c_int,
                                                   cusolver.ctypes.c_int,
                                                   cusolver.ctypes.c_void_p,
                                                   cusolver.ctypes.c_int,
                                                   cusolver.ctypes.c_void_p,
                                                   cusolver.ctypes.c_int,
                                                   cusolver.ctypes.c_void_p]


def cusolverDnDpotrs(handle, uplo, n, nrhs, A, lda,
                     B, ldb, devInfo):
    """
    Thin wrapper around cuSOLVER's ``cusolverDnDpotrs`` (float64).

    ``A``, ``B`` and ``devInfo`` are device pointers (coerced to int);
    any non-success status is raised by ``cusolverCheckStatus``.
    """
    status = cusolver._libcusolver.cusolverDnDpotrs(handle, uplo, n, nrhs,
                                                    int(A), lda, int(B),
                                                    ldb, int(devInfo))
    cusolver.cusolverCheckStatus(status)
def attach_cusolver_handle_to_context(ctx): def attach_cusolver_handle_to_context(ctx):
handle = getattr(ctx, 'cusolver_handle', None) handle = getattr(ctx, 'cusolver_handle', None)
...@@ -125,15 +145,13 @@ class GpuCusolverSolve(Op): ...@@ -125,15 +145,13 @@ class GpuCusolverSolve(Op):
inp1 = gpu_contiguous(inp1) inp1 = gpu_contiguous(inp1)
inp2 = gpu_contiguous(inp2) inp2 = gpu_contiguous(inp2)
# this op can only operate on float32 matrices
assert inp1.ndim == 2 assert inp1.ndim == 2
assert inp2.ndim == 2 assert inp2.ndim == 2
assert inp1.dtype == 'float32' assert inp1.dtype == inp2.dtype
assert inp2.dtype == 'float32'
return theano.Apply( return theano.Apply(
self, [inp1, inp2], self, [inp1, inp2],
[GpuArrayType('float32', [GpuArrayType(inp1.dtype,
broadcastable=inp1.broadcastable, broadcastable=inp1.broadcastable,
context_name=context_name)()]) context_name=context_name)()])
...@@ -192,12 +210,29 @@ class GpuCusolverSolve(Op): ...@@ -192,12 +210,29 @@ class GpuCusolverSolve(Op):
if A.flags['C_CONTIGUOUS']: if A.flags['C_CONTIGUOUS']:
trans = 1 - trans trans = 1 - trans
if A.dtype == 'float32':
potrf_bufferSize = cusolver.cusolverDnSpotrf_bufferSize
potrf = cusolver.cusolverDnSpotrf
potrs = cusolverDnSpotrs
getrf_bufferSize = cusolver.cusolverDnSgetrf_bufferSize
getrf = cusolver.cusolverDnSgetrf
getrs = cusolver.cusolverDnSgetrs
elif A.dtype == 'float64':
potrf_bufferSize = cusolver.cusolverDnDpotrf_bufferSize
potrf = cusolver.cusolverDnDpotrf
potrs = cusolverDnDpotrs
getrf_bufferSize = cusolver.cusolverDnDgetrf_bufferSize
getrf = cusolver.cusolverDnDgetrf
getrs = cusolver.cusolverDnDgetrs
else:
raise ValueError("Unsupported dtype")
if self.A_structure == 'symmetric': if self.A_structure == 'symmetric':
with context: with context:
workspace_size = cusolver.cusolverDnSpotrf_bufferSize( workspace_size = potrf_bufferSize(
context.cusolver_handle, 0, n, A_ptr, lda) context.cusolver_handle, 0, n, A_ptr, lda)
workspace = pygpu.zeros(workspace_size, dtype='float32', workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
context=context) context=context)
dev_info = pygpu.zeros((1,), dtype='int32', context=context) dev_info = pygpu.zeros((1,), dtype='int32', context=context)
...@@ -206,22 +241,22 @@ class GpuCusolverSolve(Op): ...@@ -206,22 +241,22 @@ class GpuCusolverSolve(Op):
dev_info_ptr = dev_info.gpudata dev_info_ptr = dev_info.gpudata
with context: with context:
cusolver.cusolverDnSpotrf( potrf(
context.cusolver_handle, 0, n, A_ptr, lda, workspace_ptr, context.cusolver_handle, 0, n, A_ptr, lda, workspace_ptr,
workspace_size, dev_info_ptr) workspace_size, dev_info_ptr)
self.check_dev_info(dev_info) self.check_dev_info(dev_info)
cusolverDnSpotrs( potrs(
context.cusolver_handle, 0, n, m, A_ptr, lda, context.cusolver_handle, 0, n, m, A_ptr, lda,
b_ptr, ldb, dev_info_ptr) b_ptr, ldb, dev_info_ptr)
else: else:
# general case for A # general case for A
with context: with context:
workspace_size = cusolver.cusolverDnSgetrf_bufferSize( workspace_size = getrf_bufferSize(
context.cusolver_handle, n, n, A_ptr, lda) context.cusolver_handle, n, n, A_ptr, lda)
workspace = pygpu.zeros(workspace_size, dtype='float32', workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
context=context) context=context)
pivots = pygpu.zeros(n, dtype='int32', context=context) pivots = pygpu.zeros(n, dtype='int32', context=context)
...@@ -233,17 +268,29 @@ class GpuCusolverSolve(Op): ...@@ -233,17 +268,29 @@ class GpuCusolverSolve(Op):
dev_info_ptr = dev_info.gpudata dev_info_ptr = dev_info.gpudata
with context: with context:
cusolver.cusolverDnSgetrf( getrf(
context.cusolver_handle, n, n, A_ptr, lda, workspace_ptr, context.cusolver_handle, n, n, A_ptr, lda, workspace_ptr,
pivots_ptr, dev_info_ptr) pivots_ptr, dev_info_ptr)
self.check_dev_info(dev_info) self.check_dev_info(dev_info)
cusolver.cusolverDnSgetrs( getrs(
context.cusolver_handle, trans, n, m, A_ptr, lda, context.cusolver_handle, trans, n, m, A_ptr, lda,
pivots_ptr, b_ptr, ldb, dev_info_ptr) pivots_ptr, b_ptr, ldb, dev_info_ptr)
z[0] = b z[0] = b
def L_op(self, inputs, outputs, output_gradients):
# Modified from theano/tensor/slinalg.py
A, b = inputs
c = outputs[0]
c_bar = output_gradients[0]
# FIXME: triangular structure would use GpuCublasTriangularsolve?
# no need to handle A_structure like slinalg.py?
trans_solve_op = GpuCusolverSolve('general')
b_bar = trans_solve_op(A.T, c_bar)
A_bar = -tensor.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)
return [A_bar, b_bar]
class GpuCublasTriangularSolve(Op): class GpuCublasTriangularSolve(Op):
""" """
...@@ -266,7 +313,8 @@ class GpuCublasTriangularSolve(Op): ...@@ -266,7 +313,8 @@ class GpuCublasTriangularSolve(Op):
def make_node(self, inp1, inp2): def make_node(self, inp1, inp2):
if not cublas_available: if not cublas_available:
raise RuntimeError('CUBLAS is not available and ' raise RuntimeError('CUBLAS is not available and '
'GpuCublasTriangularSolve Op can not be constructed.') 'GpuCublasTriangularSolve Op '
'can not be constructed.')
context_name = infer_context_name(inp1, inp2) context_name = infer_context_name(inp1, inp2)
inp1 = as_gpuarray_variable(inp1, context_name) inp1 = as_gpuarray_variable(inp1, context_name)
...@@ -275,14 +323,12 @@ class GpuCublasTriangularSolve(Op): ...@@ -275,14 +323,12 @@ class GpuCublasTriangularSolve(Op):
inp1 = gpu_contiguous(inp1) inp1 = gpu_contiguous(inp1)
inp2 = gpu_contiguous(inp2) inp2 = gpu_contiguous(inp2)
# this op can only operate on float32 matrices
assert inp1.ndim == 2 assert inp1.ndim == 2
assert inp2.ndim in [1, 2] assert inp2.ndim in [1, 2]
assert inp1.dtype == 'float32' assert inp1.dtype == inp2.dtype
assert inp2.dtype == 'float32'
return theano.Apply(self, [inp1, inp2], return theano.Apply(self, [inp1, inp2],
[GpuArrayType('float32', [GpuArrayType(inp1.dtype,
broadcastable=inp2.broadcastable, broadcastable=inp2.broadcastable,
context_name=context_name)()]) context_name=context_name)()])
...@@ -347,17 +393,43 @@ class GpuCublasTriangularSolve(Op): ...@@ -347,17 +393,43 @@ class GpuCublasTriangularSolve(Op):
# indicates elements on diagonal of matrix A may not be unity # indicates elements on diagonal of matrix A may not be unity
diag = 'n' diag = 'n'
if A.dtype == 'float32':
trsv = cublas.cublasStrsv
trsm = cublas.cublasStrsm
elif A.dtype == 'float64':
trsv = cublas.cublasDtrsv
trsm = cublas.cublasDtrsm
else:
raise ValueError("Unsupported dtype")
with ctx: with ctx:
if b.ndim == 1: if b.ndim == 1:
# matrix vector solve # matrix vector solve
cublas.cublasStrsv(ctx.cublas_handle, uplo, trans, diag, n, trsv(ctx.cublas_handle, uplo, trans, diag, n,
A_ptr, lda, b_ptr, 1) A_ptr, lda, b_ptr, 1)
else: else:
cublas.cublasStrsm(ctx.cublas_handle, side, uplo, trans, diag, trsm(ctx.cublas_handle, side, uplo, trans, diag,
n, m, alpha, A_ptr, lda, b_ptr, ldb) n, m, alpha, A_ptr, lda, b_ptr, ldb)
x[0] = b x[0] = b
def L_op(self, inputs, outputs, output_gradients):
# Modified from theano/tensor/slinalg.py
A, b = inputs
c = outputs[0]
c_bar = output_gradients[0]
trans_solve_op = GpuCublasTriangularSolve(not self.lower)
b_bar = trans_solve_op(A.T, c_bar)
A_bar = -tensor.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)
if self.lower:
A_bar = tensor.tril(A_bar)
else:
A_bar = tensor.triu(A_bar)
return [A_bar, b_bar]
def gpu_solve(A, b, A_structure='general', trans='N'): def gpu_solve(A, b, A_structure='general', trans='N'):
if A_structure == 'lower': if A_structure == 'lower':
...@@ -368,6 +440,14 @@ def gpu_solve(A, b, A_structure='general', trans='N'): ...@@ -368,6 +440,14 @@ def gpu_solve(A, b, A_structure='general', trans='N'):
return GpuCusolverSolve(A_structure, trans)(A, b) return GpuCusolverSolve(A_structure, trans)(A, b)
def gpu_solve_lower_triangular(A, b, trans='N'):
    """Solve the lower-triangular system ``A x = b`` on the GPU via
    GpuCublasTriangularSolve."""
    return GpuCublasTriangularSolve(True, trans)(A, b)
def gpu_solve_upper_triangular(A, b, trans='N'):
    """Solve the upper-triangular system ``A x = b`` on the GPU via
    GpuCublasTriangularSolve."""
    return GpuCublasTriangularSolve(False, trans)(A, b)
class GpuCholesky(Op): class GpuCholesky(Op):
""" """
CUSOLVER GPU Cholesky Op. CUSOLVER GPU Cholesky Op.
...@@ -401,7 +481,8 @@ class GpuCholesky(Op): ...@@ -401,7 +481,8 @@ class GpuCholesky(Op):
raise RuntimeError('CUSOLVER is not available and ' raise RuntimeError('CUSOLVER is not available and '
'GpuCholesky Op can not be constructed.') 'GpuCholesky Op can not be constructed.')
if skcuda.__version__ <= '0.5.1': if skcuda.__version__ <= '0.5.1':
warnings.warn('The GpuCholesky op requires scikit-cuda > 0.5.1 to work with CUDA 8') warnings.warn('The GpuCholesky op requires scikit-cuda > '
'0.5.1 to work with CUDA 8')
if not pygpu_available: if not pygpu_available:
raise RuntimeError('Missing pygpu or triu/tril functions.' raise RuntimeError('Missing pygpu or triu/tril functions.'
'Install or update libgpuarray.') 'Install or update libgpuarray.')
...@@ -411,11 +492,7 @@ class GpuCholesky(Op): ...@@ -411,11 +492,7 @@ class GpuCholesky(Op):
inp = gpu_contiguous(inp) inp = gpu_contiguous(inp)
# this op can only operate on float32 matrices
# because of current implementation of triu/tril.
# TODO: support float64 for triu/tril in GpuArray and for GpuCholesky/GpuCusolverSolve in Theano.
assert inp.ndim == 2 assert inp.ndim == 2
assert inp.dtype == 'float32'
return theano.Apply(self, [inp], [inp.type()]) return theano.Apply(self, [inp], [inp.type()])
...@@ -453,11 +530,20 @@ class GpuCholesky(Op): ...@@ -453,11 +530,20 @@ class GpuCholesky(Op):
L_ptr = L.gpudata L_ptr = L.gpudata
if A.dtype == 'float32':
potrf_bufferSize = cusolver.cusolverDnSpotrf_bufferSize
potrf = cusolver.cusolverDnSpotrf
elif A.dtype == 'float64':
potrf_bufferSize = cusolver.cusolverDnDpotrf_bufferSize
potrf = cusolver.cusolverDnDpotrf
else:
raise ValueError("Unsupported dtype")
with context: with context:
workspace_size = cusolver.cusolverDnSpotrf_bufferSize( workspace_size = potrf_bufferSize(
context.cusolver_handle, l_parameter, n, L_ptr, lda) context.cusolver_handle, l_parameter, n, L_ptr, lda)
workspace = pygpu.zeros(workspace_size, dtype='float32', workspace = pygpu.zeros(workspace_size, dtype=A.dtype,
context=context) context=context)
dev_info = pygpu.zeros((1,), dtype='int32', context=context) dev_info = pygpu.zeros((1,), dtype='int32', context=context)
...@@ -465,9 +551,8 @@ class GpuCholesky(Op): ...@@ -465,9 +551,8 @@ class GpuCholesky(Op):
workspace_ptr = workspace.gpudata workspace_ptr = workspace.gpudata
dev_info_ptr = dev_info.gpudata dev_info_ptr = dev_info.gpudata
cusolver.cusolverDnSpotrf( potrf(context.cusolver_handle, l_parameter, n, L_ptr,
context.cusolver_handle, l_parameter, n, L_ptr, lda, workspace_ptr, lda, workspace_ptr, workspace_size, dev_info_ptr)
workspace_size, dev_info_ptr)
val_dev_info = np.asarray(dev_info)[0] val_dev_info = np.asarray(dev_info)[0]
if val_dev_info > 0: if val_dev_info > 0:
...@@ -483,6 +568,42 @@ class GpuCholesky(Op): ...@@ -483,6 +568,42 @@ class GpuCholesky(Op):
outputs[0][0] = L outputs[0][0] = L
    def L_op(self, inputs, outputs, gradients):
        """
        Gradient of the Cholesky factorization.

        Modified from theano/tensor/slinalg.py.
        No handling for on_error = 'nan'.
        """
        dz = gradients[0]
        chol_x = outputs[0]

        # this is for nan mode (not implemented here)
        #
        # ok = ~tensor.any(tensor.isnan(chol_x))
        # chol_x = tensor.switch(ok, chol_x, 1)
        # dz = tensor.switch(ok, dz, 1)

        # deal with upper triangular by converting to lower triangular
        if not self.lower:
            chol_x = chol_x.T
            dz = dz.T

        def tril_and_halve_diagonal(mtx):
            """Extracts lower triangle of square matrix and halves diagonal."""
            return tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.)

        def conjugate_solve_triangular(outer, inner):
            """Computes L^{-T} P L^{-1} for lower-triangular L."""
            # Two upper-triangular solves against outer.T implement the
            # two-sided triangular inverse application on the GPU.
            return gpu_solve_upper_triangular(
                outer.T, gpu_solve_upper_triangular(outer.T, inner.T).T)

        s = conjugate_solve_triangular(
            chol_x, tril_and_halve_diagonal(chol_x.T.dot(dz)))

        # Symmetrize, then subtract the doubled diagonal contribution;
        # keep only the triangle matching self.lower.
        if self.lower:
            grad = tensor.tril(s + s.T) - tensor.diag(tensor.diagonal(s))
        else:
            grad = tensor.triu(s + s.T) - tensor.diag(tensor.diagonal(s))
        return [grad]
def gpu_cholesky(A, lower=True): def gpu_cholesky(A, lower=True):
return GpuCholesky(lower)(A) return GpuCholesky(lower)(A)
...@@ -498,7 +619,8 @@ class GpuMagmaBase(COp): ...@@ -498,7 +619,8 @@ class GpuMagmaBase(COp):
'gpuarray_helper.h', 'magma.h'] 'gpuarray_helper.h', 'magma.h']
def c_header_dirs(self): def c_header_dirs(self):
dirs = [gpuarray_helper_inc_dir(), pygpu.get_include(), config.cuda.include_path] dirs = [gpuarray_helper_inc_dir(), pygpu.get_include(),
config.cuda.include_path]
if config.magma.include_path: if config.magma.include_path:
dirs.append(config.magma.include_path) dirs.append(config.magma.include_path)
return dirs return dirs
......
...@@ -52,7 +52,7 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name, ...@@ -52,7 +52,7 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
HostFromGpu, GpuFromHost, HostFromGpu, GpuFromHost,
GpuSplit, GpuContiguous, gpu_contiguous, GpuSplit, GpuContiguous, gpu_contiguous,
GpuAlloc, GpuAllocEmpty, GpuReshape, GpuAlloc, GpuAllocEmpty, GpuReshape,
GpuEye, gpu_join, GpuJoin) GpuEye, GpuTri, gpu_join, GpuJoin)
from .blas import (gpu_dot22, GpuGemm, GpuGer, GpuGemmBatch, from .blas import (gpu_dot22, GpuGemm, GpuGer, GpuGemmBatch,
gpugemm_no_inplace, gpugemm_inplace, gpugemm_no_inplace, gpugemm_inplace,
gpugemmbatch_no_inplace, gpugemmbatch_no_inplace,
...@@ -389,7 +389,8 @@ class GraphToGPU(Optimizer): ...@@ -389,7 +389,8 @@ class GraphToGPU(Optimizer):
if (not move_to_GPU and if (not move_to_GPU and
isinstance(node.op, (theano.tensor.Alloc, isinstance(node.op, (theano.tensor.Alloc,
theano.tensor.AllocEmpty, theano.tensor.AllocEmpty,
theano.tensor.basic.Eye))): theano.tensor.basic.Eye,
theano.tensor.basic.Tri))):
# If the Alloc[Empty] have a client that will be moved # If the Alloc[Empty] have a client that will be moved
# to the GPU, we should move the Alloc* on the GPU. # to the GPU, we should move the Alloc* on the GPU.
...@@ -1412,6 +1413,13 @@ def local_gpua_eye(op, context_name, inputs, outputs): ...@@ -1412,6 +1413,13 @@ def local_gpua_eye(op, context_name, inputs, outputs):
return GpuEye(dtype=op.dtype, context_name=context_name) return GpuEye(dtype=op.dtype, context_name=context_name)
@register_opt('fast_compile')
@op_lifter([tensor.basic.Tri])
@register_opt2([tensor.basic.Tri], 'fast_compile')
def local_gpua_tri(op, context_name, inputs, outputs):
return GpuTri(dtype=op.dtype, context_name=context_name)
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias]) @op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias])
@register_opt2([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], 'fast_compile') @register_opt2([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], 'fast_compile')
...@@ -2583,7 +2591,7 @@ def local_gpua_images2neibs(op, context_name, inputs, outputs): ...@@ -2583,7 +2591,7 @@ def local_gpua_images2neibs(op, context_name, inputs, outputs):
@op_lifter([slinalg.Solve]) @op_lifter([slinalg.Solve])
@register_opt2([theano.tensor.slinalg.Solve], 'fast_compile') @register_opt2([theano.tensor.slinalg.Solve], 'fast_compile')
def local_gpu_solve(op, context_name, inputs, outputs): def local_gpu_solve(op, context_name, inputs, outputs):
if inputs[0].dtype not in ['float16', 'float32']: if inputs[0].dtype not in ['float16', 'float32', 'float64']:
return return
if op.A_structure not in MATRIX_STRUCTURES_SOLVE: if op.A_structure not in MATRIX_STRUCTURES_SOLVE:
return return
...@@ -2609,7 +2617,8 @@ def local_gpu_solve(op, context_name, inputs, outputs): ...@@ -2609,7 +2617,8 @@ def local_gpu_solve(op, context_name, inputs, outputs):
def local_inplace_gpu_solve(node): def local_inplace_gpu_solve(node):
if isinstance(node.op, GpuCusolverSolve) and not node.op.inplace: if isinstance(node.op, GpuCusolverSolve) and not node.op.inplace:
with inherit_stack_trace(node.outputs): with inherit_stack_trace(node.outputs):
return [GpuCusolverSolve(A_structure=node.op.A_structure, trans=node.op.trans, return [GpuCusolverSolve(A_structure=node.op.A_structure,
trans=node.op.trans,
inplace=True)(*node.inputs)] inplace=True)(*node.inputs)]
...@@ -2617,7 +2626,7 @@ def local_inplace_gpu_solve(node): ...@@ -2617,7 +2626,7 @@ def local_inplace_gpu_solve(node):
def local_gpu_cholesky(op, context_name, inputs, outputs): def local_gpu_cholesky(op, context_name, inputs, outputs):
if not cusolver_available: if not cusolver_available:
return return
if inputs[0].dtype not in ['float16', 'float32']: if inputs[0].dtype not in ['float16', 'float32', 'float64']:
return return
op = GpuCholesky(lower=op.lower, inplace=op.destructive) op = GpuCholesky(lower=op.lower, inplace=op.destructive)
if inputs[0].dtype == 'float16': if inputs[0].dtype == 'float16':
......
...@@ -20,7 +20,8 @@ from ..type import (GpuArrayType, get_context, ...@@ -20,7 +20,8 @@ from ..type import (GpuArrayType, get_context,
from ..basic_ops import ( from ..basic_ops import (
host_from_gpu, HostFromGpu, GpuFromHost, GpuReshape, GpuToGpu, host_from_gpu, HostFromGpu, GpuFromHost, GpuReshape, GpuToGpu,
GpuAlloc, GpuAllocEmpty, GpuContiguous, GpuAlloc, GpuAllocEmpty, GpuContiguous,
gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous) gpu_join, GpuJoin, GpuSplit, GpuEye, GpuTri,
gpu_contiguous)
from ..elemwise import GpuDimShuffle, GpuElemwise from ..elemwise import GpuDimShuffle, GpuElemwise
from ..subtensor import GpuSubtensor from ..subtensor import GpuSubtensor
...@@ -497,3 +498,112 @@ def test_Gpujoin_inplace(): ...@@ -497,3 +498,112 @@ def test_Gpujoin_inplace():
if not isinstance(mode_with_gpu, theano.compile.DebugMode): if not isinstance(mode_with_gpu, theano.compile.DebugMode):
assert x.get_value(borrow=True, return_internal_type=True) is f(0) assert x.get_value(borrow=True, return_internal_type=True) is f(0)
assert np.allclose(f(0), [3, 4, 5]) assert np.allclose(f(0), [3, 4, 5])
def test_gpu_tril_triu():
    """Check that T.tril/T.triu are lifted to GpuTri and match numpy."""
    def check_l(m, k=0):
        m_symb = T.matrix(dtype=m.dtype)
        k_symb = T.iscalar()
        f = theano.function([m_symb, k_symb],
                            T.tril(m_symb, k_symb),
                            mode=mode_with_gpu)
        result = f(m, k)
        assert np.allclose(result, np.tril(m, k))
        # Compare against the input's dtype rather than closing over the
        # enclosing loop variable `dtype`: a late-binding closure would
        # silently check the wrong dtype if the generator were exhausted
        # before the checks run.
        assert result.dtype == m.dtype
        assert any(isinstance(node.op, GpuTri)
                   for node in f.maker.fgraph.toposort())

    def check_u(m, k=0):
        m_symb = T.matrix(dtype=m.dtype)
        k_symb = T.iscalar()
        f = theano.function([m_symb, k_symb],
                            T.triu(m_symb, k_symb),
                            mode=mode_with_gpu)
        result = f(m, k)
        assert np.allclose(result, np.triu(m, k))
        # See the dtype note in check_l.
        assert result.dtype == m.dtype
        assert any(isinstance(node.op, GpuTri)
                   for node in f.maker.fgraph.toposort())

    utt.seed_rng()
    test_rng = np.random.RandomState(seed=utt.fetch_seed())

    for dtype in ['float64', 'float32', 'float16']:
        # One big matrix, then small square and rectangular ones.
        for shape in [(5000, 5000), (10, 10), (10, 5)]:
            m = np.asarray(test_rng.rand(*shape) * 2 - 1, dtype=dtype)
            for check in (check_l, check_u):
                for k in (0, 1, -1):
                    yield check, m, k
def test_gputri():
    """Check that T.tri is computed by GpuTri and matches numpy.tri."""
    def check(dtype, N, M_=None, k=0):
        # Theano does not accept None as a tensor, so substitute a real
        # value; DebugMode also rejects None inputs even where allowed.
        M = N if M_ is None else M_
        rows = T.iscalar()
        cols = T.iscalar()
        offset = T.iscalar()
        # Adding/subtracting 1 forces the computation through the graph.
        out = T.tri(rows, cols, offset, dtype=dtype) + np.array(1).astype(dtype)
        f = theano.function([rows, cols, offset],
                            out,
                            mode=mode_with_gpu)
        result = np.asarray(f(N, M, k)) - np.array(1).astype(dtype)
        assert np.allclose(result, np.tri(N, M_, k, dtype=dtype))
        assert result.dtype == np.dtype(dtype)
        assert any(isinstance(node.op, GpuTri)
                   for node in f.maker.fgraph.toposort())

    cases = [
        # a big one
        (1000, 1000, 0),
        (1000, 1000, -400),
        (1000, 1000, 400),
        (5, None, 0),
        # M != N, k = 0
        (3, 5, 0),
        (5, 3, 0),
        # N == M, k != 0
        (3, 3, 1),
        (3, 3, -1),
        # N < M, k != 0
        (3, 5, 1),
        (3, 5, -1),
        # N > M, k != 0
        (5, 3, 1),
        (5, 3, -1),
        # |k| at or beyond the matrix bounds
        (5, 3, 3),
        (3, 5, 3),
        (5, 3, -3),
        (3, 5, -3),
        (5, 3, 6),
        (3, 5, -6),
    ]
    for dtype in ['float64', 'float32', 'int32', 'float16']:
        for N, M_, k in cases:
            yield check, dtype, N, M_, k
...@@ -7,11 +7,14 @@ from numpy.linalg.linalg import LinAlgError ...@@ -7,11 +7,14 @@ from numpy.linalg.linalg import LinAlgError
import theano import theano
from theano import config from theano import config
from theano.gpuarray.linalg import (GpuCholesky, GpuMagmaCholesky, from theano.gpuarray.linalg import (GpuCusolverSolve, GpuCublasTriangularSolve,
GpuCholesky, GpuMagmaCholesky,
GpuMagmaEigh, GpuMagmaMatrixInverse, GpuMagmaEigh, GpuMagmaMatrixInverse,
GpuMagmaQR, GpuMagmaSVD, GpuMagmaQR, GpuMagmaSVD,
cusolver_available, gpu_matrix_inverse, cusolver_available, gpu_matrix_inverse,
gpu_solve, gpu_svd, gpu_qr) gpu_cholesky,
gpu_solve, gpu_solve_lower_triangular,
gpu_svd, gpu_qr)
from theano.tensor.nlinalg import (SVD, MatrixInverse, QRFull, from theano.tensor.nlinalg import (SVD, MatrixInverse, QRFull,
QRIncomplete, eigh, matrix_inverse, qr) QRIncomplete, eigh, matrix_inverse, qr)
from theano.tensor.slinalg import Cholesky, cholesky, imported_scipy from theano.tensor.slinalg import Cholesky, cholesky, imported_scipy
...@@ -20,6 +23,7 @@ from theano.tests import unittest_tools as utt ...@@ -20,6 +23,7 @@ from theano.tests import unittest_tools as utt
from .. import gpuarray_shared_constructor from .. import gpuarray_shared_constructor
from .config import mode_with_gpu, mode_without_gpu from .config import mode_with_gpu, mode_without_gpu
from .test_basic_ops import rand from .test_basic_ops import rand
from nose.tools import assert_raises
class TestCusolver(unittest.TestCase): class TestCusolver(unittest.TestCase):
...@@ -122,6 +126,41 @@ class TestCusolver(unittest.TestCase): ...@@ -122,6 +126,41 @@ class TestCusolver(unittest.TestCase):
fn = theano.function([A, b], [solver], mode=mode_with_gpu) fn = theano.function([A, b], [solver], mode=mode_with_gpu)
self.assertRaises(LinAlgError, fn, A_val, x_val) self.assertRaises(LinAlgError, fn, A_val, x_val)
    def verify_solve_grad(self, m, n, A_structure, lower, rng):
        """
        Run utt.verify_grad on the solve op selected by A_structure.

        m: size of the square system matrix A.
        n: number of right-hand-side columns, or None for a vector b.
        A_structure: 'general', 'lower_triangular' or 'upper_triangular'.
        lower: passed to GpuCublasTriangularSolve for triangular cases.
        """
        # ensure diagonal elements of A relatively large to avoid numerical
        # precision issues
        A_val = (rng.normal(size=(m, m)) * 0.5 +
                 np.eye(m)).astype(config.floatX)
        if A_structure == 'lower_triangular':
            A_val = np.tril(A_val)
        elif A_structure == 'upper_triangular':
            A_val = np.triu(A_val)
        if n is None:
            b_val = rng.normal(size=m).astype(config.floatX)
        else:
            b_val = rng.normal(size=(m, n)).astype(config.floatX)
        eps = None
        if config.floatX == "float64":
            # tighter finite-difference step under double precision
            eps = 2e-8
        if A_structure in ('lower_triangular', 'upper_triangular'):
            solve_op = GpuCublasTriangularSolve(lower=lower)
        else:
            solve_op = GpuCusolverSolve(A_structure="general")
        utt.verify_grad(solve_op, [A_val, b_val], 3, rng, eps=eps)
def test_solve_grad(self):
rng = np.random.RandomState(utt.fetch_seed())
structures = ['general', 'lower_triangular', 'upper_triangular']
for A_structure in structures:
lower = (A_structure == 'lower_triangular')
# self.verify_solve_grad(5, None, A_structure, lower, rng)
self.verify_solve_grad(6, 1, A_structure, lower, rng)
self.verify_solve_grad(4, 3, A_structure, lower, rng)
# lower should have no effect for A_structure == 'general' so also
# check lower=True case
self.verify_solve_grad(4, 3, 'general', lower=True, rng=rng)
class TestGpuCholesky(unittest.TestCase): class TestGpuCholesky(unittest.TestCase):
...@@ -215,6 +254,98 @@ class TestGpuCholesky(unittest.TestCase): ...@@ -215,6 +254,98 @@ class TestGpuCholesky(unittest.TestCase):
self.assertRaises(LinAlgError, fn, A_val) self.assertRaises(LinAlgError, fn, A_val)
class TestGpuCholesky64(unittest.TestCase):
    """float64 variant of the GpuCholesky tests (mirrors TestGpuCholesky)."""

    def setUp(self):
        if not cusolver_available:
            self.skipTest('Optional package scikits.cuda.cusolver not available')
        utt.seed_rng()

    def get_gpu_cholesky_func(self, lower=True, inplace=False):
        # Helper function to compile function from GPU Cholesky op.
        A = theano.tensor.matrix("A", dtype="float64")
        cholesky_op = GpuCholesky(lower=lower, inplace=inplace)
        chol_A = cholesky_op(A)
        return theano.function([A], chol_A, accept_inplace=inplace,
                               mode=mode_with_gpu)

    def compare_gpu_cholesky_to_np(self, A_val, lower=True, inplace=False):
        # Helper function to compare op output to np.cholesky output.
        chol_A_val = np.linalg.cholesky(A_val)
        if not lower:
            # numpy always returns the lower factor; transpose for upper.
            chol_A_val = chol_A_val.T
        fn = self.get_gpu_cholesky_func(lower, inplace)
        res = fn(A_val)
        chol_A_res = np.array(res)
        utt.assert_allclose(chol_A_res, chol_A_val)

    def test_gpu_cholesky_opt(self):
        # The CPU cholesky op should be lifted to GpuCholesky by the
        # optimizer for float64 inputs.
        if not imported_scipy:
            self.skipTest('SciPy is not enabled, skipping test')
        A = theano.tensor.matrix("A", dtype="float64")
        fn = theano.function([A], cholesky(A), mode=mode_with_gpu)
        assert any([isinstance(node.op, GpuCholesky)
                    for node in fn.maker.fgraph.toposort()])

    def test_invalid_input_fail_non_square(self):
        # Invalid Cholesky input test with non-square matrix as input.
        A_val = np.random.normal(size=(3, 2)).astype("float64")
        fn = self.get_gpu_cholesky_func(True, False)
        self.assertRaises(ValueError, fn, A_val)

    def test_invalid_input_fail_vector(self):
        # Invalid Cholesky input test with vector as input.
        def invalid_input_func():
            A = theano.tensor.vector("A", dtype="float64")
            GpuCholesky(lower=True, inplace=False)(A)
        self.assertRaises(AssertionError, invalid_input_func)

    def test_invalid_input_fail_tensor3(self):
        # Invalid Cholesky input test with 3D tensor as input.
        def invalid_input_func():
            A = theano.tensor.tensor3("A", dtype="float64")
            GpuCholesky(lower=True, inplace=False)(A)
        self.assertRaises(AssertionError, invalid_input_func)

    @utt.assertFailure_fast
    def test_diag_chol(self):
        # Diagonal matrix input Cholesky test.
        for lower in [True, False]:
            for inplace in [True, False]:
                # make sure all diagonal elements are positive so
                # positive-definite
                A_val = np.diag(np.random.uniform(size=5).astype("float64") + 1)
                self.compare_gpu_cholesky_to_np(A_val, lower=lower, inplace=inplace)

    @utt.assertFailure_fast
    def test_dense_chol_lower(self):
        # Dense matrix input lower-triangular Cholesky test.
        for lower in [True, False]:
            for inplace in [True, False]:
                M_val = np.random.normal(size=(3, 3)).astype("float64")
                # A = M.dot(M) will be positive definite for all
                # non-singular M
                A_val = M_val.dot(M_val.T)
                self.compare_gpu_cholesky_to_np(A_val, lower=lower, inplace=inplace)

    def test_invalid_input_fail_non_symmetric(self):
        # Invalid Cholesky input test with non-symmetric input.
        # (Non-symmetric real input must also be non-positive definite).
        A_val = None
        while True:
            # Retry until a genuinely non-symmetric sample is drawn.
            A_val = np.random.normal(size=(3, 3)).astype("float64")
            if not np.allclose(A_val, A_val.T):
                break
        fn = self.get_gpu_cholesky_func(True, False)
        self.assertRaises(LinAlgError, fn, A_val)

    def test_invalid_input_fail_negative_definite(self):
        # Invalid Cholesky input test with negative-definite input.
        M_val = np.random.normal(size=(3, 3)).astype("float64")
        # A = -M.dot(M) will be negative definite for all non-singular M
        A_val = -M_val.dot(M_val.T)
        fn = self.get_gpu_cholesky_func(True, False)
        self.assertRaises(LinAlgError, fn, A_val)
class TestMagma(unittest.TestCase): class TestMagma(unittest.TestCase):
def setUp(self): def setUp(self):
...@@ -467,3 +598,61 @@ class TestMagma(unittest.TestCase): ...@@ -467,3 +598,61 @@ class TestMagma(unittest.TestCase):
isinstance(node.op, GpuMagmaEigh) isinstance(node.op, GpuMagmaEigh)
for node in fn.maker.fgraph.toposort() for node in fn.maker.fgraph.toposort()
]) ])
# mostly copied from theano/tensor/tests/test_slinalg.py
def test_cholesky_grad():
    """Gradient checks for GpuCholesky (mostly copied from
    theano/tensor/tests/test_slinalg.py)."""
    rng = np.random.RandomState(utt.fetch_seed())
    r = rng.randn(5, 5).astype(config.floatX)

    # r.dot(r.T) is built inside the graph since Cholesky needs
    # symmetric positive (semi-)definite matrices.
    def grad_check(chol):
        return lambda: utt.verify_grad(lambda r: chol(r.dot(r.T)),
                                       [r], 3, rng)

    # Check the default.
    yield grad_check(gpu_cholesky)
    # Explicit lower-triangular.
    yield grad_check(GpuCholesky(lower=True))
    # Explicit upper-triangular.
    yield grad_check(GpuCholesky(lower=False))
def test_cholesky_grad_indef():
    """The Cholesky gradient on an indefinite matrix must raise
    LinAlgError."""
    x = theano.tensor.matrix()
    matrix = np.array([[1, 0.2], [0.2, -2]]).astype(config.floatX)
    # Named chol_op (not `cholesky`) to avoid shadowing the
    # theano.tensor.slinalg.cholesky imported at module level.
    chol_op = GpuCholesky(lower=True)
    chol_f = theano.function([x], theano.tensor.grad(chol_op(x).sum(), [x]))
    with assert_raises(LinAlgError):
        chol_f(matrix)
    # on_error='nan' is not implemented for GpuCholesky yet:
    # chol_op = GpuCholesky(lower=True, on_error='nan')
    # chol_f = function([x], grad(chol_op(x).sum(), [x]))
    # assert np.all(np.isnan(chol_f(matrix)))
def test_lower_triangular_and_cholesky_grad():
    """Gradient check for a graph mixing gpu_cholesky and
    gpu_solve_lower_triangular.

    Random lower triangular systems are ill-conditioned, so a smaller N
    is used under float32.

    Reference
    ---------
    Viswanath, Divakar, and L. N. Trefethen. "Condition numbers of
    random triangular matrices." SIAM Journal on Matrix Analysis and
    Applications 19.2 (1998): 564-581.
    """
    N = 100 if config.floatX == 'float64' else 5
    rng = np.random.RandomState(utt.fetch_seed())
    r = rng.randn(N, N).astype(config.floatX)
    y = rng.rand(N, 1).astype(config.floatX)

    def f(r, y):
        pos_def = r.dot(r.T)
        chol = gpu_cholesky(pos_def)
        solved = gpu_solve_lower_triangular(chol, y)
        outer = theano.tensor.dot(solved, solved.T)
        shifted = outer + theano.tensor.eye(N)
        chol_shifted = gpu_cholesky(shifted)
        log_diag = theano.tensor.log(theano.tensor.diag(chol_shifted))
        return theano.tensor.sum(log_diag)

    yield (lambda: utt.verify_grad(f, [r, y], 3, rng))
Markdown is supported
0%
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment