Unverified commit 813faf6a authored by abergeron, committed by GitHub

Merge pull request #6653 from wonghang/potrf64_and_Lop

float64 support for GpuCholesky, GpuCusolverSolve, GpuCublasTriangularSolve and their L_op
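For reference, a minimal usage sketch of what this change enables, assuming a configured gpuarray backend (shapes and values are illustrative, not from the patch; gpu_solve and gpu_cholesky are the helpers the tests below import):

import numpy as np
import theano
import theano.tensor as T
from theano.gpuarray.linalg import gpu_cholesky, gpu_solve

# float64 inputs: previously only float16/float32 reached these GPU ops.
A = T.matrix('A', dtype='float64')
b = T.matrix('b', dtype='float64')

x = gpu_solve(A, b)            # GpuCusolverSolve under the hood
L = gpu_cholesky(A)            # GpuCholesky under the hood
g = theano.grad(x.sum(), A)    # exercises the newly added L_op

f = theano.function([A, b], [x, L, g])
M = np.random.randn(4, 4)
A_val = M.dot(M.T) + 4 * np.eye(4)   # symmetric positive definite
b_val = np.random.randn(4, 1)
x_val, L_val, g_val = f(A_val, b_val)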
@@ -1712,3 +1712,114 @@ KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
def c_code_cache_version(self):
return (10,)
class GpuTri(GpuKernelBase, Op):
"""
Tri for GPU.
"""
__props__ = ('dtype', 'context_name')
_f16_ok = True
def __init__(self, dtype=None, context_name=None):
if dtype is None:
dtype = config.floatX
self.dtype = dtype
self.context_name = context_name
def get_params(self, node):
return get_context(self.context_name)
def make_node(self, n, m, k):
n = tensor.as_tensor_variable(n)
m = tensor.as_tensor_variable(m)
k = tensor.as_tensor_variable(k)
assert n.ndim == 0
assert m.ndim == 0
assert k.ndim == 0
otype = GpuArrayType(dtype=self.dtype,
broadcastable=(False, False),
context_name=self.context_name)
return Apply(self, [n, m, k], [otype()])
def infer_shape(self, node, in_shapes):
out_shape = [node.inputs[0], node.inputs[1]]
return [out_shape]
def grad(self, inp, grads):
return [grad_undefined(self, i, inp[i])
for i in xrange(3)]
def gpu_kernels(self, node, name):
code = """#include "cluda.h"
KERNEL void tri(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
ga_size n, ga_size m, ga_ssize k) {
a = (GLOBAL_MEM %(ctype)s *)(((GLOBAL_MEM char *)a) + a_off);
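/* np.tri fill rule: element (i, j) is set iff j <= i + k. coff shifts
   the filled region right for k > 0; roff shifts it down for k < 0. */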
ga_ssize coff = max(k, (ga_ssize) 0);
ga_ssize roff = -min(k, (ga_ssize) 0);
for (ga_size i = LID_0; i < min(n - roff,n); i += LDIM_0) {
for (ga_size j = 0; j <= min(i + coff,m-1); j++) {
a[(i + roff)*m + j] = %(write_a)s(1);
}
}
}""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype),
name=name, write_a=write_w(self.dtype))
return [Kernel(
code=code, name="tri",
params=[gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE,
gpuarray.SIZE, gpuarray.SSIZE],
flags=Kernel.get_flags(self.dtype),
objvar='k_tri_' + name)]
def c_code(self, node, name, inp, out, sub):
if len(inp) == 2:
n, m = inp
k = 0
elif len(inp) == 3:
n, m, k = inp
z, = out
fail = sub['fail']
ctx = sub['params']
typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype)
kname = self.gpu_kernels(node, name)[0].objvar
s = """
size_t dims[2] = {0, 0};
size_t ls, gs;
ssize_t k;
int err;
dims[0] = ((dtype_%(n)s*)PyArray_DATA(%(n)s))[0];
dims[1] = ((dtype_%(m)s*)PyArray_DATA(%(m)s))[0];
k = ((dtype_%(k)s*)PyArray_DATA(%(k)s))[0];
Py_CLEAR(%(z)s);
%(z)s = pygpu_zeros(2, dims,
%(typecode)s,
GA_C_ORDER,
%(ctx)s, Py_None);
if (%(z)s == NULL) {
%(fail)s
}
ls = 1;
gs = 256;
err = tri_call(1, &gs, &ls, 0, %(z)s->ga.data, %(z)s->ga.offset,
dims[0], dims[1], k);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: kTri: %%s. n%%lu, m=%%lu.",
GpuKernel_error(&%(kname)s, err),
(unsigned long)dims[0], (unsigned long)dims[1]);
%(fail)s;
}
""" % locals()
return s
def c_code_cache_version(self):
return (1,)
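For orientation, the kernel above reproduces np.tri semantics: element (i, j) of the n-by-m output is 1 exactly when j <= i + k. A minimal NumPy reference sketch (not part of the diff; tri_reference is a hypothetical name):

import numpy as np

def tri_reference(n, m, k=0):
    # Same fill rule as the GPU kernel: (i, j) is 1 iff j <= i + k.
    out = np.zeros((n, m))
    for i in range(n):
        for j in range(m):
            if j <= i + k:
                out[i, j] = 1.0
    return out

assert np.array_equal(tri_reference(4, 5, 1), np.tri(4, 5, 1))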
Diff collapsed.
@@ -52,7 +52,7 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
HostFromGpu, GpuFromHost,
GpuSplit, GpuContiguous, gpu_contiguous,
GpuAlloc, GpuAllocEmpty, GpuReshape,
GpuEye, gpu_join, GpuJoin)
GpuEye, GpuTri, gpu_join, GpuJoin)
from .blas import (gpu_dot22, GpuGemm, GpuGer, GpuGemmBatch,
gpugemm_no_inplace, gpugemm_inplace,
gpugemmbatch_no_inplace,
@@ -389,7 +389,8 @@ class GraphToGPU(Optimizer):
if (not move_to_GPU and
isinstance(node.op, (theano.tensor.Alloc,
theano.tensor.AllocEmpty,
theano.tensor.basic.Eye))):
theano.tensor.basic.Eye,
theano.tensor.basic.Tri))):
# If the Alloc[Empty] has a client that will be moved
# to the GPU, we should move the Alloc* to the GPU as well.
@@ -1412,6 +1413,13 @@ def local_gpua_eye(op, context_name, inputs, outputs):
return GpuEye(dtype=op.dtype, context_name=context_name)
@register_opt('fast_compile')
@op_lifter([tensor.basic.Tri])
@register_opt2([tensor.basic.Tri], 'fast_compile')
def local_gpua_tri(op, context_name, inputs, outputs):
return GpuTri(dtype=op.dtype, context_name=context_name)
@register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias])
@register_opt2([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias], 'fast_compile')
@@ -2583,7 +2591,7 @@ def local_gpua_images2neibs(op, context_name, inputs, outputs):
@op_lifter([slinalg.Solve])
@register_opt2([theano.tensor.slinalg.Solve], 'fast_compile')
def local_gpu_solve(op, context_name, inputs, outputs):
if inputs[0].dtype not in ['float16', 'float32']:
if inputs[0].dtype not in ['float16', 'float32', 'float64']:
return
if op.A_structure not in MATRIX_STRUCTURES_SOLVE:
return
@@ -2609,7 +2617,8 @@ def local_gpu_solve(op, context_name, inputs, outputs):
def local_inplace_gpu_solve(node):
if isinstance(node.op, GpuCusolverSolve) and not node.op.inplace:
with inherit_stack_trace(node.outputs):
return [GpuCusolverSolve(A_structure=node.op.A_structure, trans=node.op.trans,
return [GpuCusolverSolve(A_structure=node.op.A_structure,
trans=node.op.trans,
inplace=True)(*node.inputs)]
@@ -2617,7 +2626,7 @@ def local_inplace_gpu_solve(node):
def local_gpu_cholesky(op, context_name, inputs, outputs):
if not cusolver_available:
return
if inputs[0].dtype not in ['float16', 'float32']:
if inputs[0].dtype not in ['float16', 'float32', 'float64']:
return
op = GpuCholesky(lower=op.lower, inplace=op.destructive)
if inputs[0].dtype == 'float16':
......
@@ -20,7 +20,8 @@ from ..type import (GpuArrayType, get_context,
from ..basic_ops import (
host_from_gpu, HostFromGpu, GpuFromHost, GpuReshape, GpuToGpu,
GpuAlloc, GpuAllocEmpty, GpuContiguous,
gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous)
gpu_join, GpuJoin, GpuSplit, GpuEye, GpuTri,
gpu_contiguous)
from ..elemwise import GpuDimShuffle, GpuElemwise
from ..subtensor import GpuSubtensor
@@ -497,3 +498,112 @@ def test_Gpujoin_inplace():
if not isinstance(mode_with_gpu, theano.compile.DebugMode):
assert x.get_value(borrow=True, return_internal_type=True) is f(0)
assert np.allclose(f(0), [3, 4, 5])
def test_gpu_tril_triu():
def check_l(m, k=0):
m_symb = T.matrix(dtype=m.dtype)
k_symb = T.iscalar()
f = theano.function([m_symb, k_symb],
T.tril(m_symb, k_symb),
mode=mode_with_gpu)
result = f(m, k)
assert np.allclose(result, np.tril(m, k))
assert result.dtype == np.dtype(dtype)
assert any([isinstance(node.op, GpuTri)
for node in f.maker.fgraph.toposort()])
def check_u(m, k=0):
m_symb = T.matrix(dtype=m.dtype)
k_symb = T.iscalar()
f = theano.function([m_symb, k_symb],
T.triu(m_symb, k_symb),
mode=mode_with_gpu)
result = f(m, k)
assert np.allclose(result, np.triu(m, k))
assert result.dtype == np.dtype(dtype)
assert any([isinstance(node.op, GpuTri)
for node in f.maker.fgraph.toposort()])
utt.seed_rng()
test_rng = np.random.RandomState(seed=utt.fetch_seed())
for dtype in ['float64', 'float32', 'float16']:
# try a big one
m = np.asarray(test_rng.rand(5000, 5000) * 2 - 1, dtype=dtype)
yield check_l, m, 0
yield check_l, m, 1
yield check_l, m, -1
yield check_u, m, 0
yield check_u, m, 1
yield check_u, m, -1
m = np.asarray(test_rng.rand(10, 10) * 2 - 1, dtype=dtype)
yield check_l, m, 0
yield check_l, m, 1
yield check_l, m, -1
yield check_u, m, 0
yield check_u, m, 1
yield check_u, m, -1
m = np.asarray(test_rng.rand(10, 5) * 2 - 1, dtype=dtype)
yield check_l, m, 0
yield check_l, m, 1
yield check_l, m, -1
yield check_u, m, 0
yield check_u, m, 1
yield check_u, m, -1
def test_gputri():
def check(dtype, N, M_=None, k=0):
# Theano does not accept None as a tensor, so use a concrete value.
M = M_
# DebugMode currently rejects None inputs even where they would
# otherwise be allowed.
if M is None:
M = N
N_symb = T.iscalar()
M_symb = T.iscalar()
k_symb = T.iscalar()
out = T.tri(N_symb, M_symb, k_symb, dtype=dtype) + np.array(1).astype(dtype)
f = theano.function([N_symb, M_symb, k_symb],
out,
mode=mode_with_gpu)
result = np.asarray(f(N, M, k)) - np.array(1).astype(dtype)
assert np.allclose(result, np.tri(N, M_, k, dtype=dtype))
assert result.dtype == np.dtype(dtype)
assert any([isinstance(node.op, GpuTri)
for node in f.maker.fgraph.toposort()])
for dtype in ['float64', 'float32', 'int32', 'float16']:
# try a big one
yield check, dtype, 1000, 1000, 0
yield check, dtype, 1000, 1000, -400
yield check, dtype, 1000, 1000, 400
yield check, dtype, 5
# M != N, k = 0
yield check, dtype, 3, 5
yield check, dtype, 5, 3
# N == M, k != 0
yield check, dtype, 3, 3, 1
yield check, dtype, 3, 3, -1
# N < M, k != 0
yield check, dtype, 3, 5, 1
yield check, dtype, 3, 5, -1
# N > M, k != 0
yield check, dtype, 5, 3, 1
yield check, dtype, 5, 3, -1
# |k| at or beyond the matrix bounds: k >= M, k >= N, -k >= M, -k >= N, and larger
yield check, dtype, 5, 3, 3
yield check, dtype, 3, 5, 3
yield check, dtype, 5, 3, -3
yield check, dtype, 3, 5, -3
yield check, dtype, 5, 3, 6
yield check, dtype, 3, 5, -6
@@ -7,11 +7,14 @@ from numpy.linalg.linalg import LinAlgError
import theano
from theano import config
from theano.gpuarray.linalg import (GpuCholesky, GpuMagmaCholesky,
from theano.gpuarray.linalg import (GpuCusolverSolve, GpuCublasTriangularSolve,
GpuCholesky, GpuMagmaCholesky,
GpuMagmaEigh, GpuMagmaMatrixInverse,
GpuMagmaQR, GpuMagmaSVD,
cusolver_available, gpu_matrix_inverse,
gpu_solve, gpu_svd, gpu_qr)
gpu_cholesky,
gpu_solve, gpu_solve_lower_triangular,
gpu_svd, gpu_qr)
from theano.tensor.nlinalg import (SVD, MatrixInverse, QRFull,
QRIncomplete, eigh, matrix_inverse, qr)
from theano.tensor.slinalg import Cholesky, cholesky, imported_scipy
@@ -20,6 +23,7 @@ from theano.tests import unittest_tools as utt
from .. import gpuarray_shared_constructor
from .config import mode_with_gpu, mode_without_gpu
from .test_basic_ops import rand
from nose.tools import assert_raises
class TestCusolver(unittest.TestCase):
@@ -122,6 +126,41 @@ class TestCusolver(unittest.TestCase):
fn = theano.function([A, b], [solver], mode=mode_with_gpu)
self.assertRaises(LinAlgError, fn, A_val, x_val)
def verify_solve_grad(self, m, n, A_structure, lower, rng):
# ensure diagonal elements of A are relatively large to avoid numerical
# precision issues
A_val = (rng.normal(size=(m, m)) * 0.5 +
np.eye(m)).astype(config.floatX)
if A_structure == 'lower_triangular':
A_val = np.tril(A_val)
elif A_structure == 'upper_triangular':
A_val = np.triu(A_val)
if n is None:
b_val = rng.normal(size=m).astype(config.floatX)
else:
b_val = rng.normal(size=(m, n)).astype(config.floatX)
eps = None
if config.floatX == "float64":
eps = 2e-8
if A_structure in ('lower_triangular', 'upper_triangular'):
solve_op = GpuCublasTriangularSolve(lower=lower)
else:
solve_op = GpuCusolverSolve(A_structure="general")
utt.verify_grad(solve_op, [A_val, b_val], 3, rng, eps=eps)
def test_solve_grad(self):
rng = np.random.RandomState(utt.fetch_seed())
structures = ['general', 'lower_triangular', 'upper_triangular']
for A_structure in structures:
lower = (A_structure == 'lower_triangular')
# self.verify_solve_grad(5, None, A_structure, lower, rng)
self.verify_solve_grad(6, 1, A_structure, lower, rng)
self.verify_solve_grad(4, 3, A_structure, lower, rng)
# lower should have no effect for A_structure == 'general' so also
# check lower=True case
self.verify_solve_grad(4, 3, 'general', lower=True, rng=rng)
class TestGpuCholesky(unittest.TestCase):
@@ -215,6 +254,98 @@ class TestGpuCholesky(unittest.TestCase):
self.assertRaises(LinAlgError, fn, A_val)
class TestGpuCholesky64(unittest.TestCase):
def setUp(self):
if not cusolver_available:
self.skipTest('Optional package scikits.cuda.cusolver not available')
utt.seed_rng()
def get_gpu_cholesky_func(self, lower=True, inplace=False):
# Helper function to compile function from GPU Cholesky op.
A = theano.tensor.matrix("A", dtype="float64")
cholesky_op = GpuCholesky(lower=lower, inplace=inplace)
chol_A = cholesky_op(A)
return theano.function([A], chol_A, accept_inplace=inplace,
mode=mode_with_gpu)
def compare_gpu_cholesky_to_np(self, A_val, lower=True, inplace=False):
# Helper function to compare op output to np.linalg.cholesky output.
chol_A_val = np.linalg.cholesky(A_val)
if not lower:
chol_A_val = chol_A_val.T
fn = self.get_gpu_cholesky_func(lower, inplace)
res = fn(A_val)
chol_A_res = np.array(res)
utt.assert_allclose(chol_A_res, chol_A_val)
def test_gpu_cholesky_opt(self):
if not imported_scipy:
self.skipTest('SciPy is not enabled, skipping test')
A = theano.tensor.matrix("A", dtype="float64")
fn = theano.function([A], cholesky(A), mode=mode_with_gpu)
assert any([isinstance(node.op, GpuCholesky)
for node in fn.maker.fgraph.toposort()])
def test_invalid_input_fail_non_square(self):
# Invalid Cholesky input test with non-square matrix as input.
A_val = np.random.normal(size=(3, 2)).astype("float64")
fn = self.get_gpu_cholesky_func(True, False)
self.assertRaises(ValueError, fn, A_val)
def test_invalid_input_fail_vector(self):
# Invalid Cholesky input test with vector as input.
def invalid_input_func():
A = theano.tensor.vector("A", dtype="float64")
GpuCholesky(lower=True, inplace=False)(A)
self.assertRaises(AssertionError, invalid_input_func)
def test_invalid_input_fail_tensor3(self):
# Invalid Cholesky input test with 3D tensor as input.
def invalid_input_func():
A = theano.tensor.tensor3("A", dtype="float64")
GpuCholesky(lower=True, inplace=False)(A)
self.assertRaises(AssertionError, invalid_input_func)
@utt.assertFailure_fast
def test_diag_chol(self):
# Diagonal matrix input Cholesky test.
for lower in [True, False]:
for inplace in [True, False]:
# make sure all diagonal elements are positive so the matrix is positive definite
A_val = np.diag(np.random.uniform(size=5).astype("float64") + 1)
self.compare_gpu_cholesky_to_np(A_val, lower=lower, inplace=inplace)
@utt.assertFailure_fast
def test_dense_chol_lower(self):
# Dense matrix input lower-triangular Cholesky test.
for lower in [True, False]:
for inplace in [True, False]:
M_val = np.random.normal(size=(3, 3)).astype("float64")
# A = M.dot(M.T) will be positive definite for all non-singular M
A_val = M_val.dot(M_val.T)
self.compare_gpu_cholesky_to_np(A_val, lower=lower, inplace=inplace)
def test_invalid_input_fail_non_symmetric(self):
# Invalid Cholesky input test with non-symmetric input.
# (A non-symmetric real matrix cannot be symmetric positive definite.)
A_val = None
while True:
A_val = np.random.normal(size=(3, 3)).astype("float64")
if not np.allclose(A_val, A_val.T):
break
fn = self.get_gpu_cholesky_func(True, False)
self.assertRaises(LinAlgError, fn, A_val)
def test_invalid_input_fail_negative_definite(self):
# Invalid Cholesky input test with negative-definite input.
M_val = np.random.normal(size=(3, 3)).astype("float64")
# A = -M.dot(M.T) will be negative definite for all non-singular M
A_val = -M_val.dot(M_val.T)
fn = self.get_gpu_cholesky_func(True, False)
self.assertRaises(LinAlgError, fn, A_val)
class TestMagma(unittest.TestCase):
def setUp(self):
@@ -467,3 +598,61 @@ class TestMagma(unittest.TestCase):
isinstance(node.op, GpuMagmaEigh)
for node in fn.maker.fgraph.toposort()
])
# mostly copied from theano/tensor/tests/test_slinalg.py
def test_cholesky_grad():
rng = np.random.RandomState(utt.fetch_seed())
r = rng.randn(5, 5).astype(config.floatX)
# The dots are inside the graph since Cholesky needs symmetric positive-definite matrices
# Check the default.
yield (lambda: utt.verify_grad(lambda r: gpu_cholesky(r.dot(r.T)),
[r], 3, rng))
# Explicit lower-triangular.
yield (lambda: utt.verify_grad(lambda r: GpuCholesky(lower=True)(r.dot(r.T)),
[r], 3, rng))
# Explicit upper-triangular.
yield (lambda: utt.verify_grad(lambda r: GpuCholesky(lower=False)(r.dot(r.T)),
[r], 3, rng))
def test_cholesky_grad_indef():
x = theano.tensor.matrix()
matrix = np.array([[1, 0.2], [0.2, -2]]).astype(config.floatX)
cholesky = GpuCholesky(lower=True)
chol_f = theano.function([x], theano.tensor.grad(cholesky(x).sum(), [x]))
with assert_raises(LinAlgError):
chol_f(matrix)
# cholesky = GpuCholesky(lower=True, on_error='nan')
# chol_f = function([x], grad(gpu_cholesky(x).sum(), [x]))
# assert np.all(np.isnan(chol_f(matrix)))
def test_lower_triangular_and_cholesky_grad():
# A random lower-triangular system is ill-conditioned.
#
# Reference
# -----------
# Viswanath, Divakar, and L. N. Trefethen. "Condition numbers of random triangular matrices."
# SIAM Journal on Matrix Analysis and Applications 19.2 (1998): 564-581.
#
# Use a smaller N when using float32.
if config.floatX == 'float64':
N = 100
else:
N = 5
rng = np.random.RandomState(utt.fetch_seed())
r = rng.randn(N, N).astype(config.floatX)
y = rng.rand(N, 1).astype(config.floatX)
def f(r, y):
PD = r.dot(r.T)
L = gpu_cholesky(PD)
A = gpu_solve_lower_triangular(L, y)
AAT = theano.tensor.dot(A, A.T)
B = AAT + theano.tensor.eye(N)
LB = gpu_cholesky(B)
return theano.tensor.sum(theano.tensor.log(theano.tensor.diag(LB)))
yield (lambda: utt.verify_grad(f, [r, y], 3, rng))
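A quick numerical illustration of the ill-conditioning cited above (a sketch assuming only NumPy; exact magnitudes depend on the seed):

import numpy as np

rng = np.random.RandomState(0)
for N in [5, 25, 50, 100]:
    L = np.tril(rng.randn(N, N))
    # Viswanath & Trefethen (1998): the condition number of a random
    # triangular matrix grows roughly exponentially with N, which is
    # why the test above caps N at 5 for float32.
    print(N, np.linalg.cond(L))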