提交 615c255c · 作者：wonghang

Add GpuTri so that L_op of GpuCublasTriangularSolve and GpuCholesky would fully run on GPU

上级 4bb986f9
......@@ -1712,3 +1712,113 @@ KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
def c_code_cache_version(self):
    # Version tag for Theano's C-code cache; bump whenever the generated
    # C code or kernel source changes so stale compiled modules are rebuilt.
    return (10,)
class GpuTri(GpuKernelBase, Op):
    """
    Tri for GPU.

    Builds an (n, m) matrix with ones at and below the k-th diagonal and
    zeros elsewhere -- the GPU counterpart of ``numpy.tri`` /
    ``theano.tensor.tri``.
    """
    __props__ = ('dtype', 'context_name')
    _f16_ok = True

    def __init__(self, dtype=None, context_name=None):
        # Default to the configured float type, matching tensor.Tri.
        if dtype is None:
            dtype = config.floatX
        self.dtype = dtype
        self.context_name = context_name

    def get_params(self, node):
        return get_context(self.context_name)

    def make_node(self, n, m, k):
        """n, m: output dimensions; k: diagonal offset. All 0-d scalars."""
        n = tensor.as_tensor_variable(n)
        m = tensor.as_tensor_variable(m)
        k = tensor.as_tensor_variable(k)
        assert n.ndim == 0
        assert m.ndim == 0
        assert k.ndim == 0
        otype = GpuArrayType(dtype=self.dtype,
                             broadcastable=(False, False),
                             context_name=self.context_name)
        return Apply(self, [n, m, k], [otype()])

    def infer_shape(self, node, in_shapes):
        # Output shape is (n, m), i.e. the first two inputs.
        out_shape = [node.inputs[0], node.inputs[1]]
        return [out_shape]

    def grad(self, inp, grads):
        # The output is a constant mask: the gradient w.r.t. the integer
        # shape/offset inputs is undefined.
        return [grad_undefined(self, i, inp[i])
                for i in xrange(3)]

    def gpu_kernels(self, node, name):
        # A single workgroup strides over the rows (i += LDIM_0); each
        # thread writes the leading ones of its rows up to the k-th
        # diagonal.  The output buffer is pre-zeroed by the caller.
        code = """#include "cluda.h"

KERNEL void tri(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
                ga_size n, ga_size m, ga_ssize k) {
    a = (GLOBAL_MEM %(ctype)s *)(((GLOBAL_MEM char *)a) + a_off);
    ga_ssize coff = max(k, (ga_ssize) 0);
    ga_ssize roff = -min(k, (ga_ssize) 0);
    for (ga_size i = LID_0; i < min(n - roff,n); i += LDIM_0) {
        for (ga_size j = 0; j <= min(i + coff,m-1); j++) {
            a[(i + roff)*m + j] = %(write_a)s(1);
        }
    }
}""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype),
            name=name, write_a=write_w(self.dtype))
        return [Kernel(
            code=code, name="tri",
            params=[gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE,
                    gpuarray.SIZE, gpuarray.SSIZE],
            flags=Kernel.get_flags(self.dtype),
            objvar='k_tri_' + name)]

    def c_code(self, node, name, inp, out, sub):
        # make_node always builds the Apply with exactly three inputs, so
        # unpack unconditionally.  (A previous two-input branch substituted
        # the Python int 0 for the k input name, which would have generated
        # invalid C such as ``(dtype_0*)PyArray_DATA(0)`` if ever taken.)
        n, m, k = inp
        z, = out
        fail = sub['fail']
        ctx = sub['params']
        typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype)
        kname = self.gpu_kernels(node, name)[0].objvar
        s = """
        size_t dims[2] = {0, 0};
        size_t ls, gs;
        ssize_t k;
        int err;

        dims[0] = ((dtype_%(n)s*)PyArray_DATA(%(n)s))[0];
        dims[1] = ((dtype_%(m)s*)PyArray_DATA(%(m)s))[0];
        k = ((dtype_%(k)s*)PyArray_DATA(%(k)s))[0];
        Py_CLEAR(%(z)s);

        %(z)s = pygpu_zeros(2, dims,
                            %(typecode)s,
                            GA_C_ORDER,
                            %(ctx)s, Py_None);
        if (%(z)s == NULL) {
            %(fail)s
        }

        ls = 1;
        gs = 256;
        err = tri_call(1, &gs, &ls, 0, %(z)s->ga.data, %(z)s->ga.offset,
                       dims[0], dims[1], k);
        if (err != GA_NO_ERROR) {
            PyErr_Format(PyExc_RuntimeError,
                         "gpuarray error: kTri: %%s. n=%%lu, m=%%lu.",
                         GpuKernel_error(&%(kname)s, err),
                         (unsigned long)dims[0], (unsigned long)dims[1]);
            %(fail)s;
        }
        """ % locals()
        return s

    def c_code_cache_version(self):
        # Bumped from (10,) because the generated C (error message, removed
        # dead branch) changed; stale cached modules must be rebuilt.
        return (11,)
......@@ -420,13 +420,8 @@ class GpuCublasTriangularSolve(Op):
trans_solve_op = GpuCublasTriangularSolve(not self.lower)
b_bar = trans_solve_op(A.T, c_bar)
# FIXME: tensor.outer does not appear to use GPU
def gpu_outer(x,y):
return tensor.dot(x.dimshuffle(0,'x'),y.dimshuffle('x',0))
A_bar = -gpu_outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)
A_bar = -tensor.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)
# FIXME: tensor.tril / tensor.triu has no GPU implementation
if self.lower:
A_bar = tensor.tril(A_bar)
else:
......@@ -584,9 +579,6 @@ class GpuCholesky(Op):
chol_x = chol_x.T
dz = dz.T
# FIXME: tensor.tril / tensor.triu / tensor.diagonal / tensor.diag
# has no GPU implementation
def tril_and_halve_diagonal(mtx):
    """Extracts lower triangle of square matrix and halves diagonal.

    Returns tril(mtx) with the main diagonal scaled by 1/2.
    """
    # NOTE(review): halving the diagonal appears to compensate for the
    # diagonal being counted once (not twice) when symmetrizing the
    # Cholesky gradient -- confirm against the surrounding L_op.
    return tensor.tril(mtx) - tensor.diag(tensor.diagonal(mtx) / 2.)
......
......@@ -52,7 +52,7 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
HostFromGpu, GpuFromHost,
GpuSplit, GpuContiguous, gpu_contiguous,
GpuAlloc, GpuAllocEmpty, GpuReshape,
GpuEye, gpu_join, GpuJoin)
GpuEye, GpuTri, gpu_join, GpuJoin)
from .blas import (gpu_dot22, GpuGemm, GpuGer, GpuGemmBatch,
gpugemm_no_inplace, gpugemm_inplace,
gpugemmbatch_no_inplace,
......@@ -389,7 +389,8 @@ class GraphToGPU(Optimizer):
if (not move_to_GPU and
isinstance(node.op, (theano.tensor.Alloc,
theano.tensor.AllocEmpty,
theano.tensor.basic.Eye))):
theano.tensor.basic.Eye,
theano.tensor.basic.Tri))):
# If the Alloc[Empty] have a client that will be moved
# to the GPU, we should move the Alloc* on the GPU.
......@@ -1411,6 +1412,11 @@ def local_gpua_dot22scalar(op, context_name, inputs, outputs):
def local_gpua_eye(op, context_name, inputs, outputs):
    # Lift a host-side Eye node to its GPU counterpart on the given context,
    # preserving the op's dtype.
    return GpuEye(dtype=op.dtype, context_name=context_name)
@register_opt('fast_compile')
@op_lifter([tensor.basic.Tri])
@register_opt2([tensor.basic.Tri], 'fast_compile')
def local_gpua_tri(op, context_name, inputs, outputs):
    # Lift a host-side Tri node to GpuTri on the given context, preserving
    # the op's dtype.
    return GpuTri(dtype=op.dtype, context_name=context_name)
@register_opt('fast_compile')
@op_lifter([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias])
......
......@@ -20,7 +20,8 @@ from ..type import (GpuArrayType, get_context,
from ..basic_ops import (
host_from_gpu, HostFromGpu, GpuFromHost, GpuReshape, GpuToGpu,
GpuAlloc, GpuAllocEmpty, GpuContiguous,
gpu_join, GpuJoin, GpuSplit, GpuEye, gpu_contiguous)
gpu_join, GpuJoin, GpuSplit, GpuEye, GpuTri,
gpu_contiguous)
from ..elemwise import GpuDimShuffle, GpuElemwise
from ..subtensor import GpuSubtensor
......@@ -443,7 +444,6 @@ def test_gpueye():
yield check, dtype, 5, 3, 6
yield check, dtype, 3, 5, -6
def test_hostfromgpu_shape_i():
# Test that the shape is lifted over hostfromgpu
......@@ -497,3 +497,111 @@ def test_Gpujoin_inplace():
if not isinstance(mode_with_gpu, theano.compile.DebugMode):
assert x.get_value(borrow=True, return_internal_type=True) is f(0)
assert np.allclose(f(0), [3, 4, 5])
def test_gpu_tril_triu():
    """T.tril/T.triu should lift to GpuTri on the GPU and agree with numpy."""
    def check_l(m, k=0):
        m_symb = T.matrix(dtype=m.dtype)
        k_symb = T.iscalar()
        f = theano.function([m_symb, k_symb],
                            T.tril(m_symb, k_symb),
                            mode=mode_with_gpu)
        result = f(m, k)
        assert np.allclose(result, np.tril(m, k))
        # Compare against the input's dtype.  The enclosing-loop variable
        # `dtype` is a late-binding closure: by the time the yielded test
        # actually runs, the loop may have advanced to another dtype.
        assert result.dtype == m.dtype
        assert any(isinstance(node.op, GpuTri)
                   for node in f.maker.fgraph.toposort())

    def check_u(m, k=0):
        m_symb = T.matrix(dtype=m.dtype)
        k_symb = T.iscalar()
        f = theano.function([m_symb, k_symb],
                            T.triu(m_symb, k_symb),
                            mode=mode_with_gpu)
        result = f(m, k)
        assert np.allclose(result, np.triu(m, k))
        # See the note in check_l about the late-binding `dtype` closure.
        assert result.dtype == m.dtype
        assert any(isinstance(node.op, GpuTri)
                   for node in f.maker.fgraph.toposort())

    utt.seed_rng()
    test_rng = np.random.RandomState(seed=utt.fetch_seed())

    for dtype in ['float64', 'float32', 'float16']:
        # try a big one
        m = np.asarray(test_rng.rand(5000, 5000) * 2 - 1, dtype=dtype)
        for k in (0, 1, -1):
            yield check_l, m, k
        for k in (0, 1, -1):
            yield check_u, m, k

        # a small square matrix
        m = np.asarray(test_rng.rand(10, 10) * 2 - 1, dtype=dtype)
        for k in (0, 1, -1):
            yield check_l, m, k
        for k in (0, 1, -1):
            yield check_u, m, k

        # a non-square matrix
        m = np.asarray(test_rng.rand(10, 5) * 2 - 1, dtype=dtype)
        for k in (0, 1, -1):
            yield check_l, m, k
        for k in (0, 1, -1):
            yield check_u, m, k
def test_gputri():
    """T.tri should lift to GpuTri on the GPU and agree with numpy.tri."""
    def check(dtype, N, M_=None, k=0):
        # Theano does not accept None as a tensor input, and DebugMode does
        # not support None inputs even where they are otherwise allowed, so
        # substitute a concrete value for M.
        M = N if M_ is None else M_
        N_symb = T.iscalar()
        M_symb = T.iscalar()
        k_symb = T.iscalar()
        # NOTE(review): the +1/-1 round-trip presumably forces extra
        # computation around the Tri node so the lifter is exercised in a
        # non-trivial graph -- confirm intent.
        out = T.tri(N_symb, M_symb, k_symb, dtype=dtype) + np.array(1).astype(dtype)
        f = theano.function([N_symb, M_symb, k_symb],
                            out,
                            mode=mode_with_gpu)
        result = np.asarray(f(N, M, k)) - np.array(1).astype(dtype)
        # (debug print of the full result removed: it dumped up to a
        # 1000x1000 array per test case)
        assert np.allclose(result, np.tri(N, M_, k, dtype=dtype))
        assert result.dtype == np.dtype(dtype)
        assert any(isinstance(node.op, GpuTri)
                   for node in f.maker.fgraph.toposort())

    for dtype in ['float64', 'float32', 'int32', 'float16']:
        # try a big one
        yield check, dtype, 1000, 1000, 0
        yield check, dtype, 1000, 1000, -400
        yield check, dtype, 1000, 1000, 400

        yield check, dtype, 5
        # M != N, k = 0
        yield check, dtype, 3, 5
        yield check, dtype, 5, 3
        # N == M, k != 0
        yield check, dtype, 3, 3, 1
        yield check, dtype, 3, 3, -1
        # N < M, k != 0
        yield check, dtype, 3, 5, 1
        yield check, dtype, 3, 5, -1
        # N > M, k != 0
        yield check, dtype, 5, 3, 1
        yield check, dtype, 5, 3, -1
        # -k > N, k > M, |k| beyond the matrix
        yield check, dtype, 5, 3, 3
        yield check, dtype, 3, 5, 3
        yield check, dtype, 5, 3, -3
        yield check, dtype, 3, 5, -3
        yield check, dtype, 5, 3, 6
        yield check, dtype, 3, 5, -6
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论