Commit 8289fc6b, authored by Thomas George

now using a cuda kernel through GpuKernel for triu/tril

Parent: 9d8c72f8
...@@ -5,8 +5,12 @@ import theano ...@@ -5,8 +5,12 @@ import theano
import warnings import warnings
from theano import Op from theano import Op
from theano.gpuarray import basic_ops, GpuArrayType from theano.gpuarray import basic_ops, GpuArrayType
from pygpu.gpuarray import GpuKernel, GpuArray
from string import Template
import numpy as np import numpy as np
from numpy.linalg.linalg import LinAlgError from numpy.linalg.linalg import LinAlgError
...@@ -19,7 +23,6 @@ cusolver_available = False ...@@ -19,7 +23,6 @@ cusolver_available = False
try: try:
import skcuda import skcuda
from skcuda import cusolver from skcuda import cusolver
from skcuda import linalg
cusolver_available = True cusolver_available = True
except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound): except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound):
pass pass
...@@ -53,13 +56,51 @@ if cusolver_available: ...@@ -53,13 +56,51 @@ if cusolver_available:
cusolver.cusolverCheckStatus(status) cusolver.cusolverCheckStatus(status)
def attach_handle_to_context(ctx): def attach_cusolver_handle_to_context(ctx):
handle = getattr(ctx, 'cusolver_handle', None) handle = getattr(ctx, 'cusolver_handle', None)
if handle is None: if handle is None:
with ctx: with ctx:
ctx.cusolver_handle = cusolver.cusolverDnCreate() ctx.cusolver_handle = cusolver.cusolverDnCreate()
def _zero_triangle(A, ctx, kernel_name, cmp_op):
    """Build and launch a GPU kernel that zeroes one strict triangle of *A*
    in place.

    Parameters
    ----------
    A : pygpu GpuArray
        float32 matrix. Assumed C-contiguous (row-major): the kernel maps the
        linear index to row ``ix = idx // ncols`` and column
        ``iy = idx % ncols``.  NOTE(review): the GpuCholesky caller appears to
        pick tril/triu based on contiguity flags — confirm every caller
        guarantees row-major layout.
    ctx : gpuarray context
        Context the kernel is compiled in (must be A's context).
    kernel_name : str
        Name of the generated kernel function ('tril' or 'triu').
    cmp_op : str
        C comparison operator substituted into the kernel: '<' zeroes
        elements above the diagonal, '>' zeroes elements below it.

    Returns
    -------
    The result of the GpuKernel call; A is modified in place.
    """
    tmpl = Template("""
            KERNEL void ${name}(GLOBAL_MEM ga_float *a, ga_uint N) {
                unsigned int idx = blockIdx.y*blockDim.x*gridDim.x+
                                   blockIdx.x*blockDim.x+threadIdx.x;
                unsigned int ix = idx/${cols};
                unsigned int iy = idx%${cols};
                if (idx < N) {
                    if (ix ${op} iy)
                        a[idx] = 0.0;
                }
            }
            """)
    # BUGFIX: the row/col decomposition needs the number of *columns*
    # (A.shape[1]); the original used A.shape[0], which is only correct for
    # square matrices.
    src = tmpl.substitute(name=kernel_name, cols=A.shape[1], op=cmp_op)
    spec = [GpuArray, 'uint32']
    k = GpuKernel(src, kernel_name, spec, context=ctx)
    n_elems = A.shape[0] * A.shape[1]
    # BUGFIX: launch one work item per element.  The original passed
    # n=A.shape[0], so only the first shape[0] threads ran and most of the
    # triangle was left untouched (the kernel itself guards idx < N with
    # N = total element count).
    return k(A, n_elems, n=n_elems)


def tril(A, ctx):
    """Zero the strictly upper triangle of A in place (keep the lower part)."""
    return _zero_triangle(A, ctx, "tril", "<")


def triu(A, ctx):
    """Zero the strictly lower triangle of A in place (keep the upper part)."""
    return _zero_triangle(A, ctx, "triu", ">")
class GpuCusolverSolve(Op): class GpuCusolverSolve(Op):
""" """
CUSOLVER GPU solver OP. CUSOLVER GPU solver OP.
...@@ -109,7 +150,7 @@ class GpuCusolverSolve(Op): ...@@ -109,7 +150,7 @@ class GpuCusolverSolve(Op):
def prepare_node(self, node, storage_map, compute_map, impl): def prepare_node(self, node, storage_map, compute_map, impl):
ctx = node.inputs[0].type.context ctx = node.inputs[0].type.context
attach_handle_to_context(ctx) attach_cusolver_handle_to_context(ctx)
def check_dev_info(self, dev_info): def check_dev_info(self, dev_info):
val = np.asarray(dev_info)[0] val = np.asarray(dev_info)[0]
...@@ -268,7 +309,7 @@ class GpuCholesky(Op): ...@@ -268,7 +309,7 @@ class GpuCholesky(Op):
def prepare_node(self, node, storage_map, compute_map, impl): def prepare_node(self, node, storage_map, compute_map, impl):
ctx = node.inputs[0].type.context ctx = node.inputs[0].type.context
attach_handle_to_context(ctx) attach_cusolver_handle_to_context(ctx)
def perform(self, node, inputs, outputs): def perform(self, node, inputs, outputs):
context = inputs[0][0].context context = inputs[0][0].context
...@@ -306,32 +347,29 @@ class GpuCholesky(Op): ...@@ -306,32 +347,29 @@ class GpuCholesky(Op):
workspace_size = cusolver.cusolverDnSpotrf_bufferSize( workspace_size = cusolver.cusolverDnSpotrf_bufferSize(
context.cusolver_handle, l_parameter, n, L_ptr, lda) context.cusolver_handle, l_parameter, n, L_ptr, lda)
workspace = pygpu.zeros(workspace_size, dtype='float32', workspace = pygpu.zeros(workspace_size, dtype='float32',
context=context) context=context)
dev_info = pygpu.zeros((1,), dtype='int32', context=context) dev_info = pygpu.zeros((1,), dtype='int32', context=context)
workspace_ptr = workspace.gpudata workspace_ptr = workspace.gpudata
dev_info_ptr = dev_info.gpudata dev_info_ptr = dev_info.gpudata
with context:
cusolver.cusolverDnSpotrf( cusolver.cusolverDnSpotrf(
context.cusolver_handle, l_parameter, n, L_ptr, lda, workspace_ptr, context.cusolver_handle, l_parameter, n, L_ptr, lda, workspace_ptr,
workspace_size, dev_info_ptr) workspace_size, dev_info_ptr)
val_dev_info = np.asarray(dev_info)[0]
if val_dev_info > 0:
raise LinAlgError('Cholesky decomposition failed (is A SPD?)')
# cusolver leaves the elements in the matrix outside the considered # cusolver leaves the elements in the matrix outside the considered
# upper or lower triangle unchanged, so we need to put zeros outside # upper or lower triangle unchanged, so we need to put zeros outside
# the triangle # the triangle
""" # Note : we should probably check for c or f order in triu instead of here
with context: if self.lower and L.flags['C_CONTIGUOUS'] or (not self.lower and L.flags['F_CONTIGUOUS']):
if self.lower: tril(L, context)
linalg.tril(L, overwrite=True, handle=context.cudnn_handle)
else:
linalg.triu(L, overwrite=True, handle=context.cudnn_handle)
"""
if self.lower:
L.write(numpy.tril(L))
else: else:
L.write(numpy.triu(L)) triu(L, context)
outputs[0][0] = L outputs[0][0] = L
...@@ -196,11 +196,7 @@ class TestGpuCholesky(unittest.TestCase): ...@@ -196,11 +196,7 @@ class TestGpuCholesky(unittest.TestCase):
A_val = M_val.dot(M_val.T) A_val = M_val.dot(M_val.T)
self.compare_gpu_cholesky_to_numpy(A_val, lower=False, inplace=True) self.compare_gpu_cholesky_to_numpy(A_val, lower=False, inplace=True)
class nothing:
def test_invalid_input_fail_non_symmetric(self): def test_invalid_input_fail_non_symmetric(self):
pass
""" Invalid Cholesky input test with non-symmetric input. """ Invalid Cholesky input test with non-symmetric input.
(Non-symmetric real input must also be non-positive definite). """ (Non-symmetric real input must also be non-positive definite). """
A_val = numpy.random.normal(size=(3, 3)).astype("float32") A_val = numpy.random.normal(size=(3, 3)).astype("float32")
...@@ -208,14 +204,13 @@ class nothing: ...@@ -208,14 +204,13 @@ class nothing:
# not being the case even with finite precision should be negligible # not being the case even with finite precision should be negligible
assert not numpy.allclose(A_val, A_val.T) assert not numpy.allclose(A_val, A_val.T)
fn = self.get_gpu_cholesky_func(True, False) fn = self.get_gpu_cholesky_func(True, False)
self.assertRaises(cula.cula.culaError, fn, A_val) self.assertRaises(LinAlgError, fn, A_val)
def test_invalid_input_fail_negative_definite(self): def test_invalid_input_fail_negative_definite(self):
pass
""" Invalid Cholesky input test with negative-definite input. """ """ Invalid Cholesky input test with negative-definite input. """
M_val = numpy.random.normal(size=(3, 3)).astype("float32") M_val = numpy.random.normal(size=(3, 3)).astype("float32")
# A = -M.dot(M) will be negative definite for all non-singular M # A = -M.dot(M) will be negative definite for all non-singular M
A_val = -M_val.dot(M_val.T) A_val = -M_val.dot(M_val.T)
fn = self.get_gpu_cholesky_func(True, False) fn = self.get_gpu_cholesky_func(True, False)
self.assertRaises(cula.cula.culaError, fn, A_val) self.assertRaises(LinAlgError, fn, A_val)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论