Add magma cholesky decomposition

118c35e8 · Alexander Matyasko · 550b2d41 · 118c35e8 · 118c35e8 · 118c35e8
--- a/theano/gpuarray/linalg.py
+++ b/theano/gpuarray/linalg.py
@@ -13,7 +13,8 @@ from theano.scalar import bool as bool_t
 from theano.gof import COp, ParamsType
 from theano.gpuarray import GpuArrayType
-from .basic_ops import as_gpuarray_variable, gpu_contiguous, infer_context_name
+from .basic_ops import (CGpuKernelBase, as_gpuarray_variable, gpu_contiguous,
+                        infer_context_name)
 from .type import gpu_context_type
 try:
@@ -518,3 +519,59 @@ def gpu_matrix_inverse(a):
    """
    return GpuMagmaMatrixInverse()(a)
+class GpuMagmaCholesky(CGpuKernelBase):
+    """Computes the cholesky decomposition of a matrix :math:`A` using magma
+    library.
+    """
+    params_type = gpu_context_type
+    __props__ = ('lower', 'inplace')
+    def __init__(self, lower=True, inplace=False):
+        self.lower = lower
+        COp.__init__(self, ['magma_cholesky.c'], 'APPLY_SPECIFIC(magma_cholesky)')
+        self.inplace = inplace
+        if self.inplace:
+            self.destroy_map = {0: [0]}
+    def c_headers(self):
+        return ['gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/ext_cuda.h',
+                'gpuarray_helper.h', 'magma.h']
+    def c_header_dirs(self):
+        dirs = [os.path.dirname(__file__), pygpu.get_include()]
+        if config.magma.include_path:
+            dirs.append(config.magma.include_path)
+        return dirs
+    def c_libraries(self):
+        return ['magma']
+    def c_lib_dirs(self):
+        if config.magma.library_path:
+            return [config.magma.library_path]
+        return []
+    def make_node(self, A):
+        ctx_name = infer_context_name(A)
+        A = as_gpuarray_variable(A, ctx_name)
+        A = gpu_contiguous(A)
+        if A.ndim != 2:
+            raise LinAlgError("Matrix rank error")
+        return theano.Apply(self, [A], [A.type()])
+    def get_params(self, node):
+        return node.inputs[0].type.context
+    def get_op_params(self):
+        params = []
+        if self.lower:
+            params.append(('LOWER', '1'))
+        if self.inplace:
+            params.append(('INPLACE', '1'))
+        return params
+    def infer_shape(self, node, shapes):
+        return [shapes[0]]
--- a/theano/gpuarray/magma_cholesky.c
+++ b/theano/gpuarray/magma_cholesky.c
+#section kernels
+#kernel tril_kernel : size, size, *:
+KERNEL void tril_kernel(const ga_size nthreads, const ga_size ncols,
+                        GLOBAL_MEM DTYPE_INPUT_0 *a) {
+  // grid stride looping
+  for (ga_size index = GID_0 * LDIM_0 + LID_0; index < nthreads;
+       index += LDIM_0 * GDIM_0) {
+    unsigned int ix = index / ncols;
+    unsigned int iy = index % ncols;
+    if (index < nthreads) {
+      if (ix < iy) {
+        a[index] = 0.0;
+      }
+    }
+  }
+}
+#kernel triu_kernel : size, size, *:
+KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols,
+                        GLOBAL_MEM DTYPE_INPUT_0 *a) {
+  // grid stride looping
+  for (ga_size index = GID_0 * LDIM_0 + LID_0; index < nthreads;
+       index += LDIM_0 * GDIM_0) {
+    unsigned int ix = index / ncols;
+    unsigned int iy = index % ncols;
+    if (index < nthreads) {
+      if (ix > iy) {
+        a[index] = 0.0;
+      }
+    }
+  }
+}
+#section init_code
+setup_ext_cuda();
+#section support_code_struct
+int APPLY_SPECIFIC(magma_cholesky)(PyGpuArrayObject *A, PyGpuArrayObject **L,
+                                   PyGpuContextObject *c) {
+  const size_t *dims;
+  size_t N, n2;
+  magma_uplo_t ul;
+  int res = -1, info;
+  if (A->ga.typecode != GA_FLOAT) {
+    PyErr_SetString(PyExc_TypeError,
+                    "GpuMagmaCholesky: unsupported data type");
+    return -1;
+  }
+  // This is early to match the exit() in the fail label.
+  cuda_enter(c->ctx);
+  magma_init();
+  if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
+    PyErr_SetString(PyExc_ValueError,
+                    "GpuMagmaCholesky: requires data to be C-contiguous");
+    goto fail;
+  }
+  if (PyGpuArray_NDIM(A) != 2) {
+    PyErr_SetString(PyExc_ValueError, "GpuMagmaCholesky: matrix rank error");
+    goto fail;
+  }
+  dims = PyGpuArray_DIMS(A);
+  if (dims[0] != dims[1]) {
+    PyErr_SetString(PyExc_ValueError, "GpuMagmaCholesky: matrix is not square");
+    goto fail;
+  }
+#ifdef INPLACE
+  Py_XDECREF(*L);
+  *L = A;
+  Py_INCREF(*L);
+#else
+  *L = theano_try_copy(*L, A);
+  if (*L == NULL) {
+    PyErr_SetString(
+        PyExc_RuntimeError,
+        "GpuMagmaCholesky: failed to allocate memory for the output");
+    goto fail;
+  }
+#endif
+  // magma matrix cholesky
+  N = dims[0];
+  n2 = N * N;
+// Magma requires column-major order for the matrix A. Instead of changing
+// matrix order which requires copying data, we can compute cholesky
+// decomposition where we change parameters lower to upper and upper to
+// lower.
+#ifdef LOWER
+  ul = MagmaUpper;
+#else
+  ul = MagmaLower;
+#endif
+  magma_spotrf_gpu(ul, N, (float *)PyGpuArray_DEV_DATA(*L), N, &info);
+  if (info > 0) {
+    PyErr_Format(PyExc_RuntimeError, "GpuMagmaCholesky: the leading minor of "
+                                     "order %d is not positive definite",
+                 info);
+    goto fail;
+  } else if (info < 0) {
+    PyErr_Format(
+        PyExc_RuntimeError,
+        "GpuMagmaCholesky: magma_spotrf_gpu argument %d has an illegal value",
+        -info);
+    goto fail;
+  }
+#ifdef LOWER
+  res = tril_kernel_scall(1, &n2, 0, n2, N, (*L)->ga.data);
+  if (res != GA_NO_ERROR) {
+    PyErr_Format(PyExc_RuntimeError, "GpuMagmaCholesky: triu_kernel %s.",
+                 GpuKernel_error(&k_triu_kernel, res));
+    goto fail;
+  }
+#else
+  res = triu_kernel_scall(1, &n2, 0, n2, N, (*L)->ga.data);
+  if (res != GA_NO_ERROR) {
+    PyErr_Format(PyExc_RuntimeError, "GpuMagmaCholesky: triu_kernel %s.",
+                 GpuKernel_error(&k_triu_kernel, res));
+    goto fail;
+  }
+#endif
+  res = 0;
+fail:
+  magma_finalize();
+  cuda_exit(c->ctx);
+  return res;
+}
--- a/theano/tensor/nlinalg.py
+++ b/theano/tensor/nlinalg.py
@@ -559,8 +559,7 @@ class QRIncomplete(Op):
        (x,) = inputs
        (q,) = outputs
        assert x.ndim == 2, "The input of qr function should be a matrix."
-        q[0] = self._numop(x,
+        q[0] = self._numop(x, self.mode)
-                           self.mode)
 def qr(a, mode="reduced"):