提交 e76d05d6 authored 作者: abergeron's avatar abergeron 提交者: GitHub

Merge pull request #5914 from aam-at/magma_ops

Magma gpu QR/Cholesky/Eigh
差异被折叠。
#section kernels

#kernel tril_kernel : size, size, size, *:
/* Zero the strictly upper-triangular part of a row-major N-column matrix
 * stored at byte offset `a_off` inside buffer `a`, keeping the lower
 * triangle (row >= col) intact.  One thread per element, grid-stride loop
 * so any launch configuration covers all `nthreads` elements.
 * FIX: row/column indices are ga_size (as in the sibling triu_kernel),
 * not `unsigned int`, which silently truncated for very large matrices. */
KERNEL void tril_kernel(const ga_size nthreads, const ga_size ncols,
                        const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
  a = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)a) + a_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0; index < nthreads;
       index += LDIM_0 * GDIM_0) {
    const ga_size ix = index / ncols;  // row
    const ga_size iy = index % ncols;  // column
    if (ix < iy) {
      a[index] = 0.0;
    }
  }
}
#kernel triu_kernel : size, size, size, *:
/* Zero the strictly lower-triangular part of a row-major N-column matrix
 * stored at byte offset `a_off` inside buffer `a`, keeping the upper
 * triangle (row <= col) intact.  Grid-stride loop over all elements.
 * FIX: row/column indices are ga_size (matching the other copy of this
 * kernel in this changeset), not `unsigned int`, which silently truncated
 * for very large matrices. */
KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols,
                        const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
  a = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)a) + a_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0; index < nthreads;
       index += LDIM_0 * GDIM_0) {
    const ga_size ix = index / ncols;  // row
    const ga_size iy = index % ncols;  // column
    if (ix > iy) {
      a[index] = 0.0;
    }
  }
}
#section init_code
setup_ext_cuda();
#section support_code_struct
/* Cholesky factorization of a square float32 matrix via MAGMA's
 * magma_spotrf_gpu.
 *
 * A:      input matrix (must be GA_FLOAT, 2-D, square, C-contiguous).
 * L:      output; either aliases A (params->inplace) or is a fresh copy.
 * params: provides the gpu context and the `lower`/`inplace` flags.
 *
 * Returns 0 on success, non-zero on failure (with a Python exception set).
 */
int APPLY_SPECIFIC(magma_cholesky)(PyGpuArrayObject *A, PyGpuArrayObject **L,
                                   PARAMS_TYPE *params) {
  const size_t *dims;
  size_t N, n2;
  magma_uplo_t ul;
  int res = -1, info;
  if (A->ga.typecode != GA_FLOAT) {
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaCholesky: unsupported data type");
    return -1;
  }
  // This is early to match the exit() in the fail label.
  cuda_enter(params->context->ctx);
  if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaCholesky: requires data to be C-contiguous");
    goto fail;
  }
  if (PyGpuArray_NDIM(A) != 2) {
    PyErr_SetString(PyExc_ValueError, "GpuMagmaCholesky: matrix rank error");
    goto fail;
  }
  dims = PyGpuArray_DIMS(A);
  if (dims[0] != dims[1]) {
    PyErr_SetString(PyExc_ValueError, "GpuMagmaCholesky: matrix is not square");
    goto fail;
  }
  if (params->inplace) {
    // Output aliases the input; take a new reference for the output slot.
    Py_XDECREF(*L);
    *L = A;
    Py_INCREF(*L);
  } else {
    // theano_try_copy reuses *L's storage when possible, else allocates.
    *L = theano_try_copy(*L, A);
    if (*L == NULL) {
      PyErr_SetString(
          PyExc_RuntimeError,
          "GpuMagmaCholesky: failed to allocate memory for the output");
      goto fail;
    }
  }
  // magma matrix cholesky
  N = dims[0];
  n2 = N * N;  // total element count, used by the triangular-zeroing kernel
  // Magma requires column-major order for the matrix A. Instead of changing
  // matrix order which requires copying data, we can compute cholesky
  // decomposition where we change parameters lower to upper and upper to
  // lower.
  if (params->lower) {
    ul = MagmaUpper;
  }
  else {
    ul = MagmaLower;
  }
  magma_spotrf_gpu(ul, N, (float *)PyGpuArray_DEV_DATA(*L), N, &info);
  if (info > 0) {
    // info > 0: the matrix is not positive definite at leading minor `info`.
    PyErr_Format(PyExc_RuntimeError, "GpuMagmaCholesky: the leading minor of "
                 "order %d is not positive definite",
                 info);
    goto fail;
  } else if (info < 0) {
    // info < 0: argument -info to magma_spotrf_gpu was invalid.
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaCholesky: magma_spotrf_gpu argument %d has an illegal value",
        -info);
    goto fail;
  }
  // MAGMA only writes the requested triangle; zero out the other half so
  // the returned array is a proper (lower or upper) triangular matrix.
  if (params->lower) {
    res = tril_kernel_scall(1, &n2, 0, n2, N, (*L)->ga.offset, (*L)->ga.data);
    if (res != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "GpuMagmaCholesky: tril_kernel %s.",
                   GpuKernel_error(&k_tril_kernel, res));
      goto fail;
    }
  } else {
    res = triu_kernel_scall(1, &n2, 0, n2, N, (*L)->ga.offset, (*L)->ga.data);
    if (res != GA_NO_ERROR) {
      PyErr_Format(PyExc_RuntimeError, "GpuMagmaCholesky: triu_kernel %s.",
                   GpuKernel_error(&k_triu_kernel, res));
      goto fail;
    }
  }
  res = 0;
fail:
  cuda_exit(params->context->ctx);
  return res;
}
#section init_code
setup_ext_cuda();
#section support_code_struct
/* Eigen-decomposition of a square symmetric float32 matrix via MAGMA's
 * magma_ssyevd_gpu.
 *
 * A_:     input matrix (GA_FLOAT, 2-D, square, C-contiguous).
 * D:      output eigenvalues (1-D of length N, ascending per ssyevd).
 * V:      output eigenvectors; only written when params->compute_v
 *         (may be NULL otherwise).
 * params: gpu context plus `lower` / `compute_v` flags.
 *
 * Returns 0 on success, non-zero on failure with a Python exception set.
 *
 * FIX: the error path after pygpu_copy() now uses `goto fail` instead of
 * `return -1`, so cuda_exit() is always paired with the cuda_enter() above.
 */
int APPLY_SPECIFIC(magma_eigh)(PyGpuArrayObject *A_,
                               PyGpuArrayObject **D,
                               PyGpuArrayObject **V, // may be NULL
                               PARAMS_TYPE *params) {
  PyGpuArrayObject *A = NULL;   // owned column-major working copy
  magma_int_t N, liwork, *iwork_data = NULL;
  size_t d_dims[1];
  magma_uplo_t uplo;
  magma_vec_t jobz;
  float *w_data = NULL, *wA_data = NULL, *work_data = NULL, lwork;
  int res = -1, info;
  if (A_->ga.typecode != GA_FLOAT) {
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaEigh: Unsupported data type");
    return -1;
  }
  // This is early to match the exit() in the fail label.
  cuda_enter(params->context->ctx);
  if (!GpuArray_IS_C_CONTIGUOUS(&A_->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaEigh: requires data to be C-contiguous");
    goto fail;
  }
  if (PyGpuArray_NDIM(A_) != 2) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaEigh: matrix rank error");
    goto fail;
  }
  if (PyGpuArray_DIM(A_, 0) != PyGpuArray_DIM(A_, 1)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaEigh: matrix is not square");
    goto fail;
  }
  // MAGMA expects column-major data; work on a Fortran-ordered copy.
  A = pygpu_copy(A_, GA_F_ORDER);
  if (A == NULL) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to change to column-major order");
    goto fail;  // was `return -1`: skipped cuda_exit()
  }
  // magma matrix eigen decomposition of a symmetric matrix
  N = PyGpuArray_DIM(A, 0);
  if (params->lower) {
    uplo = MagmaLower;
  } else {
    uplo = MagmaUpper;
  }
  if (params->compute_v) {
    jobz = MagmaVec;
  } else {
    jobz = MagmaNoVec;
  }
  // w_data: eigenvalues; wA_data: N*N workspace. Both pinned host memory.
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&w_data, N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&wA_data, N * N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }
  // Workspace-size query: lwork = -1 / liwork = -1 makes ssyevd return the
  // optimal sizes in `lwork` and `liwork` without computing anything.
  magma_ssyevd_gpu(jobz, uplo, N, NULL, N, NULL, NULL, N, &lwork,
                   -1, &liwork, -1, &info);
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&work_data, (size_t)lwork)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }
  if (MAGMA_SUCCESS != magma_imalloc_cpu(&iwork_data, liwork)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate working memory");
    goto fail;
  }
  magma_ssyevd_gpu(jobz, uplo, N, (float *)PyGpuArray_DEV_DATA(A), N, w_data,
                   wA_data, N, work_data, (size_t)lwork, iwork_data, liwork,
                   &info);
  if (info > 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaEigh: %d off-diagonal elements of an didn't converge to zero",
        info);
    goto fail;
  } else if (info < 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaEigh: magma_ssyevd_gpu argument %d has an illegal value", -info);
    goto fail;
  }
  d_dims[0] = N;
  if (theano_prep_output(D, 1, d_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaEigh: failed to allocate memory for the output");
    goto fail;
  }
  // w_data is pinned host memory; NOTE(review): the copy kind is
  // DeviceToDevice as in the original — this relies on unified addressing
  // treating pinned memory as device-accessible. Confirm on target setups.
  cudaMemcpy(PyGpuArray_DEV_DATA(*D), w_data, N * sizeof(float),
             cudaMemcpyDeviceToDevice);
  if (params->compute_v) {
    // Eigenvectors are the (column-major) content of A after ssyevd.
    *V = theano_try_copy(*V, A);
    if (*V == NULL) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaEigh: failed to allocate memory for the output");
      goto fail;
    }
  }
  res = 0;
fail:
  if (w_data != NULL)
    magma_free_pinned(w_data);
  if (wA_data != NULL)
    magma_free_pinned(wA_data);
  if (work_data != NULL)
    magma_free_pinned(work_data);
  if (iwork_data != NULL)
    magma_free_cpu(iwork_data);
  Py_XDECREF(A);
  cuda_exit(params->context->ctx);
  return res;
}
...@@ -5,7 +5,7 @@ setup_ext_cuda(); ...@@ -5,7 +5,7 @@ setup_ext_cuda();
#section support_code_struct #section support_code_struct
int APPLY_SPECIFIC(magma_inv)(PyGpuArrayObject *A, PyGpuArrayObject **A_inv, int APPLY_SPECIFIC(magma_inv)(PyGpuArrayObject *A, PyGpuArrayObject **A_inv,
PARAMS_TYPE* params) { PARAMS_TYPE *params) {
const size_t *dims; const size_t *dims;
magma_int_t N, ldwork, info; magma_int_t N, ldwork, info;
magma_int_t *piv = NULL; magma_int_t *piv = NULL;
...@@ -20,12 +20,11 @@ int APPLY_SPECIFIC(magma_inv)(PyGpuArrayObject *A, PyGpuArrayObject **A_inv, ...@@ -20,12 +20,11 @@ int APPLY_SPECIFIC(magma_inv)(PyGpuArrayObject *A, PyGpuArrayObject **A_inv,
// This is early to match the exit() in the fail label. // This is early to match the exit() in the fail label.
cuda_enter(params->context->ctx); cuda_enter(params->context->ctx);
magma_init();
if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) { if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"GpuMagmaMatrixInverse: requires data to be C-contiguous"); "GpuMagmaMatrixInverse: requires data to be C-contiguous");
return 1; goto fail;
} }
if (PyGpuArray_NDIM(A) != 2) { if (PyGpuArray_NDIM(A) != 2) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
...@@ -93,7 +92,6 @@ fail: ...@@ -93,7 +92,6 @@ fail:
magma_free(piv); magma_free(piv);
if (dwork != NULL) if (dwork != NULL)
gpudata_release(dwork); gpudata_release(dwork);
magma_finalize();
cuda_exit(params->context->ctx); cuda_exit(params->context->ctx);
return res; return res;
} }
#section kernels

#kernel triu_kernel : size, size, size, *:
/* Zero the strictly lower-triangular part (row > col) of a row-major
 * N-column matrix stored at byte offset `a_off` inside buffer `a`.
 * One thread per element; the grid-stride loop makes the kernel correct
 * for any launch configuration covering fewer than `nthreads` threads. */
KERNEL void triu_kernel(const ga_size nthreads, const ga_size ncols,
                        const ga_size a_off, GLOBAL_MEM DTYPE_INPUT_0 *a) {
  // Rebase the pointer by the array's byte offset within its buffer.
  a = (GLOBAL_MEM DTYPE_INPUT_0 *)(((char *)a) + a_off);
  // grid stride looping
  for (ga_size index = GID_0 * LDIM_0 + LID_0; index < nthreads;
       index += LDIM_0 * GDIM_0) {
    const ga_size ix = index / ncols;  // row
    const ga_size iy = index % ncols;  // column
    if (ix > iy) {
      a[index] = 0.0;
    }
  }
}
#section init_code
setup_ext_cuda();
#section support_code
/* Return a C-ordered copy of `src` with dimension `dim` narrowed to `size`
 * (size must be <= the current extent; only the leading `size` entries
 * along `dim` are kept).  Returns a new reference, or NULL with a Python
 * exception set on failure.
 *
 * FIX: the temporary view returned by pygpu_view() was never released
 * (reference leak) and was dereferenced without a NULL check. */
static PyGpuArrayObject *pygpu_narrow(PyGpuArrayObject *src, size_t dim,
                                      size_t size) {
  PyGpuArrayObject *res;
  PyGpuArrayObject *src_view = pygpu_view(src, Py_None);
  if (src_view == NULL)
    return NULL;
  // Shrinking a dimension of a view only changes metadata; fix the
  // contiguity flags afterwards so the copy below sees a valid array.
  src_view->ga.dimensions[dim] = size;
  GpuArray_fix_flags(&src_view->ga);
  res = pygpu_copy(src_view, GA_C_ORDER);
  Py_DECREF(src_view);
  return res;
}
#section support_code_struct
/* QR decomposition of an M x N float32 matrix via MAGMA.
 *
 * A_:     input matrix (GA_FLOAT, 2-D, C-contiguous).
 * R:      output upper-triangular factor, K x N with K = min(M, N).
 * Q:      output orthonormal factor, M x K; only written when
 *         params->complete (may be NULL otherwise).
 *
 * Returns 0 on success, non-zero on failure with a Python exception set.
 *
 * FIXES vs. the previous version:
 *  - A is NULL until pygpu_copy() succeeds.  Previously `A = A_` meant an
 *    early `goto fail` called Py_XDECREF on a *borrowed* reference to the
 *    caller's input, corrupting its refcount.
 *  - The pygpu_copy() failure inside the `complete` branch now does
 *    `goto fail` instead of `return -1`, which leaked tau_data/work_data
 *    and skipped cuda_exit().
 */
int APPLY_SPECIFIC(magma_qr)(PyGpuArrayObject *A_,
                             PyGpuArrayObject **R,
                             PyGpuArrayObject **Q, // may be NULL
                             PARAMS_TYPE* params) {
  PyGpuArrayObject *A = NULL;  // owned column-major working copy only
  magma_int_t M, N, K, nb, ldwork;
  size_t n2;
  float *tau_data = NULL;
  gpudata *work_data = NULL;
  int res = -1, info;
  if (A_->ga.typecode != GA_FLOAT) {
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaQR: Unsupported data type");
    return -1;
  }
  // This is early to match the exit() in the fail label.
  cuda_enter(params->context->ctx);
  if (!GpuArray_IS_C_CONTIGUOUS(&A_->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaQR: requires data to be C-contiguous");
    goto fail;
  }
  if (PyGpuArray_NDIM(A_) != 2) {
    PyErr_SetString(PyExc_ValueError, "GpuMagmaQR: matrix rank error");
    goto fail;
  }
  // MAGMA expects column-major data; work on a Fortran-ordered copy.
  A = pygpu_copy(A_, GA_F_ORDER);
  if (A == NULL) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaQR: failed to change to column-major order");
    goto fail;
  }
  // magma matrix qr
  M = PyGpuArray_DIM(A, 0);
  N = PyGpuArray_DIM(A, 1);
  K = M < N ? M : N;
  // tau holds the K reflector scalars; N*N matches the original
  // allocation (over-sized but harmless).
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&tau_data, N * N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaQR: failed to allocate working memory");
    goto fail;
  }
  nb = magma_get_sgeqrf_nb(M, N);
  ldwork = (2 * K + magma_roundup(N, 32)) * nb;
  work_data = gpudata_alloc(params->context->ctx, ldwork * sizeof(float), NULL, 0, NULL);
  if (work_data == NULL) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaQR: failed to allocate working memory");
    goto fail;
  }
  // compute R
  magma_sgeqrf2_gpu(M, N, (float *)PyGpuArray_DEV_DATA(A), M, tau_data, &info);
  if (info != 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaQR: magma_sgeqrf2_gpu argument %d has an illegal value", -info);
    goto fail;
  }
  // Keep only the first K rows: R is K x N.
  *R = pygpu_narrow(A, 0, K);
  if (*R == NULL) {
    PyErr_SetString(PyExc_RuntimeError, "GpuMagmaQR: failed to narrow array");
    goto fail;
  }
  // Zero the sub-diagonal entries that geqrf left as reflector data.
  n2 = K * N;
  res = triu_kernel_scall(1, &n2, 0, n2, N, (*R)->ga.offset, (*R)->ga.data);
  if (res != GA_NO_ERROR) {
    PyErr_Format(PyExc_RuntimeError, "GpuMagmaQR: triu_kernel %s.",
                 GpuKernel_error(&k_triu_kernel, res));
    goto fail;
  }
  if (params->complete) {
    // compute Q: redo the factorization on a fresh copy, then build Q
    // explicitly with sorgqr.
    Py_XDECREF(A);
    A = pygpu_copy(A_, GA_F_ORDER);
    if (A == NULL) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaQR: failed to change to column-major order");
      goto fail;  // was `return -1`: leaked workspace, skipped cuda_exit()
    }
    // NOTE(review): `*(float **)work_data` assumes the device pointer is
    // the first member of gpudata, as in the original code — confirm
    // against the libgpuarray backend in use.
    magma_sgeqrf_gpu(M, N, (float *)PyGpuArray_DEV_DATA(A), M, tau_data,
                     *(float **)work_data, &info);
    if (info != 0) {
      PyErr_Format(
          PyExc_RuntimeError,
          "GpuMagmaQR: magma_sgeqrf_gpu argument %d has an illegal value", -info);
      goto fail;
    }
    magma_sorgqr_gpu(M, K, K, (float *)PyGpuArray_DEV_DATA(A), M, tau_data,
                     *(float **)work_data, nb, &info);
    if (info != 0) {
      PyErr_Format(
          PyExc_RuntimeError,
          "GpuMagmaQR: magma_sorgqr_gpu argument %d has an illegal value", -info);
      goto fail;
    }
    // Keep only the first K columns: Q is M x K.
    *Q = pygpu_narrow(A, 1, K);
    if (*Q == NULL) {
      PyErr_SetString(PyExc_RuntimeError, "GpuMagmaQR: failed to narrow array");
      goto fail;
    }
  }
  res = 0;
fail:
  if (tau_data != NULL)
    magma_free_pinned(tau_data);
  if (work_data != NULL)
    gpudata_release(work_data);
  Py_XDECREF(A);
  cuda_exit(params->context->ctx);
  return res;
}
...@@ -8,7 +8,7 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A, ...@@ -8,7 +8,7 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
PyGpuArrayObject **S, PyGpuArrayObject **S,
PyGpuArrayObject **U, // may be NULL PyGpuArrayObject **U, // may be NULL
PyGpuArrayObject **VT, // may be NULL PyGpuArrayObject **VT, // may be NULL
PARAMS_TYPE* params) { PARAMS_TYPE *params) {
bool compute_uv = (U != NULL); bool compute_uv = (U != NULL);
magma_int_t *iwork = NULL, iunused[1]; magma_int_t *iwork = NULL, iunused[1];
magma_int_t M, N, K, ldu, ldv, M_U, N_VT, info; magma_int_t M, N, K, ldu, ldv, M_U, N_VT, info;
...@@ -21,22 +21,21 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A, ...@@ -21,22 +21,21 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
if (A->ga.typecode != GA_FLOAT) { if (A->ga.typecode != GA_FLOAT) {
PyErr_SetString(PyExc_TypeError, PyErr_SetString(PyExc_TypeError,
"GpuMagmaMatrixInverse: Unsupported data type"); "GpuMagmaSVD: Unsupported data type");
return -1; return -1;
} }
// This is early to match the exit() in the fail label. // This is early to match the exit() in the fail label.
cuda_enter(params->context->ctx); cuda_enter(params->context->ctx);
magma_init();
if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) { if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"GpuMagmaMatrixInverse: requires data to be C-contiguous"); "GpuMagmaSVD: requires data to be C-contiguous");
return 1; goto fail;
} }
if (PyGpuArray_NDIM(A) != 2) { if (PyGpuArray_NDIM(A) != 2) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"GpuMagmaMatrixInverse: matrix rank error"); "GpuMagmaSVD: matrix rank error");
goto fail; goto fail;
} }
...@@ -44,7 +43,7 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A, ...@@ -44,7 +43,7 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
// reverse dimensions because MAGMA expects column-major matrices: // reverse dimensions because MAGMA expects column-major matrices:
M = PyGpuArray_DIM(A, 1); M = PyGpuArray_DIM(A, 1);
N = PyGpuArray_DIM(A, 0); N = PyGpuArray_DIM(A, 0);
K = std::min(M, N); K = M < N ? M : N;
if (MAGMA_SUCCESS != magma_smalloc_pinned(&a_data, M * N)) { if (MAGMA_SUCCESS != magma_smalloc_pinned(&a_data, M * N)) {
PyErr_SetString(PyExc_RuntimeError, PyErr_SetString(PyExc_RuntimeError,
...@@ -166,7 +165,6 @@ fail: ...@@ -166,7 +165,6 @@ fail:
magma_free_pinned(work); magma_free_pinned(work);
if (iwork != NULL) if (iwork != NULL)
magma_free_cpu(iwork); magma_free_cpu(iwork);
magma_finalize();
cuda_exit(params->context->ctx); cuda_exit(params->context->ctx);
return res; return res;
} }
...@@ -74,7 +74,8 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor, ...@@ -74,7 +74,8 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
from .reduction import GpuMaxAndArgmax from .reduction import GpuMaxAndArgmax
from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky, from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky,
cusolver_available, GpuMagmaMatrixInverse, gpu_svd) cusolver_available, GpuMagmaMatrixInverse, gpu_svd,
GpuMagmaCholesky, gpu_qr, GpuMagmaEigh)
_logger = logging.getLogger("theano.gpuarray.opt") _logger = logging.getLogger("theano.gpuarray.opt")
...@@ -2112,9 +2113,6 @@ def local_inplace_gpu_solve(node): ...@@ -2112,9 +2113,6 @@ def local_inplace_gpu_solve(node):
# Cholesky decomposition # Cholesky decomposition
@register_opt('fast_compile')
@op_lifter([slinalg.Cholesky])
@register_opt2([theano.tensor.slinalg.Cholesky], 'fast_compile')
def local_gpu_cholesky(op, context_name, inputs, outputs): def local_gpu_cholesky(op, context_name, inputs, outputs):
if not cusolver_available: if not cusolver_available:
return return
...@@ -2125,19 +2123,99 @@ def local_gpu_cholesky(op, context_name, inputs, outputs): ...@@ -2125,19 +2123,99 @@ def local_gpu_cholesky(op, context_name, inputs, outputs):
return op(inputs[0].astype('float32')).astype('float16') return op(inputs[0].astype('float32')).astype('float16')
return op return op
# Ordered group databases so several lifters for the same CPU op (e.g.
# cusolver vs. magma Cholesky) can be registered with explicit priority.
matrix_ops_db = LocalGroupDB()
matrix_ops_db2 = LocalGroupDB(local_opt=theano.gof.opt.GraphToGPULocalOptGroup)
matrix_ops_db2.__name__ = "matrix_ops_db2"
# For Cholesky decomposition, magma 2.2 is slower than cusolver 8 (tested for
# matrices of size 1000). Thus, cusolver is prioritized during graph
# optimizations. To explicitly use magma, you should disable cusolver using
# `optimizer_excluding=cusolver` in Theano config.
lifter = op_lifter([slinalg.Cholesky])(local_gpu_cholesky)
matrix_ops_db.register("local_gpu_cholesky", lifter,
                       'gpuarray', 'fast_compile', 'fast_run', 'cusolver',
                       position=0)
matrix_ops_db2.register("local_gpu_cholesky",
                        local_optimizer([slinalg.Cholesky])(local_gpu_cholesky),
                        'gpuarray', 'fast_compile', 'fast_run', 'cusolver',
                        position=0)
register_opt('fast_compile', name='matrix_ops_db')(matrix_ops_db)
# NOTE(review): the tracking list below is [slinalg.Solve] although the db
# holds Cholesky lifters — confirm this is intentional and not a typo for
# [slinalg.Cholesky].
register_opt2([slinalg.Solve], 'fast_compile', name='matrix_ops_db2')(matrix_ops_db2)
@register_inplace() @register_inplace()
@local_optimizer([GpuCholesky], inplace=True) @local_optimizer([GpuCholesky], inplace=True)
def local_inplace_cholesky(node): def local_inplace_gpu_cholesky(node):
if isinstance(node.op, GpuCholesky) and not node.op.inplace: if isinstance(node.op, GpuCholesky) and not node.op.inplace:
return [GpuCholesky(lower=node.op.lower, inplace=True)(*node.inputs)] return [node.op.clone_inplace()(*node.inputs)]
def local_gpu_magma_cholesky(op, context_name, inputs, outputs):
    # Lift a CPU Cholesky op to GpuMagmaCholesky when magma is enabled.
    # float16 inputs are computed in float32 and cast back.
    # (Indentation restored; it was lost in the pasted source.)
    if not config.magma.enabled:
        return
    if inputs[0].dtype not in ['float16', 'float32']:
        return
    op = GpuMagmaCholesky(lower=op.lower, inplace=op.destructive)
    if inputs[0].dtype == 'float16':
        return op(inputs[0].astype('float32')).astype('float16')
    return op
# Register the magma Cholesky lifter after the cusolver one (position=1),
# so cusolver wins unless it is excluded (see comment on matrix_ops_db).
lifter = op_lifter([slinalg.Cholesky])(local_gpu_magma_cholesky)
matrix_ops_db.register("local_gpu_magma_cholesky", lifter,
                       'gpuarray', 'fast_compile', 'fast_run', 'magma',
                       position=1)
matrix_ops_db2.register("local_gpu_magma_cholesky",
                        local_optimizer([slinalg.Cholesky])(local_gpu_magma_cholesky),
                        'gpuarray', 'fast_compile', 'fast_run', 'magma',
                        position=1)
@register_inplace()
@local_optimizer([GpuMagmaCholesky], inplace=True)
def local_inplace_gpu_magma_cholesky(node):
    # Replace a non-destructive GpuMagmaCholesky with its in-place variant.
    # (Indentation restored; it was lost in the pasted source.)
    if isinstance(node.op, GpuMagmaCholesky) and not node.op.inplace:
        return [node.op.clone_inplace()(*node.inputs)]
# QR decomposition
@register_opt('magma', 'fast_compile')
@op_lifter([nlinalg.QRFull])
@register_opt2([theano.tensor.nlinalg.QRFull], 'magma', 'fast_compile')
def local_gpu_magma_qr(op, context_name, inputs, outputs):
    # Lift nlinalg.QRFull (reduced mode only) to the magma GPU QR.
    # float16 inputs are computed in float32 and cast back.
    # (Indentation restored; it was lost in the pasted source.)
    if not config.magma.enabled or op.mode != 'reduced':
        return
    if inputs[0].dtype not in ['float16', 'float32']:
        return
    x = inputs[0]
    if inputs[0].dtype == 'float16':
        x = inputs[0].astype('float32')
    out = gpu_qr(x, complete=True)
    if inputs[0].dtype == 'float16':
        return [o.astype('float16') for o in out]
    return out
@register_opt('magma', 'fast_compile')
@op_lifter([nlinalg.QRIncomplete])
@register_opt2([theano.tensor.nlinalg.QRIncomplete], 'magma', 'fast_compile')
def local_gpu_magma_qr_incomplete(op, context_name, inputs, outputs):
    # Lift nlinalg.QRIncomplete (R only) to the magma GPU QR.
    # float16 inputs are computed in float32 and cast back.
    # (Indentation restored; it was lost in the pasted source.)
    if not config.magma.enabled:
        return
    if inputs[0].dtype not in ['float16', 'float32']:
        return
    x = inputs[0]
    if inputs[0].dtype == 'float16':
        x = inputs[0].astype('float32')
    out = gpu_qr(x, complete=False)
    if inputs[0].dtype == 'float16':
        return [out.astype('float16')]
    return out
# Matrix inverse
@register_opt('magma', 'fast_compile') @register_opt('magma', 'fast_compile')
@op_lifter([nlinalg.MatrixInverse]) @op_lifter([nlinalg.MatrixInverse])
@register_opt2([theano.tensor.nlinalg.MatrixInverse], 'magma', 'fast_compile') @register_opt2([theano.tensor.nlinalg.MatrixInverse], 'magma', 'fast_compile')
def local_gpu_matrix_inverse(op, context_name, inputs, outputs): def local_gpu_magma_matrix_inverse(op, context_name, inputs, outputs):
if not config.magma.enabled: if not config.magma.enabled:
return return
if inputs[0].dtype not in ['float16', 'float32']: if inputs[0].dtype not in ['float16', 'float32']:
...@@ -2150,16 +2228,31 @@ def local_gpu_matrix_inverse(op, context_name, inputs, outputs): ...@@ -2150,16 +2228,31 @@ def local_gpu_matrix_inverse(op, context_name, inputs, outputs):
@register_inplace() @register_inplace()
@local_optimizer([GpuMagmaMatrixInverse]) @local_optimizer([GpuMagmaMatrixInverse])
def local_inplace_matrix_inverse_inplace(node): def local_inplace_gpu_magma_matrix_inverse(node):
if isinstance(node.op, GpuMagmaMatrixInverse): if isinstance(node.op, GpuMagmaMatrixInverse) and not node.op.inplace:
if not node.op.inplace: return [node.op.clone_inplace()(*node.inputs)]
return [node.op.clone_inplace()(*node.inputs)]
# Eigen decomposition of a symmetric matrix
@register_opt('magma', 'fast_compile')
@op_lifter([nlinalg.Eigh])
@register_opt2([theano.tensor.nlinalg.Eigh], 'magma', 'fast_compile')
def local_gpu_magma_eigh(op, context_name, inputs, outputs):
    # Lift nlinalg.Eigh to GpuMagmaEigh (always computing eigenvectors,
    # to match the CPU op's two outputs).  float16 is computed in float32.
    # (Indentation restored; it was lost in the pasted source.)
    if not config.magma.enabled:
        return
    if inputs[0].dtype not in ['float16', 'float32']:
        return
    op = GpuMagmaEigh(UPLO=op.UPLO, compute_v=True)
    if inputs[0].dtype == 'float16':
        return op(inputs[0].astype('float32')).astype('float16')
    return op
# Singular Value Decomposition
@register_opt('magma', 'fast_compile') @register_opt('magma', 'fast_compile')
@op_lifter([nlinalg.SVD]) @op_lifter([nlinalg.SVD])
@register_opt2([theano.tensor.nlinalg.SVD], 'magma', 'fast_compile') @register_opt2([theano.tensor.nlinalg.SVD], 'magma', 'fast_compile')
def local_gpu_svd(op, context_name, inputs, outputs): def local_gpu_magma_svd(op, context_name, inputs, outputs):
if not config.magma.enabled: if not config.magma.enabled:
return return
if inputs[0].dtype not in ['float16', 'float32']: if inputs[0].dtype not in ['float16', 'float32']:
......
...@@ -7,10 +7,14 @@ from numpy.linalg.linalg import LinAlgError ...@@ -7,10 +7,14 @@ from numpy.linalg.linalg import LinAlgError
import theano import theano
from theano import config from theano import config
from theano.gpuarray.linalg import (GpuCholesky, GpuMagmaMatrixInverse, from theano.gpuarray.linalg import (GpuCholesky, GpuMagmaCholesky,
GpuMagmaEigh, GpuMagmaMatrixInverse,
GpuMagmaQR, GpuMagmaSVD,
cusolver_available, gpu_matrix_inverse, cusolver_available, gpu_matrix_inverse,
gpu_solve, gpu_svd) gpu_solve, gpu_svd, gpu_qr)
from theano.tensor.nlinalg import matrix_inverse from theano.tensor.nlinalg import (SVD, MatrixInverse, QRFull,
QRIncomplete, eigh, matrix_inverse, qr)
from theano.tensor.slinalg import Cholesky, cholesky
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from .. import gpuarray_shared_constructor from .. import gpuarray_shared_constructor
...@@ -132,7 +136,8 @@ class TestGpuCholesky(unittest.TestCase): ...@@ -132,7 +136,8 @@ class TestGpuCholesky(unittest.TestCase):
A = theano.tensor.matrix("A", dtype="float32") A = theano.tensor.matrix("A", dtype="float32")
cholesky_op = GpuCholesky(lower=lower, inplace=inplace) cholesky_op = GpuCholesky(lower=lower, inplace=inplace)
chol_A = cholesky_op(A) chol_A = cholesky_op(A)
return theano.function([A], chol_A, accept_inplace=inplace, mode=mode_with_gpu) return theano.function([A], chol_A, accept_inplace=inplace,
mode=mode_with_gpu)
def compare_gpu_cholesky_to_np(self, A_val, lower=True, inplace=False): def compare_gpu_cholesky_to_np(self, A_val, lower=True, inplace=False):
# Helper function to compare op output to np.cholesky output. # Helper function to compare op output to np.cholesky output.
...@@ -144,6 +149,12 @@ class TestGpuCholesky(unittest.TestCase): ...@@ -144,6 +149,12 @@ class TestGpuCholesky(unittest.TestCase):
chol_A_res = np.array(res) chol_A_res = np.array(res)
utt.assert_allclose(chol_A_res, chol_A_val) utt.assert_allclose(chol_A_res, chol_A_val)
def test_gpu_cholesky_opt(self):
    # The graph optimizer should lift slinalg.cholesky to GpuCholesky.
    # (Indentation restored; it was lost in the pasted source.)
    A = theano.tensor.matrix("A", dtype="float32")
    fn = theano.function([A], cholesky(A), mode=mode_with_gpu)
    assert any([isinstance(node.op, GpuCholesky)
                for node in fn.maker.fgraph.toposort()])
def test_invalid_input_fail_non_square(self): def test_invalid_input_fail_non_square(self):
# Invalid Cholesky input test with non-square matrix as input. # Invalid Cholesky input test with non-square matrix as input.
A_val = np.random.normal(size=(3, 2)).astype("float32") A_val = np.random.normal(size=(3, 2)).astype("float32")
...@@ -207,6 +218,20 @@ class TestMagma(unittest.TestCase): ...@@ -207,6 +218,20 @@ class TestMagma(unittest.TestCase):
if not config.magma.enabled: if not config.magma.enabled:
self.skipTest('Magma is not enabled, skipping test') self.skipTest('Magma is not enabled, skipping test')
def test_magma_opt_float16(self):
    # Every magma-backed op should accept float16 input (computed in
    # float32 internally) and still be lifted to its GPU counterpart.
    # (Indentation restored; it was lost in the pasted source.)
    ops_to_gpu = [(MatrixInverse(), GpuMagmaMatrixInverse),
                  (SVD(), GpuMagmaSVD),
                  (QRFull(mode='reduced'), GpuMagmaQR),
                  (QRIncomplete(mode='r'), GpuMagmaQR),
                  # TODO: add support for float16 to Eigh numpy
                  # (Eigh(), GpuMagmaEigh),
                  (Cholesky(), GpuMagmaCholesky)]
    for op, gpu_op in ops_to_gpu:
        A = theano.tensor.matrix("A", dtype="float16")
        fn = theano.function([A], op(A), mode=mode_with_gpu.excluding('cusolver'))
        assert any([isinstance(node.op, gpu_op)
                    for node in fn.maker.fgraph.toposort()])
def test_gpu_matrix_inverse(self): def test_gpu_matrix_inverse(self):
A = theano.tensor.fmatrix("A") A = theano.tensor.fmatrix("A")
...@@ -216,7 +241,7 @@ class TestMagma(unittest.TestCase): ...@@ -216,7 +241,7 @@ class TestMagma(unittest.TestCase):
# Copied from theano.tensor.tests.test_basic.rand. # Copied from theano.tensor.tests.test_basic.rand.
A_val = test_rng.rand(N, N).astype('float32') * 2 - 1 A_val = test_rng.rand(N, N).astype('float32') * 2 - 1
A_val_inv = fn(A_val) A_val_inv = fn(A_val)
utt.assert_allclose(np.eye(N), np.dot(A_val_inv, A_val), atol=5e-3) utt.assert_allclose(np.eye(N), np.dot(A_val_inv, A_val), atol=1e-2)
def test_gpu_matrix_inverse_inplace(self): def test_gpu_matrix_inverse_inplace(self):
N = 1000 N = 1000
...@@ -296,3 +321,132 @@ class TestMagma(unittest.TestCase): ...@@ -296,3 +321,132 @@ class TestMagma(unittest.TestCase):
A_val = rand(100, 50).astype('float32') A_val = rand(100, 50).astype('float32')
utt.assert_allclose(f_cpu(A_val), f_gpu(A_val)) utt.assert_allclose(f_cpu(A_val), f_gpu(A_val))
def run_gpu_cholesky(self, A_val, lower=True):
    # Build and run a GpuMagmaCholesky graph on A_val; cusolver is
    # excluded so the magma op is the one exercised.
    # (Indentation restored; it was lost in the pasted source.)
    A = theano.tensor.fmatrix("A")
    f = theano.function([A], GpuMagmaCholesky(lower=lower)(A),
                        mode=mode_with_gpu.excluding('cusolver'))
    return f(A_val)
def rand_symmetric(self, N):
    # Build an N x N symmetric positive-definite float32 matrix.
    # (Indentation restored; it was lost in the pasted source.)
    A = rand(N, N).astype('float32')
    # ensure that eigenvalues are not too small which sometimes results in
    # magma cholesky failure due to gpu limited numerical precision
    D, W = np.linalg.eigh(A)
    D[D < 1] = 1
    V_m = np.zeros_like(A)
    np.fill_diagonal(V_m, D)
    # W is orthogonal, so W.T @ diag(D) @ W is symmetric with the clipped
    # eigenvalues D (all >= 1), hence positive definite.
    return np.dot(np.dot(W.T, V_m), W)
def check_cholesky(self, N, lower=True, rtol=None, atol=None):
    # Verify L @ L.T reconstructs A for the GPU Cholesky factor.
    # (Indentation restored; it was lost in the pasted source.)
    A = self.rand_symmetric(N)
    L = self.run_gpu_cholesky(A, lower=lower)
    if not lower:
        # Upper factor U satisfies U.T @ U = A, i.e. (U.T)(U.T).T = A.
        L = L.T
    utt.assert_allclose(np.dot(L, L.T), A, rtol=rtol, atol=atol)
def test_gpu_cholesky(self):
    # Both triangle conventions on a 1000x1000 SPD matrix.
    # (Indentation restored; it was lost in the pasted source.)
    self.check_cholesky(1000, atol=1e-3)
    self.check_cholesky(1000, lower=False, atol=1e-3)
def test_gpu_cholesky_opt(self):
    # With cusolver excluded, cholesky should be lifted to the magma op.
    # (Indentation restored; it was lost in the pasted source.)
    A = theano.tensor.matrix("A", dtype="float32")
    fn = theano.function([A], cholesky(A), mode=mode_with_gpu.excluding('cusolver'))
    assert any([isinstance(node.op, GpuMagmaCholesky)
                for node in fn.maker.fgraph.toposort()])
def test_gpu_cholesky_inplace(self):
    # The inplace op should overwrite the shared input with its factor.
    # (Indentation restored; it was lost in the pasted source.)
    A = self.rand_symmetric(1000)
    A_gpu = gpuarray_shared_constructor(A)
    A_copy = A_gpu.get_value()
    fn = theano.function([], GpuMagmaCholesky(inplace=True)(A_gpu),
                         mode=mode_with_gpu, accept_inplace=True)
    fn()
    L = A_gpu.get_value()
    utt.assert_allclose(np.dot(L, L.T), A_copy, atol=1e-3)
def test_gpu_cholesky_inplace_opt(self):
    # The inplace-substitution optimizer should rewrite the op to inplace.
    # (Indentation restored; it was lost in the pasted source.)
    A = theano.tensor.fmatrix("A")
    fn = theano.function([A], GpuMagmaCholesky()(A), mode=mode_with_gpu)
    assert any([
        node.op.inplace
        for node in fn.maker.fgraph.toposort() if
        isinstance(node.op, GpuMagmaCholesky)
    ])
def run_gpu_qr(self, A_val, complete=True):
    # Run gpu_qr on A_val; returns (Q, R) if complete else R only.
    # (Indentation restored; it was lost in the pasted source.)
    A = theano.tensor.fmatrix("A")
    fn = theano.function([A], gpu_qr(A, complete=complete),
                         mode=mode_with_gpu)
    return fn(A_val)
def check_gpu_qr(self, M, N, complete=True, rtol=None, atol=None):
    # Compare the GPU QR against numpy's reduced-mode QR.
    # (Indentation restored; it was lost in the pasted source.)
    A = rand(M, N).astype('float32')
    if complete:
        Q_gpu, R_gpu = self.run_gpu_qr(A, complete=complete)
    else:
        R_gpu = self.run_gpu_qr(A, complete=complete)
    Q_np, R_np = np.linalg.qr(A, mode='reduced')
    utt.assert_allclose(R_np, R_gpu, rtol=rtol, atol=atol)
    if complete:
        utt.assert_allclose(Q_np, Q_gpu, rtol=rtol, atol=atol)
def test_gpu_qr(self):
    # Tall and wide matrices, with and without the Q factor.
    # (Indentation restored; it was lost in the pasted source.)
    self.check_gpu_qr(1000, 500, atol=1e-3)
    self.check_gpu_qr(1000, 500, complete=False, atol=1e-3)
    self.check_gpu_qr(500, 1000, atol=1e-3)
    self.check_gpu_qr(500, 1000, complete=False, atol=1e-3)
def test_gpu_qr_opt(self):
    # nlinalg.qr (full mode) should be lifted to GpuMagmaQR(complete=True).
    # (Indentation restored; it was lost in the pasted source.)
    A = theano.tensor.fmatrix("A")
    fn = theano.function([A], qr(A), mode=mode_with_gpu)
    assert any([
        isinstance(node.op, GpuMagmaQR) and node.op.complete
        for node in fn.maker.fgraph.toposort()
    ])
def test_gpu_qr_incomplete_opt(self):
    # qr(..., mode='r') should be lifted to GpuMagmaQR(complete=False).
    # (Indentation restored; it was lost in the pasted source.)
    A = theano.tensor.fmatrix("A")
    fn = theano.function([A], qr(A, mode='r'), mode=mode_with_gpu)
    assert any([
        isinstance(node.op, GpuMagmaQR) and not node.op.complete
        for node in fn.maker.fgraph.toposort()
    ])
def run_gpu_eigh(self, A_val, UPLO='L', compute_v=True):
    # Run GpuMagmaEigh on A_val; returns (d, v) or d only.
    # (Indentation restored; it was lost in the pasted source.)
    A = theano.tensor.fmatrix("A")
    fn = theano.function([A], GpuMagmaEigh(UPLO=UPLO, compute_v=compute_v)(A),
                         mode=mode_with_gpu)
    return fn(A_val)
def check_gpu_eigh(self, N, UPLO='L', compute_v=True, rtol=None, atol=None):
    # Compare GPU eigh with numpy: eigenvalues, orthonormality of the
    # eigenvectors, and reconstruction A = V diag(d) V.T.
    # (Indentation restored; it was lost in the pasted source.)
    A = rand(N, N).astype('float32')
    A = np.dot(A.T, A)  # make A symmetric (and PSD)
    d_np, v_np = np.linalg.eigh(A, UPLO=UPLO)
    if compute_v:
        d_gpu, v_gpu = self.run_gpu_eigh(A, UPLO=UPLO, compute_v=compute_v)
    else:
        d_gpu = self.run_gpu_eigh(A, UPLO=UPLO, compute_v=False)
    utt.assert_allclose(d_np, d_gpu, rtol=rtol, atol=atol)
    if compute_v:
        utt.assert_allclose(
            np.eye(N), np.dot(v_gpu, v_gpu.T), rtol=rtol, atol=atol)
        D_m = np.zeros_like(A)
        np.fill_diagonal(D_m, d_gpu)
        utt.assert_allclose(
            A, np.dot(np.dot(v_gpu, D_m), v_gpu.T), rtol=rtol, atol=atol)
def test_gpu_eigh(self):
    # Both triangles, with and without eigenvectors.
    # (Indentation restored; it was lost in the pasted source.)
    self.check_gpu_eigh(1000, UPLO='L', atol=1e-3)
    self.check_gpu_eigh(1000, UPLO='U', atol=1e-3)
    self.check_gpu_eigh(1000, UPLO='L', compute_v=False, atol=1e-3)
    self.check_gpu_eigh(1000, UPLO='U', compute_v=False, atol=1e-3)
def test_gpu_eigh_opt(self):
    # nlinalg.eigh should be lifted to GpuMagmaEigh.
    # (Indentation restored; it was lost in the pasted source.)
    A = theano.tensor.fmatrix("A")
    fn = theano.function([A], eigh(A), mode=mode_with_gpu)
    assert any([
        isinstance(node.op, GpuMagmaEigh)
        for node in fn.maker.fgraph.toposort()
    ])
...@@ -539,7 +539,7 @@ class QRIncomplete(Op): ...@@ -539,7 +539,7 @@ class QRIncomplete(Op):
Incomplete QR Decomposition. Incomplete QR Decomposition.
Computes the QR decomposition of a matrix. Computes the QR decomposition of a matrix.
Factor the matrix a as qr and return a single matrix. Factor the matrix a as qr and return a single matrix R.
""" """
...@@ -552,15 +552,14 @@ class QRIncomplete(Op): ...@@ -552,15 +552,14 @@ class QRIncomplete(Op):
def make_node(self, x): def make_node(self, x):
x = as_tensor_variable(x) x = as_tensor_variable(x)
assert x.ndim == 2, "The input of qr function should be a matrix." assert x.ndim == 2, "The input of qr function should be a matrix."
q = theano.tensor.matrix(dtype=x.dtype) r = theano.tensor.matrix(dtype=x.dtype)
return Apply(self, [x], [q]) return Apply(self, [x], [r])
def perform(self, node, inputs, outputs): def perform(self, node, inputs, outputs):
(x,) = inputs (x,) = inputs
(q,) = outputs (r,) = outputs
assert x.ndim == 2, "The input of qr function should be a matrix." assert x.ndim == 2, "The input of qr function should be a matrix."
q[0] = self._numop(x, r[0] = self._numop(x, self.mode)
self.mode)
def qr(a, mode="reduced"): def qr(a, mode="reduced"):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论