Update magma qr to use params type

e05a3ea2 · Alexander Matyasko · b728a250 · e05a3ea2 · e05a3ea2 · e05a3ea2
--- a/theano/gpuarray/linalg.py
+++ b/theano/gpuarray/linalg.py
@@ -569,8 +569,17 @@ class GpuMagmaQR(GpuMagmaBase, CGpuKernelBase):
    ----------
    complete : If `False`, returns only r.
+    .. warning::
+        Because of implementation constraints, this Op returns outputs
+        in order ``R, Q``. Use :func:`theano.gpuarray.linalg.gpu_qr`
+        to get them in expected order ``Q, R``.
    """
    __props__ = ('complete', )
+    _cop_num_inputs = 1
+    _cop_num_outputs = 2
+    check_input = False
+    params_type = ParamsType(complete=bool_t, context=gpu_context_type)
    def __init__(self, complete=True):
        self.complete = complete
@@ -585,15 +594,37 @@ class GpuMagmaQR(GpuMagmaBase, CGpuKernelBase):
        if A.dtype != 'float32':
            raise TypeError("only `float32` is supported for now")
        if self.complete:
-            return theano.Apply(self, [A], [A.type(), A.type()])
+            return theano.Apply(self, [A],
+                                # return R, Q
+                                [A.type(), A.type()])
        else:
-            return theano.Apply(self, [A], [A.type()])
+            return theano.Apply(self, [A],
+                                # return R
+                                [A.type()])
-    def get_op_params(self):
+    def get_params(self, node):
-        params = []
+        return self.params_type.get_params(self, context=node.inputs[0].type.context)
-        if self.complete:
-            params.append(('COMPLETE', '1'))
-        return params
+def gpu_qr(a, complete=True):
+    """
+    Compute the QR decomposition of a matrix on the GPU.
+    Parameters
+    ----------
+    complete : bool, optional
+        If `False`, returns only r.
+    Returns
+    -------
+    Q, R : matrices (only R when `complete` is `False`)
+    """
+    out = GpuMagmaQR(complete)(a)
+    if complete:
+        R, Q = out
+        out = [Q, R]
+    return out
 class GpuMagmaEigh(GpuMagmaBase):

--- a/theano/gpuarray/magma_qr.c
+++ b/theano/gpuarray/magma_qr.c
@@ -33,11 +33,9 @@ static PyGpuArrayObject *pygpu_narrow(PyGpuArrayObject *src, size_t dim,
 #section support_code_struct
 int APPLY_SPECIFIC(magma_qr)(PyGpuArrayObject *A_,
-#ifdef COMPLETE
-                             PyGpuArrayObject **Q,
-#endif
                             PyGpuArrayObject **R,
-                             PyGpuContextObject *c) {
+                             PyGpuArrayObject **Q, // may be NULL
+                             PARAMS_TYPE* params) {
  PyGpuArrayObject *A = NULL;
  magma_int_t M, N, K, nb, ldwork;
  size_t n2;
@@ -56,20 +54,27 @@ int APPLY_SPECIFIC(magma_qr)(PyGpuArrayObject *A_,
                    "GpuMagmaQR: requires data to be C-contiguous");
    return -1;
  }
+  // Enter the context early so every `goto fail` is matched by the cuda_exit() at the fail label.
+  cuda_enter(params->context->ctx);
+  if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
+    PyErr_SetString(PyExc_ValueError,
+                    "GpuMagmaQR: requires data to be C-contiguous");
+    goto fail;
+  }
  if (PyGpuArray_NDIM(A) != 2) {
    PyErr_SetString(PyExc_ValueError, "GpuMagmaQR: matrix rank error");
-    return -1;
+    goto fail;
  }
  A = pygpu_copy(A_, GA_F_ORDER);
  if (A == NULL) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaQR: failed to change to column-major order");
-    return -1;
+    goto fail;
  }
-  // This is early to match the exit() in the fail label.
-  cuda_enter(c->ctx);
  // magma matrix qr
  M = PyGpuArray_DIM(A, 0);
  N = PyGpuArray_DIM(A, 1);
@@ -83,7 +88,7 @@ int APPLY_SPECIFIC(magma_qr)(PyGpuArrayObject *A_,
  nb = magma_get_sgeqrf_nb(M, N);
  ldwork = (2 * K + magma_roundup(N, 32)) * nb;
-  work_data = gpudata_alloc(c->ctx, ldwork * sizeof(float), NULL, 0, NULL);
+  work_data = gpudata_alloc(params->context->ctx, ldwork * sizeof(float), NULL, 0, NULL);
  if (work_data == NULL) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaQR: failed to allocate working memory");
@@ -111,7 +116,7 @@ int APPLY_SPECIFIC(magma_qr)(PyGpuArrayObject *A_,
    goto fail;
  }
-#ifdef COMPLETE
+  if (params->complete) {
    // compute Q
    Py_XDECREF(A);
    A = pygpu_copy(A_, GA_F_ORDER);
@@ -142,7 +147,7 @@ int APPLY_SPECIFIC(magma_qr)(PyGpuArrayObject *A_,
      PyErr_SetString(PyExc_RuntimeError, "GpuMagmaQR: failed to narrow array");
      goto fail;
    }
-#endif
+  }
  res = 0;
 fail:
  if (tau_data != NULL)
@@ -150,6 +155,6 @@ fail:
  if (work_data != NULL)
    gpudata_release(work_data);
  Py_XDECREF(A);
-  cuda_exit(c->ctx);
+  cuda_exit(params->context->ctx);
  return res;
 }
--- a/theano/gpuarray/opt.py
+++ b/theano/gpuarray/opt.py
@@ -75,7 +75,7 @@ from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
 from .reduction import GpuMaxAndArgmax
 from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky,
                     cusolver_available, GpuMagmaMatrixInverse, gpu_svd,
-                     GpuMagmaCholesky, GpuMagmaQR, GpuMagmaEigh)
+                     GpuMagmaCholesky, gpu_qr, GpuMagmaEigh)
 _logger = logging.getLogger("theano.gpuarray.opt")
@@ -2181,11 +2181,13 @@ def local_gpu_magma_qr(op, context_name, inputs, outputs):
        return
    if inputs[0].dtype not in ['float16', 'float32']:
        return
-    op = GpuMagmaQR(complete=True)
+    x = inputs[0]
    if inputs[0].dtype == 'float16':
-        outputs = op(inputs[0].astype('float32'))
+        x = inputs[0].astype('float32')
-        return [o.astype('float16') for o in outputs]
+    out = gpu_qr(x, complete=True)
-    return op
+    if inputs[0].dtype == 'float16':
+        return [o.astype('float16') for o in out]
+    return out
 @register_opt('magma', 'fast_compile')
@@ -2196,10 +2198,13 @@ def local_gpu_magma_qr_incomplete(op, context_name, inputs, outputs):
        return
    if inputs[0].dtype not in ['float16', 'float32']:
        return
-    op = GpuMagmaQR(complete=False)
+    x = inputs[0]
    if inputs[0].dtype == 'float16':
-        return op(inputs[0].astype('float32')).astype('float16')
+        x = inputs[0].astype('float32')
-    return op
+    out = gpu_qr(x, complete=False)
+    if inputs[0].dtype == 'float16':
+        return [out.astype('float16')]
+    return out
 # Matrix inverse

--- a/theano/gpuarray/tests/test_linalg.py
+++ b/theano/gpuarray/tests/test_linalg.py
@@ -11,7 +11,7 @@ from theano.gpuarray.linalg import (GpuCholesky, GpuMagmaCholesky,
                                    GpuMagmaEigh, GpuMagmaMatrixInverse,
                                    GpuMagmaQR, GpuMagmaSVD,
                                    cusolver_available, gpu_matrix_inverse,
-                                    gpu_solve, gpu_svd)
+                                    gpu_solve, gpu_svd, gpu_qr)
 from theano.tensor.nlinalg import (SVD, MatrixInverse, QRFull,
                                   QRIncomplete, eigh, matrix_inverse, qr)
 from theano.tensor.slinalg import Cholesky, cholesky
@@ -376,7 +376,7 @@ class TestMagma(unittest.TestCase):
    def run_gpu_qr(self, A_val, complete=True):
        A = theano.tensor.fmatrix("A")
-        fn = theano.function([A], GpuMagmaQR(complete=complete)(A),
+        fn = theano.function([A], gpu_qr(A, complete=complete),
                             mode=mode_with_gpu)
        return fn(A_val)