Commit 1b46af51, authored by notoraptor

Wrap op params for theano.gpuarray.linalg.GpuMagmaSVD.

Parent commit: 1d6b79f6
......@@ -9,7 +9,8 @@ from numpy.linalg.linalg import LinAlgError
import theano
from theano import Op, config, tensor
from theano.gof import COp
from theano.scalar import bool as bool_t
from theano.gof import COp, ParamsType
from theano.gpuarray import GpuArrayType
from .basic_ops import as_gpuarray_variable, gpu_contiguous, infer_context_name
......@@ -350,9 +351,19 @@ def gpu_cholesky(A, lower=True):
class GpuMagmaSVD(COp):
"""Computes the svd of a matrix :math:`A` using magma library.
.. warning::
Because of implementation constraints, this Op returns outputs
in order ``S, U, VT``. Use :func:`theano.gpuarray.linalg.gpu_svd`
to get them in expected order ``U, S, VT``.
"""
__props__ = ('full_matrices', 'compute_uv')
params_type = gpu_context_type
_cop_num_inputs = 1
_cop_num_outputs = 3
check_input = False
params_type = ParamsType(full_matrices=bool_t, context=gpu_context_type)
def __init__(self, full_matrices=True, compute_uv=True):
self.full_matrices = full_matrices
......@@ -385,25 +396,19 @@ class GpuMagmaSVD(COp):
assert A.dtype == 'float32'
if self.compute_uv:
return theano.Apply(self, [A],
[A.type(),
GpuArrayType(A.dtype, broadcastable=[False],
context_name=ctx_name)(),
A.type()])
# return S, U, VT
[GpuArrayType(A.dtype, broadcastable=[False],
context_name=ctx_name)(),
A.type(),
A.type()])
else:
return theano.Apply(self, [A],
# return only S
[GpuArrayType(A.dtype, broadcastable=[False],
context_name=ctx_name)()])
def get_params(self, node):
    """Build the params object passed to the C implementation.

    Bundles the op-level ``full_matrices`` flag together with the GPU
    context of the node's input into the wrapped params structure
    declared by ``params_type`` on the class
    (``ParamsType(full_matrices=bool_t, context=gpu_context_type)``).

    Parameters
    ----------
    node : theano.Apply
        The apply node; the GPU context is read from its first input's
        type, since the context is a property of the data, not of the op.

    Returns
    -------
    object
        The params instance consumed as ``PARAMS_TYPE`` by the C code.
    """
    # NOTE: compute_uv is intentionally NOT part of the params; the C
    # code infers it from whether the U output pointer is NULL.
    # This replaces the pre-commit pair get_params/get_op_params, which
    # passed the raw context and C #define flags respectively; the diff
    # artifact that showed both versions is resolved to the new one here.
    return self.params_type.get_params(self, context=node.inputs[0].type.context)
def infer_shape(self, node, shapes):
x_shape, = shapes
......@@ -413,7 +418,7 @@ class GpuMagmaSVD(COp):
if self.compute_uv:
u_shape = (M, M) if self.full_matrices else (M, K)
vt_shape = (N, N) if self.full_matrices else (K, N)
return [u_shape, s_shape, vt_shape]
return [s_shape, u_shape, vt_shape]
else:
return [s_shape]
......@@ -438,7 +443,11 @@ def gpu_svd(a, full_matrices=1, compute_uv=1):
U, V, D : matrices
"""
return GpuMagmaSVD(full_matrices, compute_uv)(a)
out = GpuMagmaSVD(full_matrices, compute_uv)(a)
if compute_uv:
S, U, VT = out
out = [U, S, VT]
return out
class GpuMagmaMatrixInverse(COp):
......
......@@ -5,14 +5,11 @@ setup_ext_cuda();
#section support_code_struct
int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
#ifdef COMPUTE_UV
PyGpuArrayObject **U,
#endif
PyGpuArrayObject **S,
#ifdef COMPUTE_UV
PyGpuArrayObject **VT,
#endif
PyGpuContextObject *c) {
PyGpuArrayObject **U, // may be NULL
PyGpuArrayObject **VT, // may be NULL
PARAMS_TYPE* params) {
bool compute_uv = (U != NULL);
magma_int_t *iwork = NULL, iunused[1];
magma_int_t M, N, K, ldu, ldv, M_U, N_VT, info;
magma_vec_t jobz;
......@@ -29,7 +26,7 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
}
// This is early to match the exit() in the fail label.
cuda_enter(c->ctx);
cuda_enter(params->context->ctx);
magma_init();
if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
......@@ -63,32 +60,32 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
goto fail;
}
#ifdef COMPUTE_UV
#ifdef FULL_MATRICES
jobz = MagmaAllVec;
#else
jobz = MagmaSomeVec;
#endif
M_U = (jobz == MagmaAllVec ? M : K);
N_VT = (jobz == MagmaAllVec ? N : K);
ldu = M;
ldv = N_VT;
if (MAGMA_SUCCESS != magma_smalloc_pinned(&u_data, M_U * M)) {
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
if (compute_uv) {
if (params->full_matrices) {
jobz = MagmaAllVec;
} else {
jobz = MagmaSomeVec;
}
M_U = (jobz == MagmaAllVec ? M : K);
N_VT = (jobz == MagmaAllVec ? N : K);
ldu = M;
ldv = N_VT;
if (MAGMA_SUCCESS != magma_smalloc_pinned(&u_data, M_U * M)) {
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
}
if (MAGMA_SUCCESS != magma_smalloc_pinned(&vt_data, N * N_VT)) {
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
}
} else {
jobz = MagmaNoVec;
ldu = M;
ldv = N;
}
if (MAGMA_SUCCESS != magma_smalloc_pinned(&vt_data, N * N_VT)) {
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
}
#else
jobz = MagmaNoVec;
ldu = M;
ldv = N;
#endif
// query for workspace size
magma_sgesdd(jobz, M, N, NULL, M, NULL, NULL, ldu, NULL, ldv,
......@@ -124,7 +121,7 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
}
s_dims[0] = K;
if (theano_prep_output(S, 1, s_dims, A->ga.typecode, GA_C_ORDER, c) != 0){
if (theano_prep_output(S, 1, s_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
......@@ -132,29 +129,29 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
cudaMemcpy(PyGpuArray_DEV_DATA(*S), s_data, K * sizeof(float),
cudaMemcpyDeviceToDevice);
#ifdef COMPUTE_UV
u_dims[0] = N; u_dims[1] = N_VT;
if (theano_prep_output(U, 2, u_dims, A->ga.typecode, GA_C_ORDER, c) != 0){
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
}
// magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U
// to match numpy.linalg.svd output
cudaMemcpy(PyGpuArray_DEV_DATA(*U), vt_data, N * N_VT * sizeof(float),
cudaMemcpyDeviceToDevice);
vt_dims[0] = M_U; vt_dims[1] = M;
if (theano_prep_output(VT, 2, vt_dims, A->ga.typecode, GA_C_ORDER, c) != 0){
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
if (compute_uv) {
u_dims[0] = N; u_dims[1] = N_VT;
if (theano_prep_output(U, 2, u_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
}
// magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U
// to match numpy.linalg.svd output
cudaMemcpy(PyGpuArray_DEV_DATA(*U), vt_data, N * N_VT * sizeof(float),
cudaMemcpyDeviceToDevice);
vt_dims[0] = M_U; vt_dims[1] = M;
if (theano_prep_output(VT, 2, vt_dims, A->ga.typecode, GA_C_ORDER, params->context) != 0){
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
}
// magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U
// to match numpy.linalg.svd output
cudaMemcpy(PyGpuArray_DEV_DATA(*VT), u_data, M_U * M * sizeof(float),
cudaMemcpyDeviceToDevice);
}
// magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U
// to match numpy.linalg.svd output
cudaMemcpy(PyGpuArray_DEV_DATA(*VT), u_data, M_U * M * sizeof(float),
cudaMemcpyDeviceToDevice);
#endif
res = 0;
fail:
if (a_data != NULL)
......@@ -170,6 +167,6 @@ fail:
if (iwork != NULL)
magma_free_cpu(iwork);
magma_finalize();
cuda_exit(c->ctx);
cuda_exit(params->context->ctx);
return res;
}
......@@ -73,7 +73,7 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
from .reduction import GpuMaxAndArgmax
from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky,
cusolver_available, GpuMagmaMatrixInverse, GpuMagmaSVD)
cusolver_available, GpuMagmaMatrixInverse, gpu_svd)
_logger = logging.getLogger("theano.gpuarray.opt")
......@@ -2149,11 +2149,16 @@ def local_gpu_svd(op, context_name, inputs, outputs):
return
if inputs[0].dtype not in ['float16', 'float32']:
return
op = GpuMagmaSVD(full_matrices=op.full_matrices,
compute_uv=op.compute_uv)
x = inputs[0]
if inputs[0].dtype == 'float16':
return op(inputs[0].astype('float32')).astype('float16')
return op
x = inputs[0].astype('float32')
out = gpu_svd(x, compute_uv=op.compute_uv, full_matrices=op.full_matrices)
if inputs[0].dtype == 'float16':
if op.compute_uv:
out = [o.astype('float16') for o in out]
else:
out = [out.astype('float16')]
return out
# Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled.
......
Markdown formatting is supported
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment