Commit 519706fa authored by: Alexander Matyasko

Add magma svd operation

Parent 5ac884f6
......@@ -348,6 +348,72 @@ def gpu_cholesky(A, lower=True):
return GpuCholesky(lower)(A)
class GpuMagmaSVD(COp):
    """Computes the SVD of a matrix :math:`A` using the magma library.

    The op produces three outputs: ``U``, the vector of singular values
    ``S``, and ``VT``.  The C implementation lives in ``magma_svd.c``.
    """

    __props__ = ('full_matrices', 'compute_uv')
    params_type = gpu_context_type

    def __init__(self, full_matrices=True, compute_uv=True):
        self.full_matrices = full_matrices
        self.compute_uv = compute_uv
        COp.__init__(self, ['magma_svd.c'], 'APPLY_SPECIFIC(magma_svd)')

    def c_headers(self):
        # Headers required by the C implementation (pygpu + magma).
        return ['gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/ext_cuda.h',
                'gpuarray_helper.h', 'magma.h']

    def c_header_dirs(self):
        return [os.path.dirname(__file__), pygpu.get_include()]

    def c_libraries(self):
        return ['magma']

    def make_node(self, A):
        # Only 2-D inputs are supported.
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        # Outputs: U (matrix), S (1-D vector of singular values), VT (matrix).
        s_type = GpuArrayType(A.dtype, broadcastable=[False],
                              context_name=ctx_name)
        return theano.Apply(self, [A], [A.type(), s_type(), A.type()])

    def get_params(self, node):
        return node.inputs[0].type.context

    def get_op_params(self):
        # Exposed to the C code as the COMPUTE_UV / FULL_MATRICES defines.
        return [('COMPUTE_UV', int(self.compute_uv)),
                ('FULL_MATRICES', int(self.full_matrices))]
def gpu_svd(a, full_matrices=1, compute_uv=1):
    """
    This function performs the SVD on GPU.

    Parameters
    ----------
    a : matrix
        The matrix to decompose.
    full_matrices : bool, optional
        If True (default), u and v have the shapes (M, M) and (N, N),
        respectively.
        Otherwise, the shapes are (M, K) and (K, N), respectively,
        where K = min(M, N).
    compute_uv : bool, optional
        Whether or not to compute u and v in addition to s.
        True by default.

    Returns
    -------
    U, S, VT : matrices
        The left singular vectors, the singular values and the (transposed)
        right singular vectors, as produced by :class:`GpuMagmaSVD`.
    """
    return GpuMagmaSVD(full_matrices, compute_uv)(a)
class GpuMagmaMatrixInverse(COp):
"""Computes the inverse of a matrix :math:`A` using magma library.
"""
......
#section init_code

setup_ext_cuda();

#section support_code_struct

/* Compute the singular value decomposition of the single-precision matrix
 * `A` on the GPU using MAGMA's CPU-interface magma_sgesvd.
 *
 * Outputs (allocated here): *U, *S (vector of singular values), *VT.
 * Returns 0 on success, non-zero with a Python exception set on failure.
 *
 * COMPUTE_UV and FULL_MATRICES are compile-time defines supplied by
 * GpuMagmaSVD.get_op_params().
 *
 * NOTE(review): `A` is required to be C-contiguous (row-major) while
 * magma_sgesvd expects column-major data, and the U/VT outputs are created
 * in Fortran order to avoid a transpose -- confirm the intended layout
 * convention against the callers/tests.
 */
int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A, PyGpuArrayObject **U,
                              PyGpuArrayObject **S, PyGpuArrayObject **VT,
                              PyGpuContextObject *c) {
  magma_int_t M, N, K, ldu, ldv, M_U, N_VT, info;
  magma_vec_t jobu, jobv;
  size_t s_dims[1], u_dims[2], vt_dims[2];
  float *a_data = NULL, *s_data = NULL, *u_data = NULL, *vt_data = NULL,
        *work = NULL;
  float dummy[1];
  int res = -1, lwork;

  if (A->ga.typecode != GA_FLOAT) {
    /* Fixed: message used to say "GpuMagmaMatrixInverse" (copy-paste). */
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaSVD: Unsupported data type");
    return -1;
  }

  // This is early to match the exit() in the fail label.
  cuda_enter(c->ctx);
  magma_init();

  if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaSVD: requires data to be C-contiguous");
    /* Fixed: was `return 1`, which skipped magma_finalize()/cuda_exit(). */
    goto fail;
  }
  if (PyGpuArray_NDIM(A) != 2) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaSVD: matrix rank error");
    goto fail;
  }

  // magma matrix svd
  M = PyGpuArray_DIM(A, 0);
  N = PyGpuArray_DIM(A, 1);
  K = M < N ? M : N;

  /* magma_smalloc_pinned allocates page-locked *host* memory, so the
   * device<->staging copies below use DeviceToHost / HostToDevice
   * (previously DeviceToDevice, which mismatches the pointer kinds). */
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&a_data, M * N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  cudaMemcpy(a_data, PyGpuArray_DEV_DATA(A), M * N * sizeof(float),
             cudaMemcpyDeviceToHost);

  if (MAGMA_SUCCESS != magma_smalloc_pinned(&s_data, K)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }

  if (COMPUTE_UV) {
    if (FULL_MATRICES) {
      jobu = MagmaAllVec;
      jobv = MagmaAllVec;
    }
    else {
      jobu = MagmaSomeVec;
      jobv = MagmaSomeVec;
    }
    M_U  = (jobu == MagmaAllVec ? M : K);  // columns of U
    N_VT = (jobv == MagmaAllVec ? N : K);  // rows of VT
    ldu = M;
    ldv = N_VT;
    if (MAGMA_SUCCESS != magma_smalloc_pinned(&u_data, ldu * M_U)) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaSVD: failed to allocate memory");
      goto fail;
    }
    if (MAGMA_SUCCESS != magma_smalloc_pinned(&vt_data, ldv * N)) {
      PyErr_SetString(PyExc_RuntimeError,
                      "GpuMagmaSVD: failed to allocate memory");
      goto fail;
    }
  }
  else {
    jobu = MagmaNoVec;
    jobv = MagmaNoVec;
    /* Fixed: M_U/N_VT were left uninitialized on this path although they
     * are used below to size the (unfilled) U/VT outputs. */
    M_U  = M;
    N_VT = N;
    ldu = M;
    ldv = N;
  }

  // query for workspace size
  magma_sgesvd(jobu, jobv, M, N, NULL, M, NULL, NULL, ldu, NULL, ldv,
               dummy, -1, &info);
  lwork = (magma_int_t) MAGMA_S_REAL(dummy[0]);
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&work, lwork)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate working memory");
    goto fail;
  }

  // compute svd
  magma_sgesvd(jobu, jobv, M, N, a_data, M, s_data,
               u_data, ldu, vt_data, ldv, work, lwork, &info);
  if (info > 0) {
    /* Fixed: messages named magma_sgesvd_gpu, but magma_sgesvd is called. */
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaSVD: magma_sgesvd %d superdiagonals failed to converge",
        info);
    goto fail;
  } else if (info < 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaSVD: magma_sgesvd argument %d has an illegal value", -info);
    goto fail;
  }

  s_dims[0] = K;
  if (theano_prep_output(S, 1, s_dims, A->ga.typecode, GA_C_ORDER, c) != 0){
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  cudaMemcpy(PyGpuArray_DEV_DATA(*S), s_data, K * sizeof(float),
             cudaMemcpyHostToDevice);

  /* Fixed: u_dims[1] was `ldu` (== M), which mis-sized U as (M, M) and
   * over-read the M*K staging buffer when FULL_MATRICES is 0. */
  u_dims[0] = M; u_dims[1] = M_U;
  // choose fortran order to avoid transpose
  if (theano_prep_output(U, 2, u_dims, A->ga.typecode, GA_F_ORDER, c) != 0){
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  if (COMPUTE_UV) {
    /* Guarded: u_data is NULL when the vectors were not computed. */
    cudaMemcpy(PyGpuArray_DEV_DATA(*U), u_data, M * M_U * sizeof(float),
               cudaMemcpyHostToDevice);
  }

  vt_dims[0] = N_VT; vt_dims[1] = N;
  // choose fortran order to avoid transpose
  if (theano_prep_output(VT, 2, vt_dims, A->ga.typecode, GA_F_ORDER, c) != 0){
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  if (COMPUTE_UV) {
    cudaMemcpy(PyGpuArray_DEV_DATA(*VT), vt_data, N_VT * N * sizeof(float),
               cudaMemcpyHostToDevice);
  }

  res = 0;
fail:
  if (a_data != NULL)
    magma_free_pinned(a_data);
  if (s_data != NULL)
    magma_free_pinned(s_data);
  if (u_data != NULL)
    magma_free_pinned(u_data);
  if (vt_data != NULL)
    magma_free_pinned(vt_data);
  if (work != NULL)
    magma_free_pinned(work);
  magma_finalize();
  cuda_exit(c->ctx);
  return res;
}
......@@ -72,7 +72,7 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
from .reduction import GpuMaxAndArgmax
from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky,
cusolver_available, GpuMagmaMatrixInverse)
cusolver_available, GpuMagmaMatrixInverse, GpuMagmaSVD)
_logger = logging.getLogger("theano.gpuarray.opt")
......@@ -2013,6 +2013,15 @@ def local_gpu_matrix_inverse(op, context_name, inputs, outputs):
return GpuMagmaMatrixInverse()
@register_opt('magma', 'fast_compile')
@op_lifter([nlinalg.SVD])
@register_opt2([theano.tensor.nlinalg.SVD], 'magma', 'fast_compile')
def local_gpu_svd(op, context_name, inputs, outputs):
    # Lift a host nlinalg.SVD node to the magma-backed GPU op, preserving
    # the op's full_matrices/compute_uv configuration.
    # Fixed: removed a leftover `import pudb; pudb.set_trace()` debugger
    # breakpoint that would halt every compilation using this optimizer.
    return GpuMagmaSVD(full_matrices=op.full_matrices,
                       compute_uv=op.compute_uv)
# Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled.
optdb.register('gpua_scanOp_make_inplace',
......
......@@ -5,13 +5,13 @@ import numpy as np
import theano
from theano.tests import unittest_tools as utt
from .config import mode_with_gpu
from .config import mode_with_gpu, mode_without_gpu
from numpy.linalg.linalg import LinAlgError
# Skip tests if cuda_ndarray is not available.
from theano.gpuarray.linalg import (cusolver_available, gpu_solve, GpuCholesky,
gpu_matrix_inverse)
gpu_matrix_inverse, gpu_svd)
class TestCusolver(unittest.TestCase):
......@@ -201,8 +201,30 @@ class TestMagma(unittest.TestCase):
def test_gpu_matrix_inverse(self):
    """Check that the magma GPU inverse satisfies A^-1 @ A ~= I."""
    A = theano.tensor.fmatrix("A")
    # Fixed: the function was compiled twice (a dead duplicate assignment
    # to `fn` without the 'magma' optimizations preceded this one).
    fn = theano.function([A], gpu_matrix_inverse(A),
                         mode=mode_with_gpu.including('magma'))
    N = 1000
    A_val = np.random.rand(N, N).astype(np.float32)
    A_val_inv = fn(A_val)
    utt.assert_allclose(np.dot(A_val_inv, A_val), np.eye(N), atol=1e-3)
def test_gpu_svd(self):
    """Exercise gpu_svd in full, reduced and values-only modes."""
    A = theano.tensor.fmatrix("A")
    N, M = 50, 100
    A_val = np.random.rand(M, N).astype(np.float32)

    # full_matrices=True: U is (M, M) and VT is (N, N), both orthogonal.
    f = theano.function([A], gpu_svd(A),
                        mode=mode_with_gpu.including('magma'))
    U, S, VT = f(A_val)
    utt.assert_allclose(np.dot(U.T, U), np.eye(M))
    utt.assert_allclose(np.dot(VT.T, VT), np.eye(N))
    # Reconstruct A = U @ diag(S) @ VT.
    S_m = np.zeros_like(A_val)
    np.fill_diagonal(S_m, S)
    utt.assert_allclose(np.dot(np.dot(U, S_m), VT), A_val)

    # full_matrices=False: U is (M, K) and VT is (K, N), K = min(M, N).
    f = theano.function([A], gpu_svd(A, full_matrices=False),
                        mode=mode_with_gpu.including('magma'))
    U, _, VT = f(A_val)
    utt.assert_allclose(np.dot(U.T, U), np.eye(N))
    utt.assert_allclose(np.dot(VT.T, VT), np.eye(N))

    # compute_uv=False: compare singular values against the CPU op.
    f = theano.function([A],
                        theano.tensor.nlinalg.svd(A, compute_uv=False),
                        mode=mode_without_gpu)
    f2 = theano.function([A], gpu_svd(A, compute_uv=False),
                         mode=mode_with_gpu.including('magma'))
    # Fixed: the CPU graph has a single output (the s vector), so
    # `f(A_val)[1]` selected one scalar singular value instead of the
    # whole array; the GPU op always returns (U, S, VT), hence [1] there.
    utt.assert_allclose(f(A_val), f2(A_val)[1], 1)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论