提交 44cb1473 authored 作者: Alexander Matyasko's avatar Alexander Matyasko

Fix U and VT for theano interface

MAGMA expects column-major matrices. Exchange u_data -> VT and vt_data -> U to match the numpy SVD interface.
上级 84da7d76
......@@ -46,7 +46,7 @@ int APPLY_SPECIFIC(magma_inv)(PyGpuArrayObject *A, PyGpuArrayObject **_A_inv,
#else
A_inv = theano_try_copy(A_inv, A);
if (A_inv == NULL) {
PyEr_SetString(
PyErr_SetString(
PyExc_RuntimeError,
"GpuMagmaMatrixInverse: failed to allocate memory for the output");
goto fail;
......
......@@ -37,9 +37,10 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A, PyGpuArrayObject **U,
}
// magma matrix svd
M = PyGpuArray_DIM(A, 0);
N = PyGpuArray_DIM(A, 1);
K = M < N ? M : N;
// reverse dimensions because MAGMA expects column-major matrices:
M = PyGpuArray_DIM(A, 1);
N = PyGpuArray_DIM(A, 0);
K = std::min(M, N);
if (MAGMA_SUCCESS != magma_smalloc_pinned(&a_data, M * N)) {
PyErr_SetString(PyExc_RuntimeError,
......@@ -69,12 +70,12 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A, PyGpuArrayObject **U,
ldu = M;
ldv = N_VT;
if (MAGMA_SUCCESS != magma_smalloc_pinned(&u_data, M * M_U)) {
if (MAGMA_SUCCESS != magma_smalloc_pinned(&u_data, M_U * M)) {
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
}
if (MAGMA_SUCCESS != magma_smalloc_pinned(&vt_data, ldv * N)) {
if (MAGMA_SUCCESS != magma_smalloc_pinned(&vt_data, N * N_VT)) {
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
......@@ -123,27 +124,27 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A, PyGpuArrayObject **U,
cudaMemcpy(PyGpuArray_DEV_DATA(*S), s_data, K * sizeof(float),
cudaMemcpyDeviceToDevice);
u_dims[0] = M; u_dims[1] = ldu;
// choose fortran order to avoid transpose
if (theano_prep_output(U, 2, u_dims, A->ga.typecode, GA_F_ORDER, c) != 0){
u_dims[0] = N; u_dims[1] = N_VT;
if (theano_prep_output(U, 2, u_dims, A->ga.typecode, GA_C_ORDER, c) != 0){
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
}
cudaMemcpy(PyGpuArray_DEV_DATA(*U), u_data, M * ldu * sizeof(float),
// magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U
// to match numpy.linalg.svd output
cudaMemcpy(PyGpuArray_DEV_DATA(*U), vt_data, N * N_VT * sizeof(float),
cudaMemcpyDeviceToDevice);
/* GpuArray_transpose_inplace(&(*U)->ga, NULL); */
vt_dims[0] = ldv; vt_dims[1] = N;
// choose fortran order to avoid transpose
if (theano_prep_output(VT, 2, vt_dims, A->ga.typecode, GA_F_ORDER, c) != 0){
vt_dims[0] = M_U; vt_dims[1] = M;
if (theano_prep_output(VT, 2, vt_dims, A->ga.typecode, GA_C_ORDER, c) != 0){
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
}
cudaMemcpy(PyGpuArray_DEV_DATA(*VT), vt_data, ldv * N * sizeof(float),
// magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U
// to match numpy.linalg.svd output
cudaMemcpy(PyGpuArray_DEV_DATA(*VT), u_data, M_U * M * sizeof(float),
cudaMemcpyDeviceToDevice);
/* GpuArray_transpose_inplace(&(*VT)->ga, NULL); */
res = 0;
fail:
......
......@@ -6,6 +6,7 @@ import theano
from theano.tests import unittest_tools as utt
from .config import mode_with_gpu, mode_without_gpu
from .test_basic_ops import rand
from numpy.linalg.linalg import LinAlgError
......@@ -207,24 +208,63 @@ class TestMagma(unittest.TestCase):
A_val_inv = fn(A_val)
utt.assert_allclose(np.dot(A_val_inv, A_val), np.eye(N), atol=1e-3)
def run_gpu_svd(self, A_val, full_matrices=True, compute_uv=True):
    """Compile gpu_svd(A) in the magma-enabled GPU mode and run it on A_val.

    Parameters
    ----------
    A_val : ndarray of float32
        Input matrix passed to the compiled function.
    full_matrices : bool
        Forwarded to gpu_svd; controls full vs. reduced factor shapes.
    compute_uv : bool
        Forwarded to gpu_svd; when False only singular values are computed.

    Returns
    -------
    The outputs of gpu_svd: (U, S, VT) when compute_uv is True, otherwise
    just the singular values.
    """
    A = theano.tensor.fmatrix("A")
    f = theano.function(
        [A], gpu_svd(A, full_matrices=full_matrices, compute_uv=compute_uv),
        mode=mode_with_gpu.including('magma'))
    return f(A_val)
def assert_column_orthonormal(self, Ot):
    """Assert the columns of Ot are orthonormal, i.e. Ot.T @ Ot == I."""
    utt.assert_allclose(np.dot(Ot.T, Ot), np.eye(Ot.shape[1]))
def check_svd(self, A, U, S, VT, rtol=None, atol=None):
    """Assert that U @ diag(S) @ VT reconstructs A within tolerance."""
    # Embed the singular values on the diagonal of a matrix shaped like A.
    sigma = np.zeros_like(A)
    np.fill_diagonal(sigma, S)
    reconstruction = np.dot(U, sigma).dot(VT)
    utt.assert_allclose(reconstruction, A, rtol=rtol, atol=atol)
def test_gpu_svd_wide(self):
    """GPU SVD of a 100x50 matrix: full and reduced factorizations."""
    A = rand(100, 50)
    rows, cols = A.shape
    k = min(rows, cols)
    # Full factorization: factors must be orthonormal and reconstruct A.
    U, S, VT = self.run_gpu_svd(A)
    self.assert_column_orthonormal(U)
    self.assert_column_orthonormal(VT.T)
    self.check_svd(A, U, S, VT)
    # Reduced factorization: factors are truncated to rank k = min(M, N).
    U, S, VT = self.run_gpu_svd(A, full_matrices=False)
    self.assertEqual(U.shape[1], k)
    self.assert_column_orthonormal(U)
    self.assertEqual(VT.shape[0], k)
    self.assert_column_orthonormal(VT.T)
def test_gpu_svd_tall(self):
    """GPU SVD of a 50x100 matrix: full and reduced factorizations."""
    A = rand(50, 100)
    rows, cols = A.shape
    k = min(rows, cols)
    # Full factorization: factors must be orthonormal and reconstruct A.
    U, S, VT = self.run_gpu_svd(A)
    self.assert_column_orthonormal(U)
    self.assert_column_orthonormal(VT.T)
    self.check_svd(A, U, S, VT)
    # Reduced factorization: factors are truncated to rank k = min(M, N).
    U, S, VT = self.run_gpu_svd(A, full_matrices=False)
    self.assertEqual(U.shape[1], k)
    self.assert_column_orthonormal(U)
    self.assertEqual(VT.shape[0], k)
    self.assert_column_orthonormal(VT.T)
def test_gpu_singular_values(self):
    """Singular values from gpu_svd must agree with CPU nlinalg.svd."""
    A = theano.tensor.fmatrix("A")
    f_cpu = theano.function(
        [A], theano.tensor.nlinalg.svd(A, compute_uv=False),
        mode=mode_without_gpu)
    f_gpu = theano.function(
        [A], gpu_svd(A, compute_uv=False),
        mode=mode_with_gpu.including('magma'))
    # Check both wide and tall inputs.
    for shape in ((50, 100), (100, 50)):
        A_val = rand(*shape)
        utt.assert_allclose(f_cpu(A_val)[1], f_gpu(A_val)[1])
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论