提交 44cb1473 authored 作者: Alexander Matyasko's avatar Alexander Matyasko

Fix U and VT for theano interface

MAGMA expects column-major matrices. Exchange u_data -> VT and vt_data -> U to match the numpy SVD interface.
上级 84da7d76
......@@ -46,7 +46,7 @@ int APPLY_SPECIFIC(magma_inv)(PyGpuArrayObject *A, PyGpuArrayObject **_A_inv,
#else
A_inv = theano_try_copy(A_inv, A);
if (A_inv == NULL) {
PyEr_SetString(
PyErr_SetString(
PyExc_RuntimeError,
"GpuMagmaMatrixInverse: failed to allocate memory for the output");
goto fail;
......
......@@ -37,9 +37,10 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A, PyGpuArrayObject **U,
}
// magma matrix svd
M = PyGpuArray_DIM(A, 0);
N = PyGpuArray_DIM(A, 1);
K = M < N ? M : N;
// reverse dimensions because MAGMA expects column-major matrices:
M = PyGpuArray_DIM(A, 1);
N = PyGpuArray_DIM(A, 0);
K = std::min(M, N);
if (MAGMA_SUCCESS != magma_smalloc_pinned(&a_data, M * N)) {
PyErr_SetString(PyExc_RuntimeError,
......@@ -69,12 +70,12 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A, PyGpuArrayObject **U,
ldu = M;
ldv = N_VT;
if (MAGMA_SUCCESS != magma_smalloc_pinned(&u_data, M * M_U)) {
if (MAGMA_SUCCESS != magma_smalloc_pinned(&u_data, M_U * M)) {
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
}
if (MAGMA_SUCCESS != magma_smalloc_pinned(&vt_data, ldv * N)) {
if (MAGMA_SUCCESS != magma_smalloc_pinned(&vt_data, N * N_VT)) {
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
......@@ -123,27 +124,27 @@ int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A, PyGpuArrayObject **U,
cudaMemcpy(PyGpuArray_DEV_DATA(*S), s_data, K * sizeof(float),
cudaMemcpyDeviceToDevice);
u_dims[0] = M; u_dims[1] = ldu;
// choose fortran order to avoid transpose
if (theano_prep_output(U, 2, u_dims, A->ga.typecode, GA_F_ORDER, c) != 0){
u_dims[0] = N; u_dims[1] = N_VT;
if (theano_prep_output(U, 2, u_dims, A->ga.typecode, GA_C_ORDER, c) != 0){
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
}
cudaMemcpy(PyGpuArray_DEV_DATA(*U), u_data, M * ldu * sizeof(float),
// magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U
// to match numpy.linalg.svd output
cudaMemcpy(PyGpuArray_DEV_DATA(*U), vt_data, N * N_VT * sizeof(float),
cudaMemcpyDeviceToDevice);
/* GpuArray_transpose_inplace(&(*U)->ga, NULL); */
vt_dims[0] = ldv; vt_dims[1] = N;
// choose fortran order to avoid transpose
if (theano_prep_output(VT, 2, vt_dims, A->ga.typecode, GA_F_ORDER, c) != 0){
vt_dims[0] = M_U; vt_dims[1] = M;
if (theano_prep_output(VT, 2, vt_dims, A->ga.typecode, GA_C_ORDER, c) != 0){
PyErr_SetString(PyExc_RuntimeError,
"GpuMagmaSVD: failed to allocate memory");
goto fail;
}
cudaMemcpy(PyGpuArray_DEV_DATA(*VT), vt_data, ldv * N * sizeof(float),
// magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U
// to match numpy.linalg.svd output
cudaMemcpy(PyGpuArray_DEV_DATA(*VT), u_data, M_U * M * sizeof(float),
cudaMemcpyDeviceToDevice);
/* GpuArray_transpose_inplace(&(*VT)->ga, NULL); */
res = 0;
fail:
......
......@@ -6,6 +6,7 @@ import theano
from theano.tests import unittest_tools as utt
from .config import mode_with_gpu, mode_without_gpu
from .test_basic_ops import rand
from numpy.linalg.linalg import LinAlgError
......@@ -207,24 +208,63 @@ class TestMagma(unittest.TestCase):
A_val_inv = fn(A_val)
utt.assert_allclose(np.dot(A_val_inv, A_val), np.eye(N), atol=1e-3)
def run_gpu_svd(self, A_val, full_matrices=True, compute_uv=True):
    """Compile gpu_svd(A) in the magma-enabled GPU mode and run it on A_val.

    Parameters
    ----------
    A_val : ndarray of float32
        Input matrix passed to the compiled function.
    full_matrices : bool
        Forwarded to gpu_svd; controls full vs. reduced factor shapes.
    compute_uv : bool
        Forwarded to gpu_svd; when False only singular values are computed.

    Returns
    -------
    The outputs of gpu_svd: (U, S, VT) when compute_uv is True, otherwise
    just the singular values.
    """
    A = theano.tensor.fmatrix("A")
    f = theano.function(
        [A], gpu_svd(A, full_matrices=full_matrices, compute_uv=compute_uv),
        mode=mode_with_gpu.including('magma'))
    return f(A_val)
def assert_column_orthonormal(self, Ot):
    """Assert the columns of Ot are orthonormal, i.e. Ot.T @ Ot == I."""
    utt.assert_allclose(np.dot(Ot.T, Ot), np.eye(Ot.shape[1]))
def check_svd(self, A, U, S, VT, rtol=None, atol=None):
    """Assert that U @ diag(S) @ VT reconstructs A within tolerance."""
    # Embed the singular values on the diagonal of a matrix shaped like A.
    sigma = np.zeros_like(A)
    np.fill_diagonal(sigma, S)
    reconstruction = np.dot(U, sigma).dot(VT)
    utt.assert_allclose(reconstruction, A, rtol=rtol, atol=atol)
def test_gpu_svd_wide(self):
    """GPU SVD of a 100x50 matrix: full and reduced factorizations."""
    A = rand(100, 50)
    rows, cols = A.shape
    k = min(rows, cols)
    # Full factorization: factors must be orthonormal and reconstruct A.
    U, S, VT = self.run_gpu_svd(A)
    self.assert_column_orthonormal(U)
    self.assert_column_orthonormal(VT.T)
    self.check_svd(A, U, S, VT)
    # Reduced factorization: factors are truncated to rank k = min(M, N).
    U, S, VT = self.run_gpu_svd(A, full_matrices=False)
    self.assertEqual(U.shape[1], k)
    self.assert_column_orthonormal(U)
    self.assertEqual(VT.shape[0], k)
    self.assert_column_orthonormal(VT.T)
def test_gpu_svd_tall(self):
    """GPU SVD of a 50x100 matrix: full and reduced factorizations."""
    A = rand(50, 100)
    rows, cols = A.shape
    k = min(rows, cols)
    # Full factorization: factors must be orthonormal and reconstruct A.
    U, S, VT = self.run_gpu_svd(A)
    self.assert_column_orthonormal(U)
    self.assert_column_orthonormal(VT.T)
    self.check_svd(A, U, S, VT)
    # Reduced factorization: factors are truncated to rank k = min(M, N).
    U, S, VT = self.run_gpu_svd(A, full_matrices=False)
    self.assertEqual(U.shape[1], k)
    self.assert_column_orthonormal(U)
    self.assertEqual(VT.shape[0], k)
    self.assert_column_orthonormal(VT.T)
def test_gpu_singular_values(self):
    """Singular values from gpu_svd must agree with CPU nlinalg.svd."""
    A = theano.tensor.fmatrix("A")
    f_cpu = theano.function(
        [A], theano.tensor.nlinalg.svd(A, compute_uv=False),
        mode=mode_without_gpu)
    f_gpu = theano.function(
        [A], gpu_svd(A, compute_uv=False),
        mode=mode_with_gpu.including('magma'))
    # Check both wide and tall inputs.
    for shape in ((50, 100), (100, 50)):
        A_val = rand(*shape)
        utt.assert_allclose(f_cpu(A_val)[1], f_gpu(A_val)[1])
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论