提交 44726397 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #5778 from aam-at/magma_gpu

Add magma gpu svd and matrix inverse
...@@ -679,6 +679,29 @@ import theano and print the config variable, as in: ...@@ -679,6 +679,29 @@ import theano and print the config variable, as in:
``'guess_once'``, ``'guess_on_shape_change'``, ``'time_once'``,
``'time_on_shape_change'``.
.. attribute:: config.magma.enabled
String value: ``'True'``, ``'False'``
Default: ``'False'``
If ``'True'``, use `magma <http://icl.cs.utk.edu/magma/>`_ for matrix
computations.
If ``'False'``, disable magma.
.. attribute:: config.magma.include_path
Default: ``''``
Location of the magma headers.
.. attribute:: config.magma.library_path
Default: ``''``
Location of the magma library.
.. attribute:: config.gcc.cxxflags .. attribute:: config.gcc.cxxflags
Default: ``""`` Default: ``""``
......
...@@ -361,6 +361,22 @@ AddConfigVar('dnn.enabled', ...@@ -361,6 +361,22 @@ AddConfigVar('dnn.enabled',
EnumStr("auto", "True", "False", "no_check"), EnumStr("auto", "True", "False", "no_check"),
in_c_key=False) in_c_key=False)
# Configuration flags for the optional magma library
# (http://icl.cs.utk.edu/magma/), which provides GPU implementations of
# LAPACK routines (SVD, matrix inverse).  in_c_key=False: these flags
# affect how the C code is compiled/linked, not the code itself, so they
# are kept out of the compilation cache key.
AddConfigVar('magma.include_path',
             "Location of the magma header",
             StrParam(''),
             in_c_key=False)

AddConfigVar('magma.library_path',
             "Location of the magma library",
             StrParam(''),
             in_c_key=False)

# Disabled by default: magma is an optional dependency and must be
# explicitly enabled by the user.
AddConfigVar('magma.enabled',
             " If True, use magma for matrix computation."
             " If False, disable magma",
             BoolParam(False),
             in_c_key=False)
# This flag determines whether or not to raise error/warning message if # This flag determines whether or not to raise error/warning message if
# there is a CPU Op in the computational graph. # there is a CPU Op in the computational graph.
AddConfigVar( AddConfigVar(
......
from __future__ import absolute_import, division, print_function from __future__ import absolute_import, division, print_function
import pkg_resources import os
import theano
import warnings import warnings
from theano import Op import pkg_resources
from theano.gpuarray import basic_ops, GpuArrayType
import numpy as np import numpy as np
from numpy.linalg.linalg import LinAlgError from numpy.linalg.linalg import LinAlgError
import theano
from theano import Op, config, tensor
from theano.gof import COp
from theano.gpuarray import GpuArrayType
from .basic_ops import as_gpuarray_variable, gpu_contiguous, infer_context_name
from .type import gpu_context_type
try: try:
import pygpu import pygpu
from pygpu.basic import triu, tril from pygpu.basic import triu, tril
...@@ -94,13 +99,13 @@ class GpuCusolverSolve(Op): ...@@ -94,13 +99,13 @@ class GpuCusolverSolve(Op):
'GpuCusolverSolve Op can not be constructed.') 'GpuCusolverSolve Op can not be constructed.')
if skcuda.__version__ <= '0.5.1': if skcuda.__version__ <= '0.5.1':
warnings.warn('The GpuSolve op requires scikit-cuda > 0.5.1 to work with CUDA 8') warnings.warn('The GpuSolve op requires scikit-cuda > 0.5.1 to work with CUDA 8')
context_name = basic_ops.infer_context_name(inp1, inp2) context_name = infer_context_name(inp1, inp2)
inp1 = basic_ops.as_gpuarray_variable(inp1, context_name) inp1 = as_gpuarray_variable(inp1, context_name)
inp2 = basic_ops.as_gpuarray_variable(inp2, context_name) inp2 = as_gpuarray_variable(inp2, context_name)
inp1 = basic_ops.gpu_contiguous(inp1) inp1 = gpu_contiguous(inp1)
inp2 = basic_ops.gpu_contiguous(inp2) inp2 = gpu_contiguous(inp2)
# this op can only operate on float32 matrices # this op can only operate on float32 matrices
assert inp1.ndim == 2 assert inp1.ndim == 2
...@@ -260,11 +265,11 @@ class GpuCholesky(Op): ...@@ -260,11 +265,11 @@ class GpuCholesky(Op):
if not pygpu_available: if not pygpu_available:
raise RuntimeError('Missing pygpu or triu/tril functions.' raise RuntimeError('Missing pygpu or triu/tril functions.'
'Install or update libgpuarray.') 'Install or update libgpuarray.')
context_name = basic_ops.infer_context_name(inp) context_name = infer_context_name(inp)
inp = basic_ops.as_gpuarray_variable(inp, context_name) inp = as_gpuarray_variable(inp, context_name)
inp = basic_ops.gpu_contiguous(inp) inp = gpu_contiguous(inp)
# this op can only operate on float32 matrices # this op can only operate on float32 matrices
# because of current implementation of triu/tril. # because of current implementation of triu/tril.
...@@ -341,3 +346,160 @@ class GpuCholesky(Op): ...@@ -341,3 +346,160 @@ class GpuCholesky(Op):
def gpu_cholesky(A, lower=True): def gpu_cholesky(A, lower=True):
return GpuCholesky(lower)(A) return GpuCholesky(lower)(A)
class GpuMagmaSVD(COp):
    """Computes the svd of a matrix :math:`A` using magma library.

    Outputs are ``(U, S, VT)`` when ``compute_uv`` is true, otherwise
    only the vector of singular values ``S``.  The actual computation is
    done by the C implementation in ``magma_svd.c``, specialized through
    the preprocessor defines produced by :meth:`get_op_params`.
    """
    __props__ = ('full_matrices', 'compute_uv')
    # The C code receives the GPU context of the input as its params.
    params_type = gpu_context_type

    def __init__(self, full_matrices=True, compute_uv=True):
        self.full_matrices = full_matrices
        self.compute_uv = compute_uv
        COp.__init__(self, ['magma_svd.c'], 'APPLY_SPECIFIC(magma_svd)')

    def c_headers(self):
        return ['gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/ext_cuda.h',
                'gpuarray_helper.h', 'magma.h']

    def c_header_dirs(self):
        # This file's directory (for gpuarray_helper.h), pygpu's headers,
        # and optionally the user-configured magma include directory.
        dirs = [os.path.dirname(__file__), pygpu.get_include()]
        if config.magma.include_path:
            dirs.append(config.magma.include_path)
        return dirs

    def c_libraries(self):
        return ['magma']

    def c_lib_dirs(self):
        if config.magma.library_path:
            return [config.magma.library_path]
        return []

    def make_node(self, A):
        ctx_name = infer_context_name(A)
        A = as_gpuarray_variable(A, ctx_name)
        if A.ndim != 2:
            raise LinAlgError("Matrix rank error")
        if self.compute_uv:
            # Outputs: U (matrix), S (1-d vector), VT (matrix).
            return theano.Apply(self, [A],
                                [A.type(),
                                 GpuArrayType(A.dtype, broadcastable=[False],
                                              context_name=ctx_name)(),
                                 A.type()])
        else:
            # Only the singular values are returned.
            return theano.Apply(self, [A],
                                [GpuArrayType(A.dtype, broadcastable=[False],
                                              context_name=ctx_name)()])

    def get_params(self, node):
        return node.inputs[0].type.context

    def get_op_params(self):
        # Translate the python-level flags into the C preprocessor
        # defines (COMPUTE_UV / FULL_MATRICES) used by magma_svd.c.
        params = []
        if self.compute_uv:
            params.append(('COMPUTE_UV', '1'))
        if self.full_matrices:
            params.append(('FULL_MATRICES', '1'))
        return params

    def infer_shape(self, node, shapes):
        # For an (M, N) input with K = min(M, N):
        #   U is (M, M) or (M, K), S is (K,), VT is (N, N) or (K, N).
        x_shape, = shapes
        M, N = x_shape
        K = tensor.minimum(M, N)
        s_shape = (K, )
        if self.compute_uv:
            u_shape = (M, M) if self.full_matrices else (M, K)
            vt_shape = (N, N) if self.full_matrices else (K, N)
            return [u_shape, s_shape, vt_shape]
        else:
            return [s_shape]
def gpu_svd(a, full_matrices=1, compute_uv=1):
    """
    This function performs the SVD on GPU.

    Parameters
    ----------
    a : matrix
        The matrix to decompose.
    full_matrices : bool, optional
        If True (default), u and v have the shapes (M, M) and (N, N),
        respectively.
        Otherwise, the shapes are (M, K) and (K, N), respectively,
        where K = min(M, N).
    compute_uv : bool, optional
        Whether or not to compute u and v in addition to s.
        True by default.

    Returns
    -------
    U, S, VT : matrices
        The factors of the decomposition, with ``a = U . diag(S) . VT``.
        When ``compute_uv`` is False, only the vector of singular values
        ``S`` is returned.
    """
    return GpuMagmaSVD(full_matrices, compute_uv)(a)
class GpuMagmaMatrixInverse(COp):
    """Computes the inverse of a matrix :math:`A` using magma library.

    The computation is implemented in C (``magma_inv.c``).  When
    ``inplace`` is true the op destroys its input (output 0 overwrites
    input 0).
    """
    __props__ = ('inplace', )
    # The C code receives the GPU context of the input as its params.
    params_type = gpu_context_type

    def __init__(self, inplace=False):
        COp.__init__(self, ['magma_inv.c'], 'APPLY_SPECIFIC(magma_inv)')
        self.inplace = inplace
        if self.inplace:
            # Tell the graph machinery that output 0 destroys input 0.
            self.destroy_map = {0: [0]}

    def c_headers(self):
        return ['gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/ext_cuda.h',
                'gpuarray_helper.h', 'magma.h']

    def c_header_dirs(self):
        # This file's directory (for gpuarray_helper.h), pygpu's headers,
        # and optionally the user-configured magma include directory.
        include_dirs = [os.path.dirname(__file__), pygpu.get_include()]
        magma_inc = config.magma.include_path
        if magma_inc:
            include_dirs.append(magma_inc)
        return include_dirs

    def c_libraries(self):
        return ['magma']

    def c_lib_dirs(self):
        magma_lib = config.magma.library_path
        return [magma_lib] if magma_lib else []

    def clone_inplace(self):
        # Build the destructive version of this op (used by the inplace
        # optimization).
        return self.__class__(inplace=True)

    def make_node(self, x):
        context = infer_context_name(x)
        x = as_gpuarray_variable(x, context)
        if x.ndim != 2:
            raise LinAlgError("Matrix rank error")
        return theano.Apply(self, [x], [x.type()])

    def get_params(self, node):
        return node.inputs[0].type.context

    def get_op_params(self):
        # Expose the inplace flag to the C code as an INPLACE define.
        return [('INPLACE', '1')] if self.inplace else []

    def infer_shape(self, node, shapes):
        # The inverse has the same shape as the input.
        return shapes
def gpu_matrix_inverse(a):
    """
    This function performs the matrix inverse on GPU.

    Parameters
    ----------
    a : matrix
        The matrix to invert.

    Returns
    -------
    a_inv: matrix
    """
    op = GpuMagmaMatrixInverse()
    return op(a)
#section init_code

setup_ext_cuda();

#section support_code_struct

// Compute the inverse of the square float32 matrix `A` with MAGMA: LU
// factorization (magma_sgetrf_gpu) followed by the inverse from the
// factors (magma_sgetri_gpu).  When INPLACE is defined, A is overwritten;
// otherwise the result goes into a copy.  Returns 0 on success, non-zero
// on failure with a Python exception set.
int APPLY_SPECIFIC(magma_inv)(PyGpuArrayObject *A, PyGpuArrayObject **A_inv,
                              PyGpuContextObject *c) {
  const size_t *dims;
  magma_int_t N, ldwork, info;
  magma_int_t *piv = NULL;   // pivot indices (host memory)
  gpudata *dwork = NULL;     // device workspace for magma_sgetri_gpu
  int res = -1;

  if (A->ga.typecode != GA_FLOAT) {
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaMatrixInverse: Unsupported data type");
    return -1;
  }

  // This is early to match the exit() in the fail label.
  cuda_enter(c->ctx);
  magma_init();

  if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaMatrixInverse: requires data to be C-contiguous");
    // BUGFIX: was `return 1;`, which skipped the magma_finalize() /
    // cuda_exit() matching the calls just above.
    goto fail;
  }
  if (PyGpuArray_NDIM(A) != 2) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaMatrixInverse: matrix rank error");
    goto fail;
  }
  dims = PyGpuArray_DIMS(A);
  if (dims[0] != dims[1]) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaMatrixInverse: matrix is not square");
    goto fail;
  }

#ifdef INPLACE
  // Reuse A's storage as the output.
  Py_XDECREF(*A_inv);
  *A_inv = A;
  Py_INCREF(*A_inv);
#else
  *A_inv = theano_try_copy(*A_inv, A);
  if (*A_inv == NULL) {
    PyErr_SetString(
        PyExc_RuntimeError,
        "GpuMagmaMatrixInverse: failed to allocate memory for the output");
    goto fail;
  }
#endif

  // magma matrix inverse
  N = dims[0];

  ldwork = N * magma_get_sgetri_nb(N);
  dwork = gpudata_alloc(c->ctx, ldwork * sizeof(float), NULL, 0, NULL);
  if (dwork == NULL) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaMatrixInverse: failed to allocate working memory");
    goto fail;
  }
  if (magma_imalloc_cpu(&piv, N)) {
    PyErr_SetString(
        PyExc_RuntimeError,
        "GpuMagmaMatrixInverse: failed to allocate memory for the pivot array");
    goto fail;
  }

  magma_sgetrf_gpu(N, N, (float *)PyGpuArray_DEV_DATA(*A_inv), N, piv, &info);
  if (info != 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaMatrixInverse: magma_sgetrf_gpu returned error %d: %s.", info,
        magma_strerror(info));
    goto fail;
  }
  // NOTE(review): `*(float **)dwork` assumes the device pointer is the
  // first member of the opaque gpudata struct (cuda backend) — confirm
  // against libgpuarray.
  magma_sgetri_gpu(N, (float *)PyGpuArray_DEV_DATA(*A_inv), N, piv,
                   *(float **)dwork, ldwork, &info);
  if (info != 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaMatrixInverse: magma_sgetri_gpu returned error %d: %s.", info,
        magma_strerror(info));
    goto fail;
  }
  res = 0;
fail:
  if (piv != NULL)
    // BUGFIX: piv was allocated with magma_imalloc_cpu (host memory), so
    // it must be released with magma_free_cpu, not magma_free (which is
    // for device allocations).
    magma_free_cpu(piv);
  if (dwork != NULL)
    gpudata_release(dwork);
  magma_finalize();
  cuda_exit(c->ctx);
  return res;
}
#section init_code

setup_ext_cuda();

#section support_code_struct

// Singular value decomposition of the float32 matrix `A` with MAGMA's
// divide-and-conquer driver magma_sgesdd.  When COMPUTE_UV is defined,
// U and VT are produced as well; FULL_MATRICES selects the full (rather
// than reduced) factors.  Returns 0 on success, non-zero on failure with
// a Python exception set.
int APPLY_SPECIFIC(magma_svd)(PyGpuArrayObject *A,
#ifdef COMPUTE_UV
                              PyGpuArrayObject **U,
#endif
                              PyGpuArrayObject **S,
#ifdef COMPUTE_UV
                              PyGpuArrayObject **VT,
#endif
                              PyGpuContextObject *c) {
  magma_int_t *iwork = NULL, iunused[1];
  magma_int_t M, N, K, ldu, ldv, M_U, N_VT, info;
  magma_vec_t jobz;
  size_t s_dims[1], u_dims[2], vt_dims[2];
  float *a_data = NULL, *s_data = NULL, *u_data = NULL, *vt_data = NULL,
        *work = NULL;
  float dummy[1];
  int res = -1, lwork;

  if (A->ga.typecode != GA_FLOAT) {
    // BUGFIX: the three error messages below said "GpuMagmaMatrixInverse"
    // (copy/paste from magma_inv.c); they now name the right op.
    PyErr_SetString(PyExc_TypeError,
                    "GpuMagmaSVD: Unsupported data type");
    return -1;
  }

  // This is early to match the exit() in the fail label.
  cuda_enter(c->ctx);
  magma_init();

  if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaSVD: requires data to be C-contiguous");
    // BUGFIX: was `return 1;`, which skipped the magma_finalize() /
    // cuda_exit() matching the calls just above.
    goto fail;
  }
  if (PyGpuArray_NDIM(A) != 2) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaSVD: matrix rank error");
    goto fail;
  }

  // magma matrix svd
  // reverse dimensions because MAGMA expects column-major matrices:
  M = PyGpuArray_DIM(A, 1);
  N = PyGpuArray_DIM(A, 0);
  K = std::min(M, N);

  if (MAGMA_SUCCESS != magma_smalloc_pinned(&a_data, M * N)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  // NOTE(review): a_data is pinned *host* memory, so DeviceToDevice looks
  // like the wrong kind flag (it works under CUDA unified virtual
  // addressing); cudaMemcpyDefault would be clearer — confirm.
  cudaMemcpy(a_data, PyGpuArray_DEV_DATA(A), M * N * sizeof(float),
             cudaMemcpyDeviceToDevice);

  if (MAGMA_SUCCESS != magma_smalloc_pinned(&s_data, K)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }

#ifdef COMPUTE_UV
#ifdef FULL_MATRICES
  jobz = MagmaAllVec;
#else
  jobz = MagmaSomeVec;
#endif
  M_U  = (jobz == MagmaAllVec ? M : K);
  N_VT = (jobz == MagmaAllVec ? N : K);
  ldu = M;
  ldv = N_VT;

  if (MAGMA_SUCCESS != magma_smalloc_pinned(&u_data, M_U * M)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&vt_data, N * N_VT)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
#else
  jobz = MagmaNoVec;
  ldu = M;
  ldv = N;
#endif

  // query for workspace size
  magma_sgesdd(jobz, M, N, NULL, M, NULL, NULL, ldu, NULL, ldv,
               dummy, -1, iunused, &info);

  lwork = (magma_int_t) MAGMA_S_REAL(dummy[0]);
  if (MAGMA_SUCCESS != magma_smalloc_pinned(&work, lwork)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate working memory");
    goto fail;
  }
  // sgesdd requires an integer workspace of size 8*min(M, N).
  if (MAGMA_SUCCESS != magma_imalloc_cpu(&iwork, 8*K)) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate working memory");
    goto fail;
  }

  // compute svd
  magma_sgesdd(jobz, M, N, a_data, M, s_data,
               u_data, ldu, vt_data, ldv, work, lwork, iwork, &info);
  if (info > 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaSVD: the updating process of SBDSDC did not converge (error: %d)",
        info);
    goto fail;
  } else if (info < 0) {
    PyErr_Format(
        PyExc_RuntimeError,
        "GpuMagmaSVD: magma_sgesdd_gpu argument %d has an illegal value", -info);
    goto fail;
  }

  s_dims[0] = K;
  if (theano_prep_output(S, 1, s_dims, A->ga.typecode, GA_C_ORDER, c) != 0){
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  cudaMemcpy(PyGpuArray_DEV_DATA(*S), s_data, K * sizeof(float),
             cudaMemcpyDeviceToDevice);

#ifdef COMPUTE_UV
  u_dims[0] = N; u_dims[1] = N_VT;
  if (theano_prep_output(U, 2, u_dims, A->ga.typecode, GA_C_ORDER, c) != 0){
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  // magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U
  // to match numpy.linalg.svd output
  cudaMemcpy(PyGpuArray_DEV_DATA(*U), vt_data, N * N_VT * sizeof(float),
             cudaMemcpyDeviceToDevice);

  vt_dims[0] = M_U; vt_dims[1] = M;
  if (theano_prep_output(VT, 2, vt_dims, A->ga.typecode, GA_C_ORDER, c) != 0){
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaSVD: failed to allocate memory");
    goto fail;
  }
  // magma expects column-major matrices. Exchange u_data -> VT and vt_data -> U
  // to match numpy.linalg.svd output
  cudaMemcpy(PyGpuArray_DEV_DATA(*VT), u_data, M_U * M * sizeof(float),
             cudaMemcpyDeviceToDevice);
#endif
  res = 0;
fail:
  if (a_data != NULL)
    magma_free_pinned(a_data);
  if (s_data != NULL)
    magma_free_pinned(s_data);
  if (u_data != NULL)
    magma_free_pinned(u_data);
  if (vt_data != NULL)
    magma_free_pinned(vt_data);
  if (work != NULL)
    magma_free_pinned(work);
  if (iwork != NULL)
    magma_free_cpu(iwork);
  magma_finalize();
  cuda_exit(c->ctx);
  return res;
}
...@@ -32,6 +32,7 @@ from theano.tensor.nnet.abstract_conv import (BaseAbstractConv, ...@@ -32,6 +32,7 @@ from theano.tensor.nnet.abstract_conv import (BaseAbstractConv,
AbstractConv3d, AbstractConv3d,
AbstractConv3d_gradWeights, AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs) AbstractConv3d_gradInputs)
import theano.tensor.nlinalg as nlinalg
import theano.tensor.signal.pool as pool import theano.tensor.signal.pool as pool
import theano.tensor.slinalg as slinalg import theano.tensor.slinalg as slinalg
...@@ -71,7 +72,7 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor, ...@@ -71,7 +72,7 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
from .reduction import GpuMaxAndArgmax from .reduction import GpuMaxAndArgmax
from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky, from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky,
cusolver_available) cusolver_available, GpuMagmaMatrixInverse, GpuMagmaSVD)
_logger = logging.getLogger("theano.gpuarray.opt") _logger = logging.getLogger("theano.gpuarray.opt")
...@@ -2004,6 +2005,34 @@ def local_inplace_cholesky(node): ...@@ -2004,6 +2005,34 @@ def local_inplace_cholesky(node):
if isinstance(node.op, GpuCholesky) and not node.op.inplace: if isinstance(node.op, GpuCholesky) and not node.op.inplace:
return [GpuCholesky(lower=node.op.lower, inplace=True)(*node.inputs)] return [GpuCholesky(lower=node.op.lower, inplace=True)(*node.inputs)]
@register_opt('magma', 'fast_compile')
@op_lifter([nlinalg.MatrixInverse])
@register_opt2([theano.tensor.nlinalg.MatrixInverse], 'magma', 'fast_compile')
def local_gpu_matrix_inverse(op, context_name, inputs, outputs):
    """Lift CPU ``MatrixInverse`` nodes to ``GpuMagmaMatrixInverse``."""
    # Only applied when the user opted in through config.magma.enabled.
    if not config.magma.enabled:
        return
    return GpuMagmaMatrixInverse()
@register_inplace()
@local_optimizer([GpuMagmaMatrixInverse])
def local_inplace_matrix_inverse_inplace(node):
    """Replace ``GpuMagmaMatrixInverse`` by its destructive version."""
    if isinstance(node.op, GpuMagmaMatrixInverse):
        if not node.op.inplace:
            return [node.op.clone_inplace()(*node.inputs)]
@register_opt('magma', 'fast_compile')
@op_lifter([nlinalg.SVD])
@register_opt2([theano.tensor.nlinalg.SVD], 'magma', 'fast_compile')
def local_gpu_svd(op, context_name, inputs, outputs):
    """Lift CPU ``SVD`` nodes to ``GpuMagmaSVD``, keeping the same flags."""
    # Only applied when the user opted in through config.magma.enabled.
    if not config.magma.enabled:
        return
    return GpuMagmaSVD(full_matrices=op.full_matrices,
                       compute_uv=op.compute_uv)
# Do not register in fast_run or fast_compile. # Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled. # It will be added to fast_run if the GPU is enabled.
optdb.register('gpua_scanOp_make_inplace', optdb.register('gpua_scanOp_make_inplace',
......
from __future__ import absolute_import, division, print_function from __future__ import absolute_import, division, print_function
import unittest import unittest
import numpy as np import numpy as np
import theano from numpy.linalg.linalg import LinAlgError
import theano
from theano import config
from theano.gpuarray.linalg import (GpuCholesky, GpuMagmaMatrixInverse,
cusolver_available, gpu_matrix_inverse,
gpu_solve, gpu_svd)
from theano.tensor.nlinalg import matrix_inverse
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
from .config import mode_with_gpu
from numpy.linalg.linalg import LinAlgError from .. import gpuarray_shared_constructor
from .config import mode_with_gpu, mode_without_gpu
from .test_basic_ops import rand
# Skip tests if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
from theano.gpuarray.linalg import (cusolver_available, gpu_solve, GpuCholesky)
if not cusolver_available: class TestCusolver(unittest.TestCase):
raise SkipTest('Optional package scikits.cuda.cusolver not available')
def setUp(self):
if not cusolver_available:
self.skipTest('Optional package scikits.cuda.cusolver not available')
class TestCusolver(unittest.TestCase):
def run_gpu_solve(self, A_val, x_val, A_struct=None): def run_gpu_solve(self, A_val, x_val, A_struct=None):
b_val = np.dot(A_val, x_val) b_val = np.dot(A_val, x_val)
b_val_trans = np.dot(A_val.T, x_val) b_val_trans = np.dot(A_val.T, x_val)
...@@ -117,6 +123,8 @@ class TestCusolver(unittest.TestCase): ...@@ -117,6 +123,8 @@ class TestCusolver(unittest.TestCase):
class TestGpuCholesky(unittest.TestCase): class TestGpuCholesky(unittest.TestCase):
def setUp(self): def setUp(self):
if not cusolver_available:
self.skipTest('Optional package scikits.cuda.cusolver not available')
utt.seed_rng() utt.seed_rng()
def get_gpu_cholesky_func(self, lower=True, inplace=False): def get_gpu_cholesky_func(self, lower=True, inplace=False):
...@@ -191,3 +199,97 @@ class TestGpuCholesky(unittest.TestCase): ...@@ -191,3 +199,97 @@ class TestGpuCholesky(unittest.TestCase):
A_val = -M_val.dot(M_val.T) A_val = -M_val.dot(M_val.T)
fn = self.get_gpu_cholesky_func(True, False) fn = self.get_gpu_cholesky_func(True, False)
self.assertRaises(LinAlgError, fn, A_val) self.assertRaises(LinAlgError, fn, A_val)
class TestMagma(unittest.TestCase):
    """Tests for the magma-backed GPU linear algebra ops.

    All tests are skipped unless the ``magma.enabled`` config flag is
    set.
    """

    def setUp(self):
        if not config.magma.enabled:
            self.skipTest('Magma is not enabled, skipping test')

    def test_gpu_matrix_inverse(self):
        A = theano.tensor.fmatrix("A")

        fn = theano.function([A], gpu_matrix_inverse(A), mode=mode_with_gpu)
        N = 1000
        A_val = rand(N, N).astype('float32')
        A_val_inv = fn(A_val)
        # A^-1 . A should be close to the identity (loose tolerance:
        # single precision on a 1000x1000 system).
        utt.assert_allclose(np.dot(A_val_inv, A_val), np.eye(N), atol=1e-3)

    def test_gpu_matrix_inverse_inplace(self):
        N = 1000
        A_val_gpu = gpuarray_shared_constructor(rand(N, N).astype('float32'))
        A_val_copy = A_val_gpu.get_value()
        fn = theano.function([], GpuMagmaMatrixInverse(inplace=True)(A_val_gpu),
                             mode=mode_with_gpu, accept_inplace=True)
        fn()
        # The shared variable itself now holds the inverse.
        utt.assert_allclose(np.dot(A_val_gpu.get_value(), A_val_copy),
                            np.eye(N), atol=1e-3)

    def test_gpu_matrix_inverse_inplace_opt(self):
        # The inplace optimization should rewrite the lifted
        # GpuMagmaMatrixInverse node to its destructive version.
        A = theano.tensor.fmatrix("A")
        fn = theano.function([A], matrix_inverse(A), mode=mode_with_gpu)
        assert any([
            node.op.inplace
            for node in fn.maker.fgraph.toposort() if
            isinstance(node.op, GpuMagmaMatrixInverse)
        ])

    def run_gpu_svd(self, A_val, full_matrices=True, compute_uv=True):
        A = theano.tensor.fmatrix("A")
        f = theano.function(
            [A], gpu_svd(A, full_matrices=full_matrices, compute_uv=compute_uv),
            mode=mode_with_gpu)
        return f(A_val)

    def assert_column_orthonormal(self, Ot):
        utt.assert_allclose(np.dot(Ot.T, Ot), np.eye(Ot.shape[1]))

    def check_svd(self, A, U, S, VT, rtol=None, atol=None):
        # Verify the reconstruction A = U . diag(S) . VT.
        S_m = np.zeros_like(A)
        np.fill_diagonal(S_m, S)
        utt.assert_allclose(
            np.dot(np.dot(U, S_m), VT), A, rtol=rtol, atol=atol)

    def test_gpu_svd_wide(self):
        # BUGFIX: the "wide" test previously used a 100x50 (tall) matrix
        # and the "tall" test a 50x100 (wide) one; the inputs of the two
        # tests were swapped.  A wide matrix has more columns than rows.
        A = rand(50, 100).astype('float32')
        M, N = A.shape

        U, S, VT = self.run_gpu_svd(A)
        self.assert_column_orthonormal(U)
        self.assert_column_orthonormal(VT.T)
        self.check_svd(A, U, S, VT)

        U, S, VT = self.run_gpu_svd(A, full_matrices=False)
        self.assertEqual(U.shape[1], min(M, N))
        self.assert_column_orthonormal(U)
        self.assertEqual(VT.shape[0], min(M, N))
        self.assert_column_orthonormal(VT.T)

    def test_gpu_svd_tall(self):
        # A tall matrix has more rows than columns (see note in
        # test_gpu_svd_wide about the swapped inputs).
        A = rand(100, 50).astype('float32')
        M, N = A.shape

        U, S, VT = self.run_gpu_svd(A)
        self.assert_column_orthonormal(U)
        self.assert_column_orthonormal(VT.T)
        self.check_svd(A, U, S, VT)

        U, S, VT = self.run_gpu_svd(A, full_matrices=False)
        self.assertEqual(U.shape[1], min(M, N))
        self.assert_column_orthonormal(U)
        self.assertEqual(VT.shape[0], min(M, N))
        self.assert_column_orthonormal(VT.T)

    def test_gpu_singular_values(self):
        # compute_uv=False must match the CPU implementation for both
        # wide and tall inputs.
        A = theano.tensor.fmatrix("A")
        f_cpu = theano.function(
            [A], theano.tensor.nlinalg.svd(A, compute_uv=False),
            mode=mode_without_gpu)
        f_gpu = theano.function(
            [A], gpu_svd(A, compute_uv=False), mode=mode_with_gpu)

        A_val = rand(50, 100).astype('float32')
        utt.assert_allclose(f_cpu(A_val), f_gpu(A_val))

        A_val = rand(100, 50).astype('float32')
        utt.assert_allclose(f_cpu(A_val), f_gpu(A_val))
...@@ -616,18 +616,37 @@ class SVD(Op): ...@@ -616,18 +616,37 @@ class SVD(Op):
def make_node(self, x): def make_node(self, x):
x = as_tensor_variable(x) x = as_tensor_variable(x)
assert x.ndim == 2, "The input of svd function should be a matrix." assert x.ndim == 2, "The input of svd function should be a matrix."
w = theano.tensor.matrix(dtype=x.dtype) s = theano.tensor.vector(dtype=x.dtype)
u = theano.tensor.vector(dtype=x.dtype) if self.compute_uv:
v = theano.tensor.matrix(dtype=x.dtype) u = theano.tensor.matrix(dtype=x.dtype)
return Apply(self, [x], [w, u, v]) vt = theano.tensor.matrix(dtype=x.dtype)
return Apply(self, [x], [u, s, vt])
else:
return Apply(self, [x], [s])
def perform(self, node, inputs, outputs): def perform(self, node, inputs, outputs):
(x,) = inputs (x,) = inputs
(w, u, v) = outputs
assert x.ndim == 2, "The input of svd function should be a matrix." assert x.ndim == 2, "The input of svd function should be a matrix."
w[0], u[0], v[0] = self._numop(x, if self.compute_uv:
self.full_matrices, u, s, vt = outputs
self.compute_uv) u[0], s[0], vt[0] = self._numop(x,
self.full_matrices,
self.compute_uv)
else:
s, = outputs
s[0] = self._numop(x, self.full_matrices, self.compute_uv)
def infer_shape(self, node, shapes):
x_shape, = shapes
M, N = x_shape
K = tensor.minimum(M, N)
s_shape = (K, )
if self.compute_uv:
u_shape = (M, M) if self.full_matrices else (M, K)
vt_shape = (N, N) if self.full_matrices else (K, N)
return [u_shape, s_shape, vt_shape]
else:
return [s_shape]
def svd(a, full_matrices=1, compute_uv=1): def svd(a, full_matrices=1, compute_uv=1):
......
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import unittest import unittest
import itertools
import numpy as np import numpy as np
import numpy.linalg import numpy.linalg
from numpy.testing import assert_array_almost_equal from numpy.testing import assert_array_almost_equal
...@@ -20,7 +21,7 @@ from theano.tensor.nlinalg import ( ...@@ -20,7 +21,7 @@ from theano.tensor.nlinalg import (
AllocDiag, alloc_diag, ExtractDiag, extract_diag, diag, AllocDiag, alloc_diag, ExtractDiag, extract_diag, diag,
trace, Det, det, Eig, eig, Eigh, EighGrad, eigh, trace, Det, det, Eig, eig, Eigh, EighGrad, eigh,
matrix_dot, _zero_disconnected, qr, matrix_power, matrix_dot, _zero_disconnected, qr, matrix_power,
norm, svd, TensorInv, tensorinv, tensorsolve) norm, svd, SVD, TensorInv, tensorinv, tensorsolve)
from nose.plugins.attrib import attr from nose.plugins.attrib import attr
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
...@@ -128,18 +129,46 @@ def test_qr_modes(): ...@@ -128,18 +129,46 @@ def test_qr_modes():
assert "name 'complete' is not defined" in str(e) assert "name 'complete' is not defined" in str(e)
def test_svd(): class test_SVD(utt.InferShapeTester):
rng = np.random.RandomState(utt.fetch_seed()) op_class = SVD
A = tensor.matrix("A", dtype=theano.config.floatX) dtype = 'float32'
U, V, T = svd(A)
fn = function([A], [U, V, T])
a = rng.rand(4, 4).astype(theano.config.floatX)
n_u, n_v, n_t = np.linalg.svd(a)
t_u, t_v, t_t = fn(a)
assert _allclose(n_u, t_u) def setUp(self):
assert _allclose(n_v, t_v) super(test_SVD, self).setUp()
assert _allclose(n_t, t_t) self.rng = np.random.RandomState(utt.fetch_seed())
self.A = theano.tensor.matrix(dtype=self.dtype)
self.op = svd
def test_svd(self):
A = tensor.matrix("A", dtype=self.dtype)
U, S, VT = svd(A)
fn = function([A], [U, S, VT])
a = self.rng.rand(4, 4).astype(self.dtype)
n_u, n_s, n_vt = np.linalg.svd(a)
t_u, t_s, t_vt = fn(a)
assert _allclose(n_u, t_u)
assert _allclose(n_s, t_s)
assert _allclose(n_vt, t_vt)
fn = function([A], svd(A, compute_uv=False))
t_s = fn(a)
assert _allclose(n_s, t_s)
def test_svd_infer_shape(self):
self.validate_shape((4, 4), full_matrices=True, compute_uv=True)
self.validate_shape((4, 4), full_matrices=False, compute_uv=True)
self.validate_shape((2, 4), full_matrices=False, compute_uv=True)
self.validate_shape((4, 2), full_matrices=False, compute_uv=True)
self.validate_shape((4, 4), compute_uv=False)
def validate_shape(self, shape, compute_uv=True, full_matrices=True):
A = self.A
A_v = self.rng.rand(*shape).astype(self.dtype)
outputs = self.op(A, full_matrices=full_matrices, compute_uv=compute_uv)
if not compute_uv:
outputs = [outputs]
self._compile_and_check([A], outputs, [A_v], self.op_class, warn=False)
def test_tensorsolve(): def test_tensorsolve():
...@@ -403,8 +432,7 @@ class test_Eig(utt.InferShapeTester): ...@@ -403,8 +432,7 @@ class test_Eig(utt.InferShapeTester):
super(test_Eig, self).setUp() super(test_Eig, self).setUp()
self.rng = np.random.RandomState(utt.fetch_seed()) self.rng = np.random.RandomState(utt.fetch_seed())
self.A = theano.tensor.matrix(dtype=self.dtype) self.A = theano.tensor.matrix(dtype=self.dtype)
self.X = np.asarray(self.rng.rand(5, 5), self.X = np.asarray(self.rng.rand(5, 5), dtype=self.dtype)
dtype=self.dtype)
self.S = self.X.dot(self.X.T) self.S = self.X.dot(self.X.T)
def test_infer_shape(self): def test_infer_shape(self):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论