Fix inplace matrix inverse and add tests for it

b7cd3ce3 · Alexander Matyasko · 3feae315 · b7cd3ce3 · b7cd3ce3 · b7cd3ce3
--- a/theano/gpuarray/linalg.py
+++ b/theano/gpuarray/linalg.py
@@ -357,8 +357,7 @@ class GpuMagmaSVD(COp):
    def __init__(self, full_matrices=True, compute_uv=True):
+        # Keep both SVD options on the op instance, then pass COp the C source
+        # file name and the 'APPLY_SPECIFIC(magma_svd)' function-name string.
        self.full_matrices = full_matrices
        self.compute_uv = compute_uv
-        COp.__init__(self, ['magma_svd.c'],
+        COp.__init__(self, ['magma_svd.c'], 'APPLY_SPECIFIC(magma_svd)')
-                     'APPLY_SPECIFIC(magma_svd)')
    def c_headers(self):
        return ['gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/ext_cuda.h',
@@ -429,9 +428,10 @@ class GpuMagmaMatrixInverse(COp):
    params_type = gpu_context_type
    def __init__(self, inplace=False):
-        COp.__init__(self, ['magma_inv.c'],
+        COp.__init__(self, ['magma_inv.c'], 'APPLY_SPECIFIC(magma_inv)')
-                     'APPLY_SPECIFIC(magma_inv)')
        self.inplace = inplace
+        if self.inplace:
+            # The INPLACE branch of magma_inv.c makes the output point at the
+            # input array, so declare that output 0 destroys input 0.
+            self.destroy_map = {0: [0]}
    def c_headers(self):
        return ['gpuarray/types.h', 'gpuarray/array.h', 'gpuarray/ext_cuda.h',
@@ -451,6 +451,9 @@ class GpuMagmaMatrixInverse(COp):
            return [config.magma.library_path]
        return []
+    def clone_inplace(self):
+        """
+        Return a new op of the same class constructed with ``inplace=True``.
+        """
+        op_class = self.__class__
+        return op_class(inplace=True)
    def make_node(self, x):
        ctx_name = infer_context_name(x)
        x = as_gpuarray_variable(x, ctx_name)
@@ -471,4 +474,18 @@ class GpuMagmaMatrixInverse(COp):
        return shapes
-gpu_matrix_inverse = GpuMagmaMatrixInverse()
+def gpu_matrix_inverse(a, inplace=False):
+    """
+    Compute the inverse of a matrix on the GPU.
+
+    Parameters
+    ----------
+    a : matrix
+        Input to invert; it is passed to a `GpuMagmaMatrixInverse` op.
+    inplace : bool, optional
+        Whether or not to compute matrix inverse inplace.
+
+    Returns
+    -------
+    a_inv : matrix
+        Output of the `GpuMagmaMatrixInverse` op applied to `a`.
+    """
+    inverse_op = GpuMagmaMatrixInverse(inplace=inplace)
+    return inverse_op(a)
--- a/theano/gpuarray/magma_inv.c
+++ b/theano/gpuarray/magma_inv.c
@@ -40,9 +40,9 @@ int APPLY_SPECIFIC(magma_inv)(PyGpuArrayObject *A, PyGpuArrayObject **_A_inv,
    goto fail;
  }
 #ifdef INPLACE
-  Py_XDECREF(out);
+  Py_XDECREF(A_inv);
  A_inv = A;
-  Py_INCREF(out);
+  Py_INCREF(A_inv);
 #else
  A_inv = theano_try_copy(A_inv, A);
  if (A_inv == NULL) {

--- a/theano/gpuarray/opt.py
+++ b/theano/gpuarray/opt.py
@@ -2015,6 +2015,14 @@ def local_gpu_matrix_inverse(op, context_name, inputs, outputs):
    return GpuMagmaMatrixInverse()
+@register_inplace()
+@local_optimizer([GpuMagmaMatrixInverse])
+def local_inplace_matrix_inverse_inplace(node):
+    """
+    Swap a non-inplace `GpuMagmaMatrixInverse` node for its inplace clone.
+
+    Returns a one-element list holding the replacement output, or None when
+    the node's op is not a `GpuMagmaMatrixInverse` or is already inplace.
+    """
+    op = node.op
+    # Nothing to rewrite for other ops or for ops that are already inplace.
+    if not isinstance(op, GpuMagmaMatrixInverse) or op.inplace:
+        return None
+    return [op.clone_inplace()(*node.inputs)]
 @register_opt('magma', 'fast_compile')
 @op_lifter([nlinalg.SVD])
 @register_opt2([theano.tensor.nlinalg.SVD], 'magma', 'fast_compile')

--- a/theano/gpuarray/tests/test_linalg.py
+++ b/theano/gpuarray/tests/test_linalg.py
 from __future__ import absolute_import, division, print_function
 import unittest
 import numpy as np
-import theano
+from numpy.linalg.linalg import LinAlgError
+import theano
+from theano import config
+from theano.gpuarray.linalg import (GpuCholesky, GpuMagmaMatrixInverse,
+                                    cusolver_available, gpu_matrix_inverse,
+                                    gpu_solve, gpu_svd)
+from theano.tensor.nlinalg import matrix_inverse
 from theano.tests import unittest_tools as utt
+from .. import gpuarray_shared_constructor
 from .config import mode_with_gpu, mode_without_gpu
 from .test_basic_ops import rand
-from numpy.linalg.linalg import LinAlgError
-from theano import config
-from theano.gpuarray.linalg import (cusolver_available, gpu_solve, GpuCholesky,
-                                    gpu_matrix_inverse, gpu_svd)
 class TestCusolver(unittest.TestCase):
@@ -199,6 +202,7 @@ class TestGpuCholesky(unittest.TestCase):
 class TestMagma(unittest.TestCase):
    def setUp(self):
        if not config.magma.enabled:
            self.skipTest('Magma is not enabled, skipping test')
@@ -208,10 +212,30 @@ class TestMagma(unittest.TestCase):
        fn = theano.function([A], gpu_matrix_inverse(A), mode=mode_with_gpu.including('magma'))
        N = 1000
-        A_val = np.random.rand(N, N).astype(np.float32)
+        A_val = rand(N, N)
        A_val_inv = fn(A_val)
        utt.assert_allclose(np.dot(A_val_inv, A_val), np.eye(N), atol=1e-3)
+    def test_gpu_matrix_inverse_inplace(self):
+        """Check that the inplace op leaves the inverse in the shared input."""
+        size = 1000
+        A_shared = gpuarray_shared_constructor(rand(size, size))
+        # Snapshot the starting values; the assertion below expects the
+        # shared variable to hold the inverse once the function has run.
+        A_before = A_shared.get_value()
+        inverse = gpu_matrix_inverse(A_shared, inplace=True)
+        fn = theano.function([], inverse,
+                             mode=mode_with_gpu.including('magma'),
+                             accept_inplace=True)
+        fn()
+        product = np.dot(A_shared.get_value(), A_before)
+        utt.assert_allclose(product, np.eye(size), atol=1e-3)
+    def test_gpu_matrix_inverse_inplace_opt(self):
+        """Check that the compiled graph holds an inplace matrix inverse op."""
+        A = theano.tensor.fmatrix("A")
+        gpu_mode = mode_with_gpu.including('magma')
+        fn = theano.function([A], matrix_inverse(A), mode=gpu_mode)
+        inverse_nodes = [node for node in fn.maker.fgraph.toposort()
+                         if isinstance(node.op, GpuMagmaMatrixInverse)]
+        assert any(node.op.inplace for node in inverse_nodes)
    def run_gpu_svd(self, A_val, full_matrices=True, compute_uv=True):
        A = theano.tensor.fmatrix("A")
        f = theano.function(