提交 4cfa236d authored 作者: Alexander Matyasko's avatar Alexander Matyasko

Add magma matrix inverse and lifting optimization

上级 91835cc8
from __future__ import absolute_import, division, print_function from __future__ import absolute_import, division, print_function
import pkg_resources import os
import theano
import warnings import warnings
from theano import Op
from theano.gpuarray import basic_ops, GpuArrayType
import numpy as np import numpy as np
import pkg_resources
from numpy.linalg.linalg import LinAlgError from numpy.linalg.linalg import LinAlgError
import theano
from theano import Op
from theano.gof import COp
from theano.gpuarray import GpuArrayType
from .basic_ops import as_gpuarray_variable, gpu_contiguous, infer_context_name
from .type import gpu_context_type
try: try:
import pygpu import pygpu
from pygpu.basic import triu, tril from pygpu.basic import triu, tril
...@@ -94,13 +99,13 @@ class GpuCusolverSolve(Op): ...@@ -94,13 +99,13 @@ class GpuCusolverSolve(Op):
'GpuCusolverSolve Op can not be constructed.') 'GpuCusolverSolve Op can not be constructed.')
if skcuda.__version__ <= '0.5.1': if skcuda.__version__ <= '0.5.1':
warnings.warn('The GpuSolve op requires scikit-cuda > 0.5.1 to work with CUDA 8') warnings.warn('The GpuSolve op requires scikit-cuda > 0.5.1 to work with CUDA 8')
context_name = basic_ops.infer_context_name(inp1, inp2) context_name = infer_context_name(inp1, inp2)
inp1 = basic_ops.as_gpuarray_variable(inp1, context_name) inp1 = as_gpuarray_variable(inp1, context_name)
inp2 = basic_ops.as_gpuarray_variable(inp2, context_name) inp2 = as_gpuarray_variable(inp2, context_name)
inp1 = basic_ops.gpu_contiguous(inp1) inp1 = gpu_contiguous(inp1)
inp2 = basic_ops.gpu_contiguous(inp2) inp2 = gpu_contiguous(inp2)
# this op can only operate on float32 matrices # this op can only operate on float32 matrices
assert inp1.ndim == 2 assert inp1.ndim == 2
...@@ -260,11 +265,11 @@ class GpuCholesky(Op): ...@@ -260,11 +265,11 @@ class GpuCholesky(Op):
if not pygpu_available: if not pygpu_available:
raise RuntimeError('Missing pygpu or triu/tril functions.' raise RuntimeError('Missing pygpu or triu/tril functions.'
'Install or update libgpuarray.') 'Install or update libgpuarray.')
context_name = basic_ops.infer_context_name(inp) context_name = infer_context_name(inp)
inp = basic_ops.as_gpuarray_variable(inp, context_name) inp = as_gpuarray_variable(inp, context_name)
inp = basic_ops.gpu_contiguous(inp) inp = gpu_contiguous(inp)
# this op can only operate on float32 matrices # this op can only operate on float32 matrices
# because of current implementation of triu/tril. # because of current implementation of triu/tril.
...@@ -341,3 +346,46 @@ class GpuCholesky(Op): ...@@ -341,3 +346,46 @@ class GpuCholesky(Op):
def gpu_cholesky(A, lower=True): def gpu_cholesky(A, lower=True):
return GpuCholesky(lower)(A) return GpuCholesky(lower)(A)
class GpuMagmaMatrixInverse(COp):
    """Computes the inverse of a square matrix :math:`A` on the GPU using the
    magma library.

    Parameters
    ----------
    inplace : bool
        If True, the C implementation overwrites the input buffer instead of
        allocating an output (communicated to the C code via the ``INPLACE``
        macro, see :meth:`get_op_params`).
    """
    __props__ = ('inplace', )
    # The C code receives the GPU context as its `ctx` parameter.
    params_type = gpu_context_type

    def __init__(self, inplace=False):
        COp.__init__(self, ['magma_linalg.c'],
                     'APPLY_SPECIFIC(magma_matrix_inv)')
        self.inplace = inplace

    def c_headers(self):
        return ['gpuarray/array.h', 'gpuarray/blas.h', 'gpuarray_helper.h',
                'magma.h']

    def c_header_dirs(self):
        # gpuarray_helper.h lives next to this module; pygpu ships the rest.
        return [os.path.dirname(__file__), pygpu.get_include()]

    def c_libraries(self):
        return ['magma']

    def make_node(self, x):
        """Validate the input and build the Apply node.

        Raises
        ------
        LinAlgError
            If `x` is not a 2-D variable.
        TypeError
            If `x` is not float32 — the C implementation uses the single
            precision magma routines (sgetrf/sgetri) only.
        """
        if x.ndim != 2:
            raise LinAlgError("Matrix rank error")
        context_name = infer_context_name(x)
        x = as_gpuarray_variable(x, context_name)
        # Reject unsupported dtypes early with a clear error instead of
        # letting the float32-only C kernel produce garbage.
        if x.dtype != 'float32':
            raise TypeError("GpuMagmaMatrixInverse only supports float32, "
                            "got %s" % x.dtype)
        return theano.Apply(self, [x], [x.type()])

    def get_params(self, node):
        return node.inputs[0].type.context

    def get_op_params(self):
        # Define the INPLACE macro for the C code when requested.
        if self.inplace:
            return [('INPLACE', '1')]
        else:
            return []

    def infer_shape(self, node, shapes):
        # The inverse has the same shape as the input matrix.
        return shapes


# Module-level convenience instance (non-inplace).
gpu_matrix_inverse = GpuMagmaMatrixInverse()
#section support_code_struct

/* Per-node scratch buffers, allocated in the op body:
   dwork is magma device memory (magma_smalloc), piv is host memory (malloc). */
float *APPLY_SPECIFIC(dwork);
magma_int_t *APPLY_SPECIFIC(piv);

#section init_code_struct

APPLY_SPECIFIC(dwork) = NULL;
APPLY_SPECIFIC(piv) = NULL;

#section cleanup_code_struct

/* Free each buffer with the deallocator matching its allocator:
   magma_free for magma_smalloc'd device memory, free for malloc'd memory. */
if (APPLY_SPECIFIC(dwork) != NULL) { magma_free(APPLY_SPECIFIC(dwork)); }
if (APPLY_SPECIFIC(piv) != NULL) { free(APPLY_SPECIFIC(piv)); }

#section support_code_struct

/* Invert the square, C-contiguous float32 matrix A on the GPU with magma:
   LU factorization (magma_sgetrf_gpu) followed by inversion from the factors
   (magma_sgetri_gpu). On success, *_A_inv holds the inverse and 0 is
   returned; on failure, a Python exception is set and 1 is returned. */
int APPLY_SPECIFIC(magma_matrix_inv)(PyGpuArrayObject *A,
                                     PyGpuArrayObject **_A_inv,
                                     PyGpuContextObject *ctx) {
  PyGpuArrayObject *A_inv = *_A_inv;

  if (!GpuArray_IS_C_CONTIGUOUS(&A->ga)) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaMatrixInverse: requires data to be C-contiguous");
    return 1;
  }
  if (PyGpuArray_NDIM(A) != 2) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaMatrixInverse: matrix rank error");
    return 1;
  }
  const size_t *x_dims = PyGpuArray_DIMS(A);
  if (x_dims[0] != x_dims[1]) {
    PyErr_SetString(PyExc_ValueError,
                    "GpuMagmaMatrixInverse: matrix is not square");
    return 1;
  }
#ifdef INPLACE
  /* Reuse the input buffer as the output: drop our reference to any stale
     output, alias the input and take a new reference on it. */
  Py_XDECREF(A_inv);
  A_inv = A;
  Py_INCREF(A_inv);
#else
  A_inv = theano_try_copy(A_inv, A);
  if (A_inv == NULL) {
    PyErr_SetString(PyExc_RuntimeError,
                    "GpuMagmaMatrixInverse: failed to allocate memory");
    return 1;
  }
#endif
  {
    // magma matrix inverse
    magma_init();
    magma_int_t ldwork, info;
    magma_int_t N = x_dims[0];

    /* Release buffers from a previous invocation so repeated executions of
       this node do not leak (the cleanup section only runs on destruction). */
    if (APPLY_SPECIFIC(dwork) != NULL) {
      magma_free(APPLY_SPECIFIC(dwork));
      APPLY_SPECIFIC(dwork) = NULL;
    }
    if (APPLY_SPECIFIC(piv) != NULL) {
      free(APPLY_SPECIFIC(piv));
      APPLY_SPECIFIC(piv) = NULL;
    }

    ldwork = N * magma_get_sgetri_nb(N);
    if (magma_smalloc(&APPLY_SPECIFIC(dwork), ldwork)) {
      PyErr_SetString(
          PyExc_RuntimeError,
          "GpuMagmaMatrixInverse: failed to allocate magma working memory");
      magma_finalize();
      return 1;
    }
    APPLY_SPECIFIC(piv) = (magma_int_t *)malloc(N * sizeof(magma_int_t));
    if (APPLY_SPECIFIC(piv) == NULL) {
      PyErr_SetString(
          PyExc_RuntimeError,
          "GpuMagmaMatrixInverse: failed to allocate pivot array");
      magma_finalize();
      return 1;
    }

    float *A_ptr = (float *) ((void **)A_inv->ga.data)[0];

    /* sgetri inverts from LU factors, so the matrix must be factorized with
       sgetrf first (which also fills the pivot array). The buffer is
       row-major, so magma sees A^T; since inv(A^T) = inv(A)^T, reading the
       column-major result back as row-major yields inv(A). */
    magma_sgetrf_gpu(N, N, A_ptr, N, APPLY_SPECIFIC(piv), &info);
    if (info != 0) {
      PyErr_Format(
          PyExc_RuntimeError,
          "GpuMagmaMatrixInverse: magma_sgetrf_gpu returned error %d", info);
      magma_finalize();
      return 1;
    }
    magma_sgetri_gpu(N, A_ptr, N, APPLY_SPECIFIC(piv), APPLY_SPECIFIC(dwork),
                     ldwork, &info);
    if (info != 0) {
      PyErr_Format(
          PyExc_RuntimeError,
          "GpuMagmaMatrixInverse: magma_sgetri_gpu returned error %d", info);
      magma_finalize();
      return 1;
    }
    magma_finalize();
  }
  *_A_inv = A_inv;
  return 0;
}
...@@ -32,6 +32,7 @@ from theano.tensor.nnet.abstract_conv import (BaseAbstractConv, ...@@ -32,6 +32,7 @@ from theano.tensor.nnet.abstract_conv import (BaseAbstractConv,
AbstractConv3d, AbstractConv3d,
AbstractConv3d_gradWeights, AbstractConv3d_gradWeights,
AbstractConv3d_gradInputs) AbstractConv3d_gradInputs)
import theano.tensor.nlinalg as nlinalg
import theano.tensor.signal.pool as pool import theano.tensor.signal.pool as pool
import theano.tensor.slinalg as slinalg import theano.tensor.slinalg as slinalg
...@@ -71,7 +72,7 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor, ...@@ -71,7 +72,7 @@ from .subtensor import (GpuIncSubtensor, GpuSubtensor,
from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims from .opt_util import alpha_merge, output_merge, pad_dims, unpad_dims
from .reduction import GpuMaxAndArgmax from .reduction import GpuMaxAndArgmax
from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky, from .linalg import (GpuCusolverSolve, MATRIX_STRUCTURES_SOLVE, GpuCholesky,
cusolver_available) cusolver_available, GpuMagmaMatrixInverse)
_logger = logging.getLogger("theano.gpuarray.opt") _logger = logging.getLogger("theano.gpuarray.opt")
...@@ -2004,6 +2005,17 @@ def local_inplace_cholesky(node): ...@@ -2004,6 +2005,17 @@ def local_inplace_cholesky(node):
if isinstance(node.op, GpuCholesky) and not node.op.inplace: if isinstance(node.op, GpuCholesky) and not node.op.inplace:
return [GpuCholesky(lower=node.op.lower, inplace=True)(*node.inputs)] return [GpuCholesky(lower=node.op.lower, inplace=True)(*node.inputs)]
@register_opt('fast_compile')
@op_lifter([nlinalg.MatrixInverse])
@register_opt2([theano.tensor.nlinalg.MatrixInverse], 'fast_compile')
def local_gpu_matrix_inverse(op, context_name, inputs, outputs):
    """Lift a CPU MatrixInverse node to GpuMagmaMatrixInverse."""
    # NOTE(review): the original body set `magma_available = True` and then
    # bailed out `if not magma_available` — a hard-coded, unreachable guard.
    # There is currently no real run-time check that magma is available;
    # one should be added (e.g. a `magma_available` flag from .linalg)
    # before this lifter is enabled by default.
    return GpuMagmaMatrixInverse()
# Do not register in fast_run or fast_compile. # Do not register in fast_run or fast_compile.
# It will be added to fast_run if the GPU is enabled. # It will be added to fast_run if the GPU is enabled.
optdb.register('gpua_scanOp_make_inplace', optdb.register('gpua_scanOp_make_inplace',
......
...@@ -10,14 +10,16 @@ from .config import mode_with_gpu ...@@ -10,14 +10,16 @@ from .config import mode_with_gpu
from numpy.linalg.linalg import LinAlgError from numpy.linalg.linalg import LinAlgError
# Skip tests if cuda_ndarray is not available. # Skip tests if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest from theano.gpuarray.linalg import (cusolver_available, gpu_solve, GpuCholesky,
from theano.gpuarray.linalg import (cusolver_available, gpu_solve, GpuCholesky) gpu_matrix_inverse)
if not cusolver_available:
raise SkipTest('Optional package scikits.cuda.cusolver not available')
class TestCusolver(unittest.TestCase): class TestCusolver(unittest.TestCase):
def setUp(self):
if not cusolver_available:
self.skipTest('Optional package scikits.cuda.cusolver not available')
def run_gpu_solve(self, A_val, x_val, A_struct=None): def run_gpu_solve(self, A_val, x_val, A_struct=None):
b_val = np.dot(A_val, x_val) b_val = np.dot(A_val, x_val)
b_val_trans = np.dot(A_val.T, x_val) b_val_trans = np.dot(A_val.T, x_val)
...@@ -117,6 +119,8 @@ class TestCusolver(unittest.TestCase): ...@@ -117,6 +119,8 @@ class TestCusolver(unittest.TestCase):
class TestGpuCholesky(unittest.TestCase): class TestGpuCholesky(unittest.TestCase):
def setUp(self): def setUp(self):
if not cusolver_available:
self.skipTest('Optional package scikits.cuda.cusolver not available')
utt.seed_rng() utt.seed_rng()
def get_gpu_cholesky_func(self, lower=True, inplace=False): def get_gpu_cholesky_func(self, lower=True, inplace=False):
...@@ -191,3 +195,14 @@ class TestGpuCholesky(unittest.TestCase): ...@@ -191,3 +195,14 @@ class TestGpuCholesky(unittest.TestCase):
A_val = -M_val.dot(M_val.T) A_val = -M_val.dot(M_val.T)
fn = self.get_gpu_cholesky_func(True, False) fn = self.get_gpu_cholesky_func(True, False)
self.assertRaises(LinAlgError, fn, A_val) self.assertRaises(LinAlgError, fn, A_val)
class TestMagma(unittest.TestCase):
    def test_gpu_matrix_inverse(self):
        """gpu_matrix_inverse(A) matmul A should be close to the identity."""
        matrix = theano.tensor.fmatrix("A")
        invert_fn = theano.function([matrix], gpu_matrix_inverse(matrix),
                                    mode=mode_with_gpu)
        size = 1000
        sample = np.random.rand(size, size).astype(np.float32)
        inverted = invert_fn(sample)
        # Loose tolerance: a 1000x1000 float32 inversion accumulates error.
        utt.assert_allclose(np.dot(inverted, sample), np.eye(size), atol=1e-3)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论