implementation cusolver solve general case + tests

90cd2169 · Thomas George · b8f3b638 · 90cd2169 · b8f3b638 · 90cd2169
--- a/theano/gpuarray/linalg/solve.py
+++ b/theano/gpuarray/linalg/solve.py
@@ -10,7 +10,7 @@ from pygpu import gpuarray

 cusolver_available = False
 try:
-    from scikits.cuda import cusolver
+    from skcuda import cusolver
    cusolver_available = True
 except (ImportError, OSError, RuntimeError, pkg_resources.DistributionNotFound):
    pass
@@ -30,8 +30,11 @@ class GpuCusolverSolve(Op):

    __props__ = ('trans',)

-    def __init__(self, trans='N'):
+    def __init__(self, trans='N', inplace=False):
        self.trans = trans
+        self.inplace = inplace
+        if self.inplace:
+            self.destroy_map = {0: [0, 1]}
        super(GpuCusolverSolve, self).__init__()

    def make_node(self, inp1, inp2):
@@ -52,7 +55,7 @@ class GpuCusolverSolve(Op):
        return theano.Apply(
            self, [inp1, inp2],
            [GpuArrayType('float32',
-                          broadcastable=[False] * inp1.ndim,
+                          broadcastable=inp1.broadcastable,
                          context_name=self.context)()])

    def make_thunk(self,
@@ -86,14 +89,12 @@ class GpuCusolverSolve(Op):
            assert(len(A.shape) == 2)
            assert(len(b.shape) == 2)

-            # A is not explicitly converted between C and F order, instead we
-            # switch the "transpose" flag.
            if self.trans in ['T', 'C']:
-                trans = 0
+                trans = 1
                l, n = A.shape
                k, m = b.shape
            elif self.trans == 'N':
-                trans = 1
+                trans = 0
                n, l = A.shape
                k, m = b.shape
            else:
@@ -106,13 +107,18 @@ class GpuCusolverSolve(Op):
            lda = max(1, n)
            ldb = max(1, k, m)

-            if trans == 0:
-                A = gpuarray.asfortranarray(A)
-                trans = 1
+            # We copy A and b as cusolver operates inplace
+            if not self.inplace:
+                A = gpuarray.array(A, copy=True)
+                b = gpuarray.array(b, copy=True, order='F')
            A_ptr = A.gpudata
-            # We copy b as cusolver operates inplace
-            b_cpy = gpuarray.array(b, copy=True, order='F')
-            b_ptr = b_cpy.gpudata
+            b_ptr = b.gpudata
+
+            # cusolver expects a F ordered matrix, but A is not explicitly
+            # converted between C and F order, instead we switch the
+            # "transpose" flag.
+            if A.flags['C_CONTIGUOUS']:
+                trans = 1 - trans

            workspace_size = cusolver.cusolverDnSgetrf_bufferSize(
                cusolver_handle, n, n, A_ptr, lda)
@@ -145,7 +151,7 @@ class GpuCusolverSolve(Op):
                cusolver_handle, trans, n, m, A_ptr, lda,
                pivots_ptr, b_ptr, ldb, dev_info_ptr)

-            z[0] = b_cpy
+            z[0] = b

        thunk.inputs = inputs
        thunk.outputs = outputs

--- a/theano/gpuarray/linalg/__init__.py
+++ b/theano/gpuarray/linalg/__init__.py
--- a/theano/sandbox/cuda/tests/test_cusolver.py
+++ b/theano/sandbox/cuda/tests/test_cusolver.py
@@ -5,36 +5,27 @@ import numpy
 import theano

 from theano.tests import unittest_tools as utt
+from .config import mode_with_gpu

 # Skip tests if cuda_ndarray is not available.
 from nose.plugins.skip import SkipTest
-import theano.sandbox.cuda as cuda_ndarray
-from theano.misc.pycuda_init import pycuda_available
-from theano.sandbox.cuda.cusolver import cusolver_available
+from theano.gpuarray.linalg import (cusolver_available, gpu_solve)

-from theano.sandbox.cuda import cusolver
-
-if not cuda_ndarray.cuda_available:
-    raise SkipTest('Optional package cuda not available')
-if not pycuda_available:
-    raise SkipTest('Optional package pycuda not available')
 if not cusolver_available:
    raise SkipTest('Optional package scikits.cuda.cusolver not available')

-if theano.config.mode == 'FAST_COMPILE':
-    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
-else:
-    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
-

 class TestCusolver(unittest.TestCase):
-    def run_gpu_solve(self, A_val, x_val):
-        b_val = numpy.dot(A_val, x_val)
+    def run_gpu_solve(self, A_val, x_val, trans='N'):
+        if trans == 'N':
+            b_val = numpy.dot(A_val, x_val)
+        else:
+            b_val = numpy.dot(A_val.T, x_val)
        A = theano.tensor.matrix("A", dtype="float32")
        b = theano.tensor.matrix("b", dtype="float32")

-        solver = cusolver.gpu_solve(A, b)
-        fn = theano.function([A, b], [solver])
+        solver = gpu_solve(A, b, trans)
+        fn = theano.function([A, b], [solver], mode=mode_with_gpu)
        res = fn(A_val, b_val)
        x_res = numpy.array(res[0])
        utt.assert_allclose(x_res, x_val)
@@ -69,3 +60,10 @@ class TestCusolver(unittest.TestCase):
        x_val = numpy.random.uniform(-0.4, 0.4,
                                     (A_val.shape[1], 4)).astype("float32")
        self.run_gpu_solve(A_val, x_val)
+
+    def test_uni_rand_solve_transpose(self):
+        numpy.random.seed(1)
+        A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
+        x_val = numpy.random.uniform(-0.4, 0.4,
+                                     (A_val.shape[1], 4)).astype("float32")
+        self.run_gpu_solve(A_val, x_val, trans='T')