testgroup / pytensor · Commits

Commit 375bc8c6, authored Jun 23, 2016 by Yaroslav Ganin; committed Nov 29, 2016 by Thomas George.
Addressed Fred's concerns
Parent: 25fd044f
Showing 2 changed files with 137 additions and 71 deletions:

theano/sandbox/cuda/cusolver.py (+66, −71)
theano/sandbox/cuda/tests/test_cusolver.py (+71, −0)
theano/sandbox/cuda/cusolver.py @ 375bc8c6
-from __future__ import absolute_import, print_function, division
-import pkg_resources
+from __future__ import absolute_import, division, print_function
+
+import pkg_resources

 import theano
-from theano.sandbox.cuda.type import CudaNdarrayType
 from theano.sandbox.cuda import CudaNdarray, GpuOp
 from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable
+from theano.sandbox.cuda.type import CudaNdarrayType

 try:
     from theano.sandbox.cuda import cuda_ndarray
 ...
@@ -20,9 +21,10 @@ try:
 except (ImportError, OSError, RuntimeError,
         pkg_resources.DistributionNotFound):
     pass

-cusolver_handle = [None]
+cusolver_handle = None


-class GpuSolve(GpuOp):
+class GpuCusolverSolve(GpuOp):
     """
     CUSOLVER GPU solver OP.
 ...
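The handle change above swaps the one-element-list trick (a `[None]` box mutated in place from inside the thunk closure) for a plain module global that the thunk rebinds with a `global` statement. A minimal sketch of the two caching patterns, where create_handle() is a hypothetical stand-in for cusolver.cusolverDnCreate():

def create_handle():
    # Hypothetical stand-in; the real code calls cusolver.cusolverDnCreate().
    return object()

_handle_box = [None]  # old pattern: mutate the list in place, no global needed

def get_handle_boxed():
    if _handle_box[0] is None:
        _handle_box[0] = create_handle()
    return _handle_box[0]

_handle = None  # new pattern: rebind the module-level name

def get_handle_global():
    global _handle
    if _handle is None:
        _handle = create_handle()
    return _handle

assert get_handle_boxed() is get_handle_boxed()
assert get_handle_global() is get_handle_global()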
@@ -37,10 +39,7 @@ class GpuSolve(GpuOp):
     def __init__(self, trans='N'):
         self.trans = trans
-        super(GpuSolve, self).__init__()
-
-    def output_type(self, inp):
-        return CudaNdarrayType(broadcastable=[False] * inp.type.ndim)
+        super(GpuCusolverSolve, self).__init__()

     def make_node(self, inp1, inp2):
         inp1 = as_cuda_ndarray_variable(inp1)
 ...
@@ -48,7 +47,9 @@ class GpuSolve(GpuOp):
         assert inp1.ndim == 2
         assert inp2.ndim == 2
-        return theano.Apply(self, [inp1, inp2], [self.output_type(inp1)()])
+        return theano.Apply(
+            self, [inp1, inp2],
+            [CudaNdarrayType(broadcastable=[False] * inp1.type.ndim)()])

     def make_thunk(self, node,
 ...
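For orientation: make_node packages the symbolic inputs together with a freshly allocated output variable into a theano.Apply node, and calling an Op instance invokes make_node for you. A CPU-side sketch of the same mechanism with a stock Op, assuming only a standard Theano install and no GPU:

import theano
import theano.tensor as T

x = T.fmatrix('x')
y = T.fmatrix('y')
z = T.dot(x, y)                    # invokes Dot.make_node under the hood
print(z.owner)                     # the Apply node tying op, inputs and outputs
print(z.owner.op, z.owner.inputs)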
@@ -56,29 +57,31 @@ class GpuSolve(GpuOp):
                    no_recycling=[]):
         if not cusolver_available:
             raise RuntimeError('CUSOLVER is not available and '
-                               'GpuSolve Op can not be constructed.')
+                               'GpuCusolverSolve Op can not be constructed.')

         inputs = [storage_map[v] for v in node.inputs]
         outputs = [storage_map[v] for v in node.outputs]

         def thunk():
-            # size of the matrices to invert
+            global cusolver_handle
+
+            # Size of the matrices to invert.
             z = outputs[0]

-            # Matrix
+            # Matrix.
             A = inputs[0][0]

-            # Solution vectors
+            # Solution vectors.
             b = inputs[1][0]

             # A is not explicitly converted between C and F order, instead we
-            # switch the "transpose" flag
+            # switch the "transpose" flag.
             if self.trans in ('T', 'C'):
                 trans = 'N'
             else:
                 trans = 'T'

-            # Convert b to F-order from c-order.
+            # Convert b to F-order from C-order.
             b_cpy = dimshuffle(b, (1, 0)).reshape((b.shape[0], b.shape[1]))

             # This copy forces allocation of a new C-contiguous buffer
 ...
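The transpose-flag trick above relies on cuSOLVER reading buffers in column-major (Fortran) order: a C-contiguous A handed over unchanged is interpreted as A.T, so requesting the transposed solve recovers the original system. A NumPy sketch of that reasoning (an illustration, not GPU code):

import numpy as np

rng = np.random.RandomState(0)
A = rng.uniform(-1, 1, (4, 4)).astype(np.float32)   # C-order buffer
b = rng.uniform(-1, 1, (4, 1)).astype(np.float32)

# What a column-major routine sees in A's buffer is A.T:
A_seen = np.reshape(A.ravel(order='C'), A.shape, order='F')
assert np.allclose(A_seen, A.T)

# Solving the transposed system on that view recovers A x = b:
x = np.linalg.solve(A_seen.T, b)
assert np.allclose(np.dot(A, x), b, atol=1e-4)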
@@ -86,67 +89,59 @@ class GpuSolve(GpuOp):
             A_cpy = A.copy()
             b_cpy = b_cpy.copy()

-            def cusolver_gpu_solve(A_, b_, trans='T'):
-                A_shape = A_.shape
-                b_shape = b_.shape
-
-                assert(len(A_shape) == 2)
-                assert(len(b_shape) == 2)
-
-                if trans in ['T', 'C']:
-                    trans = 1
-                    l, n = A_shape
-                    k, m = b_shape
-                    if n != k:
-                        raise ValueError('A and b must be aligned.')
-                elif trans in ['N']:
-                    trans = 0
-                    n, l = A_shape
-                    k, m = b_shape
-                    if l != m:
-                        raise ValueError('A and b must be aligned.')
-                else:
-                    raise ValueError('Invalid value for trans')
-
-                lda = max(1, n)
-                ldb = max(1, n, l)
-
-                A_ptr = A_.gpudata
-                b_ptr = b_.gpudata
-
-                if cusolver_handle[0] is None:
-                    cusolver_handle[0] = cusolver.cusolverDnCreate()
-
-                workspace_size = cusolver.cusolverDnSgetrf_bufferSize(
-                    cusolver_handle[0], m, n, A_ptr, lda)
-
-                if (thunk.workspace is None or
-                        thunk.workspace.size != workspace_size):
-                    thunk.workspace = CudaNdarray.zeros((workspace_size,))
-
-                if thunk.pivots is None or thunk.pivots.size != min(m, n):
-                    thunk.pivots = CudaNdarray.zeros((min(m, n),))
-
-                workspace_ptr = thunk.workspace.gpudata
-                pivots_ptr = thunk.pivots.gpudata
-                dev_info_ptr = thunk.dev_info.gpudata
-
-                cusolver.cusolverDnSgetrf(
-                    cusolver_handle[0], n, l, A_ptr, lda, workspace_ptr,
-                    pivots_ptr, dev_info_ptr)
-
-                if thunk.dev_info is None:
-                    thunk.dev_info = CudaNdarray.zeros((1,))
-
-                cusolver.cusolverDnSgetrs(
-                    cusolver_handle[0], trans, n, m, A_ptr, lda,
-                    pivots_ptr, b_ptr, ldb, dev_info_ptr)
-
-                return A_, b_
-
-            A_pycuda, b_pycuda = cusolver_gpu_solve(A_cpy, b_cpy, trans)
+            assert(len(A.shape) == 2)
+            assert(len(b.shape) == 2)
+
+            if trans in ['T', 'C']:
+                trans = 1
+                l, n = A.shape
+                k, m = b.shape
+                if n != k:
+                    raise ValueError('A and b must be aligned.')
+            elif trans in ['N']:
+                trans = 0
+                n, l = A.shape
+                k, m = b.shape
+                if l != m:
+                    raise ValueError('A and b must be aligned.')
+            else:
+                raise ValueError('Invalid value for trans')
+
+            lda = max(1, n)
+            ldb = max(1, n, l)
+
+            A_ptr = A_cpy.gpudata
+            b_ptr = b_cpy.gpudata
+
+            if thunk.dev_info is None:
+                thunk.dev_info = CudaNdarray.zeros((1,))
+
+            if cusolver_handle is None:
+                cusolver_handle = cusolver.cusolverDnCreate()
+
+            workspace_size = cusolver.cusolverDnSgetrf_bufferSize(
+                cusolver_handle, m, n, A_ptr, lda)
+
+            if (thunk.workspace is None or
+                    thunk.workspace.size != workspace_size):
+                thunk.workspace = CudaNdarray.zeros((workspace_size,))
+
+            if thunk.pivots is None or thunk.pivots.size != min(m, n):
+                thunk.pivots = CudaNdarray.zeros((min(m, n),))
+
+            workspace_ptr = thunk.workspace.gpudata
+            pivots_ptr = thunk.pivots.gpudata
+            dev_info_ptr = thunk.dev_info.gpudata
+
+            cusolver.cusolverDnSgetrf(
+                cusolver_handle, n, l, A_ptr, lda, workspace_ptr,
+                pivots_ptr, dev_info_ptr)
+
+            cusolver.cusolverDnSgetrs(
+                cusolver_handle, trans, n, m, A_ptr, lda,
+                pivots_ptr, b_ptr, ldb, dev_info_ptr)
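The two calls above are the usual factor-then-solve split: Sgetrf computes the pivoted LU factorization of A in place (single precision, hence the S), and Sgetrs reuses the factors and pivot indices for the triangular solves against b. A CPU analogue of the pair, sketched with SciPy rather than the cuSOLVER API:

import numpy as np
from scipy.linalg import lu_factor, lu_solve

rng = np.random.RandomState(1)
A = rng.uniform(-0.4, 0.4, (5, 5)).astype(np.float32)
b = rng.uniform(-0.4, 0.4, (5, 2)).astype(np.float32)

lu, piv = lu_factor(A)      # analogous to cusolverDnSgetrf: LU factors + pivots
x = lu_solve((lu, piv), b)  # analogous to cusolverDnSgetrs: triangular solves
assert np.allclose(np.dot(A, x), b, atol=1e-4)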
-            # Convert b to F-order from c-order and assign it to output:
+            # Convert b to F-order from C-order and assign it to output.
             b_cpy = b_cpy.reshape(b.shape[::-1])
             b_cpy = dimshuffle(b_cpy, (1, 0))

             z[0] = b_cpy
 ...
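The reshape/dimshuffle pair above undoes the earlier C-to-F conversion: reinterpreting the buffer under the reversed shape and then transposing reads the column-major result back out as an ordinary C-order matrix. The same round trip in NumPy terms (a sketch, not the CudaNdarray API):

import numpy as np

b = np.arange(6, dtype=np.float32).reshape(3, 2)   # C-contiguous input
b_f = np.ascontiguousarray(b.T).reshape(b.shape)   # bytes now hold b in F-order
assert np.array_equal(b_f.ravel(), b.ravel(order='F'))

back = b_f.reshape(b.shape[::-1]).T                # reversed shape, then transpose
assert np.array_equal(back, b)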
@@ -161,4 +156,4 @@ class GpuSolve(GpuOp):
         return thunk


-gpu_solve = GpuSolve()
+gpu_solve = GpuCusolverSolve()
theano/sandbox/cuda/tests/test_cusolver.py · new file (0 → 100644) @ 375bc8c6
from __future__ import absolute_import, division, print_function

import unittest
import numpy

import theano
from theano.tests import unittest_tools as utt

# Skip tests if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray
from theano.misc.pycuda_init import pycuda_available
from theano.sandbox.cuda.cusolver import cusolver_available
from theano.sandbox.cuda import cusolver

if not cuda_ndarray.cuda_available:
    raise SkipTest('Optional package cuda not available')
if not pycuda_available:
    raise SkipTest('Optional package pycuda not available')
if not cusolver_available:
    raise SkipTest('Optional package scikits.cuda.cusolver not available')

if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
else:
    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')


class TestCula(unittest.TestCase):
    def run_gpu_solve(self, A_val, x_val):
        b_val = numpy.dot(A_val, x_val)

        A = theano.tensor.matrix("A", dtype="float32")
        b = theano.tensor.matrix("b", dtype="float32")

        solver = cusolver.gpu_solve(A, b)
        fn = theano.function([A, b], [solver])
        res = fn(A_val, b_val)
        x_res = numpy.array(res[0])
        utt.assert_allclose(x_res, x_val)

    def test_diag_solve(self):
        numpy.random.seed(1)
        A_val = numpy.asarray([[2, 0, 0], [0, 1, 0], [0, 0, 1]],
                              dtype="float32")
        x_val = numpy.random.uniform(-0.4, 0.4,
                                     (A_val.shape[1], 1)).astype("float32")
        self.run_gpu_solve(A_val, x_val)

    def test_sym_solve(self):
        numpy.random.seed(1)
        A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        A_sym = (A_val + A_val.T) / 2.0
        x_val = numpy.random.uniform(-0.4, 0.4,
                                     (A_val.shape[1], 1)).astype("float32")
        self.run_gpu_solve(A_sym, x_val)

    def test_orth_solve(self):
        numpy.random.seed(1)
        A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        A_orth = numpy.linalg.svd(A_val)[0]
        x_val = numpy.random.uniform(-0.4, 0.4,
                                     (A_orth.shape[1], 1)).astype("float32")
        self.run_gpu_solve(A_orth, x_val)

    def test_uni_rand_solve(self):
        numpy.random.seed(1)
        A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        x_val = numpy.random.uniform(-0.4, 0.4,
                                     (A_val.shape[1], 4)).astype("float32")
        self.run_gpu_solve(A_val, x_val)
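Every test follows the same pattern as run_gpu_solve: build b = A·x for a known x, solve the system, and compare the recovered solution against x. A CPU-only sketch of that pattern in plain NumPy, assuming no GPU is available:

import numpy

numpy.random.seed(1)
A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
x_val = numpy.random.uniform(-0.4, 0.4, (5, 4)).astype("float32")
b_val = numpy.dot(A_val, x_val)

x_res = numpy.linalg.solve(A_val, b_val)
assert numpy.allclose(x_res, x_val, atol=1e-3)  # loose float32 tolerance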