Added the test.

ce066d1d · Caglar · 477b54a4 · ce066d1d · ce066d1d
--- a/theano/sandbox/cuda/cula.py
+++ b/theano/sandbox/cuda/cula.py
 import theano
 from theano.sandbox.cuda.type import CudaNdarrayType
-from theano.sandbox.cuda import GpuOp
+from theano.sandbox.cuda import GpuOp, CudaNdarray
 from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
                                           gpu_contiguous)
 from theano.tensor import as_tensor_variable
 from scikits.cuda import cula
+try:
+    from scikits.cuda import cula
+    scikits_cuda_available = True
+except ImportError:
+    scikits_cuda_available = False

-
-def cula_gpu_solve(A, b, trans='N'):
-    cula.culaInitialize()
-    A_shape = A.shape
-    b_shape = b.shape
-
-    assert(len(A_shape) == 2)
-    assert(len(b_shape) == 2)
-
-    import string
-
-    if trans in ['T', 'C']:
-        l, n = A_shape
-        m, k = b_shape
-    elif trans in ['N']:
-        n, l = A_shape
-        k, m = b_shape
-    else:
-        raise ValueError('Invalid value for trans')
-
-    if n != k:
-        raise ValueError('A and b must be aligned.')
-
-    if trans == 'n':
-        lda = max(1, n)
-    else:
-        lda = max(1, l)
-
-    ldb = max(1, k)
-
-    # construct pointer arrays needed for culaDeviceSgels
-    # Cula requires you to pass a pointer for A and b.
-    A_ptr = A.gpudata
-    b_ptr = b.gpudata
-
-    cula.culaDeviceSgels(trans, n, l, m, A_ptr, lda, b_ptr, ldb)
-    return A, b
+import numpy

 class GpuSolve(GpuOp):
    """
-    Cula Gpu solver OP.
+    CULA GPU solver OP.
+
    """
    def __init__(self, trans='N'):
        self.trans = trans
@@ -59,7 +29,7 @@ class GpuSolve(GpuOp):
        return hash(type(self))

    def output_type(self, inp):
-        return cuda.CudaNdarrayType(broadcastable=[False] * inp.type.ndim)
+        return CudaNdarrayType(broadcastable=[False] * inp.type.ndim)

    def make_node(self, inp1, inp2):
        inp1 = gpu_contiguous(as_cuda_ndarray_variable(inp1))
@@ -71,32 +41,58 @@ class GpuSolve(GpuOp):
        assert inp2.ndim == 2
        return theano.Apply(self, [inp1, inp2], [self.output_type(inp1)()])

-    def make_thunk(self, node, storage_map, _, _2):
+    def make_thunk(self, node, storage_map, _, no_recycling=[]):
        from theano.misc.pycuda_utils import to_gpuarray
+
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

        def thunk():
-            input_shape = inputs[0][0].shape
-
+            input_shape = inputs[1][0].shape
            #size of the matrices to invert
-            size = input_shape[1]
-
            z = outputs[0]
-
-            if z[0] is None or z[0].shape != input_shape:
-                z[0] = cuda.CudaNdarray.zeros(input_shape)
-
            #Matrix
            A = inputs[0][0]

            #Solution vectors
-            b = inputs[0][1]
+            b = inputs[1][0]

            A_pycuda = to_gpuarray(A)
            b_pycuda = to_gpuarray(b)

-            cula_gpu_solve(A_pycuda, b_pycuda, self.trans)
+            def cula_gpu_solve(A, b):
+
+                cula.culaInitialize()
+                A_shape = A.shape
+                b_shape = b.shape
+                assert(len(A_shape) == 2)
+                assert(len(b_shape) == 2)
+
+                if A_shape[0] != A_shape[1]:
+                    raise ValueError('Coefficient matrix should be a square matrix.')
+
+                n = A_shape[0]
+                nrhs = b_shape[1]
+                #Create the integer pivot vector to store the indices for
+                #permutation matrix.
+                ipiv = CudaNdarray.zeros((n,))
+                ipiv = to_gpuarray(ipiv)
+
+                import string
+                lda = max(1, n)
+                ldb = max(1, n)
+
+                # construct pointer arrays needed for culaDeviceSgels
+                # Cula requires you to pass a pointer for A and b.
+                A_ptr = A.gpudata
+                b_ptr = b.gpudata
+                ipiv_ptr = ipiv.gpudata
+
+                cula.culaDeviceSgesv(n, nrhs, A_ptr, lda, ipiv_ptr, b_ptr, ldb)
+                return A, b
+
+            A, b = cula_gpu_solve(A_pycuda, b_pycuda)
+            z[0] = b

        thunk.inputs = inputs
        thunk.outputs = outputs

--- a/theano/sandbox/cuda/tests/test_cula.py
+++ b/theano/sandbox/cuda/tests/test_cula.py
+import unittest
+import numpy
+
+import theano
+from theano.tests import unittest_tools as utt
+
+# Skip tests if cuda_ndarray is not available.
+from nose.plugins.skip import SkipTest
+import theano.sandbox.cuda as cuda_ndarray
+if not cuda_ndarray.cuda_available:
+    raise SkipTest('Optional package cuda not available')
+from theano.misc.pycuda_init import pycuda_available
+from theano.sandbox.cuda.cula import scikits_cuda_available
+
+if not pycuda_available:
+    raise SkipTest('Optional package pycuda not available')
+if not scikits_cuda_available:
+    raise SkipTest('Optional package scikits.cuda not available')
+
+from theano.sandbox.cuda import cula
+
+if theano.config.mode == 'FAST_COMPILE':
+    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
+else:
+    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
+
+class TestCula(unittest.TestCase):
+
+    def run_gpu_solve(self, A_val, x_val):
+        b_val = numpy.dot(A_val, x_val)
+        x_res = numpy.zeros((x_val.shape[0], x_val.shape[1])).astype("float32")
+
+        A = theano.tensor.matrix("A", dtype="float32")
+        b = theano.tensor.matrix("b", dtype="float32")
+        solver = cula.gpu_solve(A, b)
+        fn = theano.function([A, b], [solver])
+        res = fn(A_val, b_val)
+        res[0].get(x_res)
+        utt.assert_allclose(x_res, x_val)
+
+    def test_diag_solve(self):
+        A_val = numpy.asarray([[2, 0, 0], [0, 1, 0], [0, 0, 1]], dtype="float32")
+        x_val = numpy.random.uniform(-0.4, 0.4, (A_val.shape[1], 1)).astype("float32")
+        self.run_gpu_solve(A_val, x_val)
+
+    def test_sym_solve(self):
+        A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
+        A_sym = (A_val + A_val.T) / 2.0
+        x_val = numpy.random.uniform(-0.4, 0.4, (A_val.shape[1], 1)).astype("float32")
+        self.run_gpu_solve(A_sym, x_val)
+
+    def test_orth_solve(self):
+        A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
+        A_orth = numpy.linalg.svd(A_val)[0]
+        x_val = numpy.random.uniform(-0.4, 0.4, (A_orth.shape[1], A_orth.shape[0])).astype("float32")
+        self.run_gpu_solve(A_orth, x_val)
+
+    def test_uni_rand_solve(self):
+        A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
+        x_val = numpy.random.uniform(-0.4, 0.4, (A_val.shape[1], 1)).astype("float32")
+        self.run_gpu_solve(A_val, x_val)