used ls again.

a2fe5c5d · Caglar · c4bcd45e · a2fe5c5d · a2fe5c5d
--- a/theano/sandbox/cuda/cula.py
+++ b/theano/sandbox/cuda/cula.py
@@ -19,6 +19,7 @@ if cula is not None:
 import numpy
 class GpuSolve(GpuOp):
    """
    CULA GPU solver OP.
@@ -55,53 +56,58 @@ class GpuSolve(GpuOp):
        def thunk():
            input_shape = inputs[1][0].shape
            #size of the matrices to invert
            z = outputs[0]
            #Matrix
            A = inputs[0][0]
            #Solution vectors
            b = inputs[1][0]
-            A_cpy = A.copy()
-            b_cpy = b.copy()
-            A_pycuda = to_gpuarray(A_cpy)
+            #A_cpy = A.copy()
-            b_pycuda = to_gpuarray(b_cpy)
+            #b_cpy = b.copy()
+            A_pycuda = to_gpuarray(A)
+            b_pycuda = to_gpuarray(b)
-            def cula_gpu_solve(A, b):
+            def cula_gpu_solve(A, b, trans='N'):
                A_shape = A.shape
                b_shape = b.shape
                assert(len(A_shape) == 2)
                assert(len(b_shape) == 2)
+                import string
-                if A_shape[0] != A_shape[1]:
+                if trans in ['T', 'C']:
-                    raise ValueError('Coefficient matrix should be a square matrix.')
+                    l, n = A_shape
+                    k, m = b_shape
+                elif trans in ['N']:
+                    n, l = A_shape
+                    k, m = b_shape
+                else:
+                    raise ValueError('Invalid value for trans')
-                n = A_shape[0]
+                if n != k:
-                nrhs = b_shape[1]
+                    raise ValueError('A and b must be aligned.')
-                #Create the integer pivot vector to store the indices for
-                #permutation matrix.
-                ipiv = CudaNdarray.zeros((n,))
-                ipiv = to_gpuarray(ipiv)
-                import string
+                if trans == 'N':
-                lda = max(1, n)
+                    lda = max(1, n)
-                ldb = max(1, n)
+                else:
+                    lda = max(1, l)
+                ldb = max(1, k)
                # construct pointer arrays needed for culaDeviceSgels
                # Cula requires you to pass a pointer for A and b.
-                A_ptr = A_cpy.gpudata
+                A_ptr = A.gpudata
-                b_ptr = b_cpy.gpudata
+                b_ptr = b.gpudata
-                ipiv_ptr = ipiv.gpudata
-                cula.culaDeviceSgesv(n, nrhs, A_ptr, lda, ipiv_ptr, b_ptr, ldb)
+                cula.culaDeviceSgels(trans, n, l, m, A_ptr, lda, b_ptr, ldb)
                return A, b
-            A_pycuda, b_pycuda = cula_gpu_solve(A_pycuda, b_pycuda)
+            A_pycuda, b_pycuda = cula_gpu_solve(A_pycuda, b_pycuda, self.trans)
            z[0] = b
        thunk.inputs = inputs
@@ -110,4 +116,4 @@ class GpuSolve(GpuOp):
        return thunk
-gpu_solve = GpuSolve()
+gpu_solve = GpuSolve(trans="T")
--- a/theano/sandbox/cuda/tests/test_cula.py
+++ b/theano/sandbox/cuda/tests/test_cula.py
@@ -52,7 +52,8 @@ class TestCula(unittest.TestCase):
    def test_orth_solve(self):
        A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        A_orth = numpy.linalg.svd(A_val)[0]
-        x_val = numpy.random.uniform(-0.4, 0.4, (A_orth.shape[1], A_orth.shape[0])).astype("float32")
+        #import ipdb; ipdb.set_trace()
+        x_val = numpy.random.uniform(-0.4, 0.4, (A_orth.shape[1], 1)).astype("float32")
        self.run_gpu_solve(A_orth, x_val)
    def test_uni_rand_solve(self):