Commit 7c95b025 authored by Caglar

Added the opt code.

Parent: a2fe5c5d
......@@ -7,6 +7,7 @@ from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
from theano.tensor import as_tensor_variable
from scikits.cuda import cula
from theano.sandbox.cuda import cuda_ndarray
try:
from scikits.cuda import cula
......@@ -19,13 +20,15 @@ if cula is not None:
import numpy
class GpuSolve(GpuOp):
"""
CULA GPU solver OP.
trans: Whether to take the transpose of the input matrix or not. By default,
we will take the transpose of the input matrix, before feeding it into the Op.
That is mainly, because that CULA requires inputs to be in Fortran order.
"""
def __init__(self, trans='N'):
def __init__(self, trans='T'):
self.trans = trans
super(GpuSolve, self).__init__()
......@@ -48,7 +51,11 @@ class GpuSolve(GpuOp):
assert inp2.ndim == 2
return theano.Apply(self, [inp1, inp2], [self.output_type(inp1)()])
def make_thunk(self, node, storage_map, _, no_recycling=[]):
def make_thunk(self,
node,
storage_map, _,
no_recycling=[]):
from theano.misc.pycuda_utils import to_gpuarray
inputs = [storage_map[v] for v in node.inputs]
......@@ -56,53 +63,51 @@ class GpuSolve(GpuOp):
def thunk():
input_shape = inputs[1][0].shape
#size of the matrices to invert
z = outputs[0]
#Matrix
A = inputs[0][0]
#Solution vectors
b = inputs[1][0]
#A_cpy = A.copy()
#b_cpy = b.copy()
b = cuda_ndarray.dimshuffle(b, 1, 0)
b_cpy = b.copy()
A_pycuda = to_gpuarray(A)
b_pycuda = to_gpuarray(b)
def cula_gpu_solve(A, b, trans='N'):
def cula_gpu_solve(A_, b_, trans='T'):
A_shape = A.shape
b_shape = b.shape
A_shape = A_.shape
b_shape = b_.shape
assert(len(A_shape) == 2)
assert(len(b_shape) == 2)
import string
if trans in ['T', 'C']:
l, n = A_shape
k, m = b_shape
if n != m:
raise ValueError('A and b must be aligned.')
elif trans in ['N']:
n, l = A_shape
k, m = b_shape
if l != m:
raise ValueError('A and b must be aligned.')
else:
raise ValueError('Invalid value for trans')
if n != k:
raise ValueError('A and b must be aligned.')
if trans == 'N':
lda = max(1, n)
else:
lda = max(1, l)
ldb = max(1, k)
lda = max(1, n)
ldb = max(1, n, l)
# construct pointer arrays needed for culaDeviceSgels
# Cula requires you to pass a pointer for A and b.
A_ptr = A.gpudata
b_ptr = b.gpudata
A_ptr = A_.gpudata
b_ptr = b_.gpudata
cula.culaDeviceSgels(trans, n, l, m, A_ptr, lda, b_ptr, ldb)
return A, b
......@@ -116,4 +121,4 @@ class GpuSolve(GpuOp):
return thunk
gpu_solve = GpuSolve(trans="T")
gpu_solve = GpuSolve()
......@@ -25,21 +25,27 @@ from theano.sandbox.cuda.basic_ops import (
GpuSubtensor, GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20,
GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit)
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar,
gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv,
GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights,
GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights)
from theano.sandbox.cuda.blas import gpu_gemv_inplace
from theano.sandbox.cuda.cula import gpu_solve
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace
from theano.sandbox.cuda.blas import gpu_ger_no_inplace
from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax,
GpuDownsampleFactorMaxGrad, GpuDownsampleFactorMaxGradGrad)
from theano.sandbox.cuda.nnet import (
GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmax, GpuSoftmaxWithBias)
from theano.sandbox.cuda.elemwise import SupportCodeError
from theano.scalar.basic_scipy import Erfinv
from theano.sandbox.cuda.elemwise import erfinv_gpu
......@@ -47,7 +53,10 @@ from theano.sandbox.cuda.var import CudaNdarrayConstant
from theano.sandbox.cuda import gpu_optimizer, register_opt, gpu_seqopt, GpuOp
from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.blas import _is_real_vector, _is_real_matrix
from theano.tensor import nlinalg
from theano.tensor import slinalg
from theano.tensor.nnet.Conv3D import Conv3D
try:
......@@ -540,6 +549,31 @@ def local_gpu_dot22scalar(node):
return False
@register_opt()
@local_optimizer([gpu_from_host, slinalg.Solve])
def local_gpu_solve(node):
    """Move a CPU ``slinalg.Solve`` onto the GPU.

    Rewrites either of the two host/device transfer patterns:

        gpu_from_host(CpuSolve)  -> GpuSolve(gpu_from_host)
        CpuSolve(host_from_gpu)  -> host_from_gpu(GpuSolve)

    Returns the replacement node list, or False when the pattern
    does not match.
    """
    op = node.op
    if isinstance(op, GpuFromHost):
        producer = node.inputs[0].owner
        if producer is not None and isinstance(producer.op, slinalg.Solve):
            a, b = producer.inputs
            return [gpu_solve(gpu_from_host(a), gpu_from_host(b))]
    if isinstance(op, slinalg.Solve):
        # Only lift the op when at least one operand already lives on
        # the GPU (comes through a host_from_gpu transfer).
        touches_gpu = any(inp.owner and isinstance(inp.owner.op, HostFromGpu)
                          for inp in node.inputs)
        if touches_gpu:
            a, b = node.inputs
            return [host_from_gpu(gpu_solve(gpu_from_host(a),
                                            gpu_from_host(b)))]
    return False
@register_opt()
@local_optimizer([gpu_from_host, tensor.blas_c.CGemv, tensor.blas.Gemv])
def local_gpu_gemv(node):
......
......@@ -28,14 +28,15 @@ class TestCula(unittest.TestCase):
def run_gpu_solve(self, A_val, x_val):
b_val = numpy.dot(A_val, x_val)
x_res = numpy.zeros((x_val.shape[0], x_val.shape[1])).astype("float32")
b_val = b_val.T.reshape((b_val.shape[0], b_val.shape[1]))
A = theano.tensor.matrix("A", dtype="float32")
b = theano.tensor.matrix("b", dtype="float32")
solver = cula.gpu_solve(A, b)
fn = theano.function([A, b], [solver])
res = fn(A_val, b_val)
x_res = numpy.array(res[0])
x_res = x_res.reshape((x_res.shape[1], x_res.shape[0])).T
utt.assert_allclose(x_res, x_val)
def test_diag_solve(self):
......@@ -52,11 +53,10 @@ class TestCula(unittest.TestCase):
def test_orth_solve(self):
A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
A_orth = numpy.linalg.svd(A_val)[0]
#import ipdb; ipdb.set_trace()
x_val = numpy.random.uniform(-0.4, 0.4, (A_orth.shape[1], 1)).astype("float32")
self.run_gpu_solve(A_orth, x_val)
def test_uni_rand_solve(self):
A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
x_val = numpy.random.uniform(-0.4, 0.4, (A_val.shape[1], 1)).astype("float32")
x_val = numpy.random.uniform(-0.4, 0.4, (A_val.shape[1], 4)).astype("float32")
self.run_gpu_solve(A_val, x_val)
......@@ -536,12 +536,34 @@ def test_erfinvgpu():
assert numpy.allclose(f(xv), f2(xv))
def test_local_gpu_solve():
    """Check that ``slinalg.solve`` graphs are rewritten to use GpuSolve."""

    def check(a_shape, b_shape):
        # Build a random system A x = b with both operands on the GPU.
        a_np = numpy.random.uniform(-0.4, 0.4, a_shape).astype('float32')
        b_np = numpy.random.uniform(-0.4, 0.4, b_shape).astype('float32')
        a = cuda.shared_constructor(a_np, 'a')
        b = cuda.shared_constructor(b_np, 'b')
        f = pfunc([], tensor.slinalg.solve(a, b), mode=mode_with_gpu)
        # The compiled graph must contain the GPU solver op...
        node_op = f.maker.fgraph.toposort()[1].inputs[0].owner.op
        assert isinstance(node_op, cuda.cula.GpuSolve)
        # ...and the local optimizer must fire on a fresh CPU graph.
        assert cuda.opt.local_gpu_solve.transform(
            tensor.slinalg.solve(a, b).owner)
        out = f()
        # The solution must actually satisfy A x = b numerically.
        assert numpy.allclose(numpy.dot(a_np, out), b_np)

    check((6, 6), (6, 1))
    check((5, 5), (5, 3))
def test_local_gpu_dot_to_dot22dot():
def cmp(a_shp, b_shp):
a0 = numpy.random.rand(*a_shp).astype('float32')
a = cuda.shared_constructor(a0, 'a')
b0 = numpy.random.rand(*b_shp).astype('float32')
b = cuda.shared_constructor(b0, 'a')
b = cuda.shared_constructor(b0, 'b')
f = pfunc([], tensor.dot(a, b), mode=mode_with_gpu)
assert cuda.opt.local_gpu_dot_to_dot22.transform(
......
Markdown format supported
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to comment