testgroup / pytensor · Commits

Commit e972e956
Authored Mar 27, 2015 by Pascal Lamblin
Merge pull request #2306 from caglar/theano_solve
Theano solve GPU Cula Op
Parents: 63b7b834, 5b2a9803
Showing 4 changed files with 259 additions and 1 deletion:
theano/sandbox/cuda/cula.py             +129  −0
theano/sandbox/cuda/opt.py               +34  −0
theano/sandbox/cuda/tests/test_cula.py   +69  −0
theano/sandbox/cuda/tests/test_opt.py    +27  −1
theano/sandbox/cuda/cula.py — new file (mode 100644)
import theano

from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import as_cuda_ndarray_variable
from theano.sandbox.cuda import cuda_ndarray

dimshuffle = cuda_ndarray.cuda_ndarray.dimshuffle

cula_available = False

try:
    from scikits.cuda import cula
    cula_available = True
except (ImportError, OSError):
    pass

cula_initialized = False


class GpuSolve(GpuOp):
    """
    CULA GPU solver OP.

    :param trans: Whether to take the transpose of the input matrix
        or not.
    """

    __props__ = ('trans',)

    def __init__(self, trans='N'):
        self.trans = trans
        super(GpuSolve, self).__init__()

    def output_type(self, inp):
        return CudaNdarrayType(broadcastable=[False] * inp.type.ndim)

    def make_node(self, inp1, inp2):
        inp1 = as_cuda_ndarray_variable(inp1)
        inp2 = as_cuda_ndarray_variable(inp2)

        assert inp1.ndim == 2
        assert inp2.ndim == 2
        return theano.Apply(self, [inp1, inp2],
                            [self.output_type(inp1)()])

    def make_thunk(self, node, storage_map, _, no_recycling=[]):
        # Initialize CULA the first time it is needed.
        global cula_initialized

        if cula_available and cula and not cula_initialized:
            cula.culaInitialize()
            cula_initialized = True

        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

        def thunk():
            # Output storage for the solution.
            z = outputs[0]

            # Matrix.
            A = inputs[0][0]

            # Solution vectors.
            b = inputs[1][0]

            # A is not explicitly converted between C and F order; instead
            # we switch the "transpose" flag.
            if self.trans in ('T', 'C'):
                trans = 'N'
            else:
                trans = 'T'

            # Convert b to F-order from C-order.
            b_cpy = dimshuffle(b, (1, 0)).reshape((b.shape[0], b.shape[1]))

            # This copy forces allocation of a new C-contiguous buffer
            # and returns it.
            A_cpy = A.copy()
            b_cpy = b_cpy.copy()

            def cula_gpu_solve(A_, b_, trans='T'):
                A_shape = A_.shape
                b_shape = b_.shape

                assert len(A_shape) == 2
                assert len(b_shape) == 2

                if trans in ['T', 'C']:
                    l, n = A_shape
                    k, m = b_shape
                    if n != k:
                        raise ValueError('A and b must be aligned.')
                elif trans in ['N']:
                    n, l = A_shape
                    k, m = b_shape
                    if l != m:
                        raise ValueError('A and b must be aligned.')
                else:
                    raise ValueError('Invalid value for trans')

                lda = max(1, n)
                ldb = max(1, n, l)

                # Construct the device pointers needed by culaDeviceSgels:
                # CULA requires a raw pointer for both A and b.
                A_ptr = A_.gpudata
                b_ptr = b_.gpudata

                cula.culaDeviceSgels(trans, n, l, m, A_ptr, lda, b_ptr, ldb)
                return A_, b_

            A_pycuda, b_pycuda = cula_gpu_solve(A_cpy, b_cpy, trans)

            # Convert b back to C-order from F-order and assign it
            # to the output:
            b_cpy = b_cpy.reshape(b.shape[::-1])
            b_cpy = dimshuffle(b_cpy, (1, 0))
            z[0] = b_cpy

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False

        return thunk


gpu_solve = GpuSolve()
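A note on the layout trick in thunk() above: CudaNdarrays are C-contiguous, while CULA, like LAPACK, expects column-major (Fortran-order) data. Reinterpreting a C-ordered buffer as column-major yields the transpose of the matrix, so instead of copying A into Fortran order the op simply flips the trans flag it passes to culaDeviceSgels. A minimal NumPy sketch of the same identity (illustration only, not part of the commit):

import numpy

numpy.random.seed(0)
# A well-conditioned C-ordered matrix, standing in for a CudaNdarray.
A = (numpy.random.rand(4, 4) + 4 * numpy.eye(4)).astype('float32')
b = numpy.random.rand(4, 1).astype('float32')

# Reinterpreting A's raw buffer in column-major order yields A.T ...
A_fortran_view = A.ravel(order='C').reshape(A.shape, order='F')
assert numpy.allclose(A_fortran_view, A.T)

# ... so a column-major solver handed A's buffer with a flipped
# transpose flag still solves the original system A x = b.
x = numpy.linalg.solve(A_fortran_view.T, b)
assert numpy.allclose(numpy.dot(A, x), b, atol=1e-4)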
theano/sandbox/cuda/opt.py
@@ -25,21 +25,27 @@ from theano.sandbox.cuda.basic_ops import (
    GpuSubtensor, GpuAdvancedSubtensor1,
    GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20,
    GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit)
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (
    gpu_dot22, gpu_dot22scalar, gpu_gemm_inplace, gpu_gemm_no_inplace,
    GpuConv, GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights,
    GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights)
from theano.sandbox.cuda.blas import gpu_gemv_inplace
from theano.sandbox.cuda.cula import gpu_solve
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace
from theano.sandbox.cuda.blas import gpu_ger_no_inplace
from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax,
                                      GpuDownsampleFactorMaxGrad,
                                      GpuDownsampleFactorMaxGradGrad)
from theano.sandbox.cuda.nnet import (
    GpuCrossentropySoftmaxArgmax1HotWithBias,
    GpuCrossentropySoftmax1HotWithBiasDx,
    GpuSoftmax, GpuSoftmaxWithBias)
from theano.sandbox.cuda.elemwise import SupportCodeError
from theano.scalar.basic_scipy import Erfinv
from theano.sandbox.cuda.elemwise import erfinv_gpu

@@ -47,7 +53,10 @@ from theano.sandbox.cuda.var import CudaNdarrayConstant
from theano.sandbox.cuda import gpu_optimizer, register_opt, gpu_seqopt, GpuOp
from theano.scan_module import scan_utils, scan_op, scan_opt
from theano.tensor.blas import _is_real_vector, _is_real_matrix
from theano.tensor import nlinalg
from theano.tensor import slinalg
from theano.tensor.nnet.Conv3D import Conv3D
try:
    ...

@@ -540,6 +549,31 @@ def local_gpu_dot22scalar(node):
    return False


@register_opt()
@local_optimizer([gpu_from_host, slinalg.Solve])
def local_gpu_solve(node):
    """
    gpu_from_host(CpuSolve) -> GpuSolve(gpu_from_host)
    CpuSolve(host_from_gpu) -> host_from_gpu(GpuSolve)
    """
    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if (host_input.owner and
                isinstance(host_input.owner.op, slinalg.Solve)):
            x, y = host_input.owner.inputs
            return [gpu_solve(gpu_from_host(x), gpu_from_host(y))]
    if isinstance(node.op, slinalg.Solve):
        if any([i.owner and isinstance(i.owner.op, HostFromGpu)
                for i in node.inputs]):
            x, y = node.inputs
            return [host_from_gpu(
                gpu_solve(gpu_from_host(x), gpu_from_host(y)))]
    return False


@register_opt()
@local_optimizer([gpu_from_host, tensor.blas_c.CGemv, tensor.blas.Gemv])
def local_gpu_gemv(node):
    ...
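To see the rewrite end to end, here is a minimal sketch (assuming a configured CUDA device with PyCUDA and CULA available; not part of the commit). Compiling a plain slinalg.solve with the GPU optimizations enabled should leave a GpuSolve node in the optimized graph, wrapped by the gpu_from_host/host_from_gpu transfers that local_gpu_solve inserts:

import theano
import theano.tensor as tensor

A = tensor.matrix('A', dtype='float32')
b = tensor.matrix('b', dtype='float32')

mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
f = theano.function([A, b], tensor.slinalg.solve(A, b), mode=mode_with_gpu)

# After optimization the CPU Solve node should have been replaced;
# the toposort is expected to contain a GpuSolve apply node.
print(f.maker.fgraph.toposort())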
theano/sandbox/cuda/tests/test_cula.py — new file (mode 100644)
import unittest

import numpy
import theano
from theano.tests import unittest_tools as utt

# Skip tests if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray
if not cuda_ndarray.cuda_available:
    raise SkipTest('Optional package cuda not available')

from theano.misc.pycuda_init import pycuda_available
from theano.sandbox.cuda.cula import cula_available

if not pycuda_available:
    raise SkipTest('Optional package pycuda not available')
if not cula_available:
    raise SkipTest('Optional package scikits.cuda.cula not available')

from theano.sandbox.cuda import cula

if theano.config.mode == 'FAST_COMPILE':
    mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
else:
    mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')


class TestCula(unittest.TestCase):

    def run_gpu_solve(self, A_val, x_val):
        b_val = numpy.dot(A_val, x_val)
        A = theano.tensor.matrix("A", dtype="float32")
        b = theano.tensor.matrix("b", dtype="float32")

        solver = cula.gpu_solve(A, b)
        fn = theano.function([A, b], [solver])
        res = fn(A_val, b_val)
        x_res = numpy.array(res[0])
        utt.assert_allclose(x_res, x_val)

    def test_diag_solve(self):
        numpy.random.seed(1)
        A_val = numpy.asarray([[2, 0, 0], [0, 1, 0], [0, 0, 1]],
                              dtype="float32")
        x_val = numpy.random.uniform(-0.4, 0.4,
                                     (A_val.shape[1], 1)).astype("float32")
        self.run_gpu_solve(A_val, x_val)

    def test_sym_solve(self):
        numpy.random.seed(1)
        A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        A_sym = (A_val + A_val.T) / 2.0
        x_val = numpy.random.uniform(-0.4, 0.4,
                                     (A_val.shape[1], 1)).astype("float32")
        self.run_gpu_solve(A_sym, x_val)

    def test_orth_solve(self):
        numpy.random.seed(1)
        A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        A_orth = numpy.linalg.svd(A_val)[0]
        x_val = numpy.random.uniform(-0.4, 0.4,
                                     (A_orth.shape[1], 1)).astype("float32")
        self.run_gpu_solve(A_orth, x_val)

    def test_uni_rand_solve(self):
        numpy.random.seed(1)
        A_val = numpy.random.uniform(-0.4, 0.4, (5, 5)).astype("float32")
        x_val = numpy.random.uniform(-0.4, 0.4,
                                     (A_val.shape[1], 4)).astype("float32")
        self.run_gpu_solve(A_val, x_val)
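Outside the unittest harness, calling the op directly looks like the sketch below (same prerequisites the skips above guard against: CUDA, PyCUDA and scikits.cuda's CULA bindings; illustration only, not part of the commit):

import numpy
import theano
from theano.sandbox.cuda import cula

A = theano.tensor.matrix('A', dtype='float32')
b = theano.tensor.matrix('b', dtype='float32')
fn = theano.function([A, b], cula.gpu_solve(A, b))

# Solve a small diagonal system; the output comes back as a
# CudaNdarray, so convert it for printing.
A_val = numpy.asarray([[2, 0], [0, 4]], dtype='float32')
b_val = numpy.asarray([[2], [8]], dtype='float32')
res = fn(A_val, b_val)
print(numpy.array(res))  # expected solution: [[1.], [2.]]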
theano/sandbox/cuda/tests/test_opt.py
@@ -536,12 +536,38 @@ def test_erfinvgpu():
    assert numpy.allclose(f(xv), f2(xv))


def test_local_gpu_solve():
    numpy.random.seed(1)

    def cmp(a_shp, b_shp):
        a0 = numpy.random.uniform(-0.4, 0.4, a_shp).astype('float32')
        a = cuda.shared_constructor(a0, 'a')
        b0 = numpy.random.uniform(-0.4, 0.4, b_shp).astype('float32')
        b = cuda.shared_constructor(b0, 'b')

        f = pfunc([], tensor.slinalg.solve(a, b), mode=mode_with_gpu)
        assert isinstance(f.maker.fgraph.toposort()[1].inputs[0].owner.op,
                          cuda.cula.GpuSolve)
        assert cuda.opt.local_gpu_solve.transform(
            tensor.slinalg.solve(a, b).owner)
        out = f()
        assert numpy.allclose(numpy.dot(a0, out), b0)

    cmp((6, 6), (6, 1))
    cmp((5, 5), (5, 1))


def test_local_gpu_dot_to_dot22dot():
    def cmp(a_shp, b_shp):
        a0 = numpy.random.rand(*a_shp).astype('float32')
        a = cuda.shared_constructor(a0, 'a')
        b0 = numpy.random.rand(*b_shp).astype('float32')
-       b = cuda.shared_constructor(b0, 'a')
+       b = cuda.shared_constructor(b0, 'b')
        f = pfunc([], tensor.dot(a, b), mode=mode_with_gpu)
        assert cuda.opt.local_gpu_dot_to_dot22.transform(
            ...