Merge pull request #410 from lamblin/gpu_tensordot

Optimization to move tensordot to GPU

Merge pull request #410 from lamblin/gpu_tensordot
41944823 · nouiz · 49a5a90e · 57245521 · 41944823 · 41944823
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -126,6 +126,10 @@ New features:
 * sparse_variable[M:N, O:P] now works (Li Yao, Frederic)
    * Warning: M, N, O, and P should be Python int or scalar tensor variables,
      in particular, None is not well-supported.
+ * tensor.tensordot can now be moved to GPU (Sander Dieleman,
+   Pascal, based on code from Tijmen Tieleman's gnumpy,
+   http://www.cs.toronto.edu/~tijmen/gnumpy.html)
 New optimizations:
 * AdvancedSubtensor1 reuses preallocated memory if available (scan, c|py_nogc linker) (Frederic)

--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -2059,32 +2059,53 @@ class GpuContiguous(Op):
 gpu_contiguous = GpuContiguous()
 def tensordot(a, b, axes=2):
    """
-    implementation of tensordot that reduces to a regular matrix product. This allows tensordot to be GPU accelerated,
+    Implementation of tensordot that reduces to a regular matrix product.
-    which isn't possible with the default Theano implementation (which is just a wrapper around numpy.tensordot).
-    based on code from Tijmen Tieleman's gnumpy http://www.cs.toronto.edu/~tijmen/gnumpy.html
+    This allows tensordot to be GPU accelerated, which isn't possible
+    with the default Theano implementation (which is just a wrapper
+    around numpy.tensordot). Based on code from Tijmen Tieleman's gnumpy
+    http://www.cs.toronto.edu/~tijmen/gnumpy.html
    """
    if numpy.isscalar(axes):
        # if 'axes' is a number of axes to multiply and sum over (trailing axes
-        # of a, leading axes of b), we can just reshape and use dot.         
+        # of a, leading axes of b), we can just reshape and use dot.
-        outshape = tensor.concatenate([a.shape[:a.ndim - axes], b.shape[axes:]])
+        outshape = tensor.concatenate([a.shape[:a.ndim - axes],
-        outndim = a.ndim + b.ndim - 2*axes
+                                      b.shape[axes:]])
-        a_reshaped = a.reshape((tensor.prod(a.shape[:a.ndim - axes]), tensor.prod(a.shape[a.ndim - axes:])))
+        outndim = a.ndim + b.ndim - (2 * axes)
-        b_reshaped = b.reshape((tensor.prod(b.shape[:axes]), tensor.prod(b.shape[axes:])))
+        a_reshaped = a.reshape((tensor.prod(a.shape[:a.ndim - axes]),
-        return tensor.dot(a_reshaped, b_reshaped).reshape(outshape, ndim=outndim)
+                                tensor.prod(a.shape[a.ndim - axes:])))
+        b_reshaped = b.reshape((tensor.prod(b.shape[:axes]),
+                                tensor.prod(b.shape[axes:])))
+        assert a_reshaped.ndim == 2
+        assert b_reshaped.ndim == 2
+        # We use _dot22 here because:
+        #   - we know that the number of dimensions will be 2
+        #   - it makes it possible for the computation to be moved to GPU
+        # When cuda.opt.local_gpu_tensordot is applied, it is too late
+        # for the usual blas optimizations to take place.
+        # This will change if we decide to get rid of tensor.tensordot,
+        # and always use this version.
+        return tensor.blas._dot22(a_reshaped, b_reshaped).reshape(
+                outshape, ndim=outndim)
    elif len(axes) == 2:
        # if 'axes' is a pair of axis lists, we first shuffle the axes of a and
        # b to reduce this to the first case (note the recursion).
        a_other, b_other = tuple(axes[0]), tuple(axes[1])
        num_axes = len(a_other)
-        a_order = tuple(x for x in tuple(xrange(a.ndim)) if x not in a_other) + a_other
+        a_order = (tuple(x for x in tuple(xrange(a.ndim)) if x not in a_other)
-        b_order = b_other + tuple(x for x in tuple(xrange(b.ndim)) if x not in b_other)
+                + a_other)
+        b_order = (b_other
+                + tuple(x for x in tuple(xrange(b.ndim)) if x not in b_other))
        a_shuffled = a.dimshuffle(a_order)
        b_shuffled = b.dimshuffle(b_order)
        return tensordot(a_shuffled, b_shuffled, num_axes)
    else:
-        raise ValueError("Axes should be scalar valued or a list/tuple of len 2.")
+        raise ValueError(
+            "Axes should be scalar valued or a list/tuple of len 2.",
+            axes)
 # Those are predefined CudaNdarrayType as done in tensor.basic
 # Useful mostly for tests, as the GPU ops are inserted automatically...

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -746,6 +746,36 @@ def local_gpu_print_op(node):
            return [host_from_gpu(new_op(gpu_x))]
    return False
+@register_opt()
+@local_optimizer([tensor.TensorDot])
+def local_gpu_tensordot(node):
+    '''
+    T.tensordot(host_from_gpu) -> basic_ops.tensordot(host_from_gpu)
+    There is no Cuda Op for tensordot, however we can build a chain of
+    CPU Ops implementing tensordot. These Ops all have a GPU equivalent.
+    Note: applying this optimization at that stage is not ideal, because
+    all blas-related optimizations have already been applied.
+    However, if we want to apply it before the blas optimizations, then
+    we don't know which variables may end up on the GPU or not.
+    '''
+    if (isinstance(node.op, tensor.TensorDot) and
+            node.outputs[0].dtype == 'float32'):
+        x, y = node.inputs
+        if ((x.owner and
+                x.owner.op == host_from_gpu and
+                y.dtype=='float32') or
+            (y.owner and
+                y.owner.op == host_from_gpu and
+                x.dtype=='float32')):
+            axes = node.op.axes
+            out = tensordot(x, y, axes=axes)
+            return [out]
 def cast(x, dtype):
    stype = scal.Scalar(dtype)
    cast_op = theano.tensor.Elemwise(scal.Identity(scal.specific_out(stype)))

--- a/theano/sandbox/cuda/tests/test_basic_ops.py
+++ b/theano/sandbox/cuda/tests/test_basic_ops.py
@@ -870,35 +870,53 @@ def test_shared_cudandarray():
    a = cuda.shared_constructor(cuda.CudaNdarray.zeros((2,3)))
    assert isinstance(a.type, tcn.CudaNdarrayType)
-def test_tensordot_reshape():
-    '''Test that the tensordot implementation using dimshuffle, reshape and dot
+class test_tensordot_reshape(unittest.TestCase):
-    gives the same results as the default (numpy) version'''
+    '''Test alternative tensordot implementation.
-    # define some tensors
-    a = numpy.arange(20, dtype=theano.config.floatX) / 20.0
+    Test that the tensordot implementation using dimshuffle, reshape and dot
-    b = numpy.arange(10, dtype=theano.config.floatX) / 10.0
+    gives the same results as the default (numpy) version.
-    c = numpy.arange(5, dtype=theano.config.floatX) / 5.0
+    '''
-    d = numpy.arange(8, dtype=theano.config.floatX) / 8.0
+    def setUp(self):
-    tensor1 = numpy.tensordot(a, numpy.tensordot(b, numpy.tensordot(c, d, 0), 0), 0)
+        self.rng = numpy.random.RandomState(utt.fetch_seed())
-    tensor2 = numpy.tensordot(c, numpy.tensordot(d, a, 0), 0)
-    tensor3 = tensor2.swapaxes(1, 2).swapaxes(0, 2) # d, a, c
+    def test1(self):
+        # define some tensors
-    x = T.tensor4('x')
+        tensor1 = self.rng.rand(20, 10, 5, 8).astype(theano.config.floatX)
-    y = T.tensor3('y')
+        tensor2 = self.rng.rand(5, 8, 20).astype(theano.config.floatX)
+        tensor3 = self.rng.rand(8, 20, 5).astype(theano.config.floatX)
-    # case 1: number of axes to sum over
-    default1 = theano.function([x,y], T.tensordot(x, y, 2))(tensor1, tensor2)
+        x = T.tensor4('x')
-    reshape1 = theano.function([x,y], B.tensordot(x, y, 2))(tensor1, tensor2)
+        y = T.tensor3('y')
-    assert numpy.allclose(default1, reshape1)
+        # case 1: number of axes to sum over
-    # case 2: axis pairs
+        default1 = theano.function([x, y], T.tensordot(x, y, 2))(
-    default2 = theano.function([x,y], T.tensordot(x, y, axes=[(0, 3), (1, 0)]))(tensor1, tensor3)
+                tensor1, tensor2)
-    reshape2 = theano.function([x,y], B.tensordot(x, y, axes=[(0, 3), (1, 0)]))(tensor1, tensor3)
+        reshape1 = theano.function([x, y], B.tensordot(x, y, 2))(
-    assert numpy.allclose(default2, reshape2)
+                tensor1, tensor2)
+        assert numpy.allclose(default1, reshape1)
-    default3 = theano.function([x,y], T.tensordot(x, y, axes=[(0, 3, 2), (1, 0, 2)]))(tensor1, tensor3)
-    reshape3 = theano.function([x,y], B.tensordot(x, y, axes=[(0, 3, 2), (1, 0, 2)]))(tensor1, tensor3)
+        # case 2: axis pairs
-    assert numpy.allclose(default3, reshape3)
+        default2 = theano.function(
+                [x, y],
+                T.tensordot(x, y, axes=[(0, 3), (1, 0)])
+                )(tensor1, tensor3)
+        reshape2 = theano.function(
+                [x, y],
+                B.tensordot(x, y, axes=[(0, 3), (1, 0)])
+                )(tensor1, tensor3)
+        assert numpy.allclose(default2, reshape2)
+        default3 = theano.function(
+                [x, y],
+                T.tensordot(x, y, axes=[(0, 3, 2), (1, 0, 2)])
+                )(tensor1, tensor3)
+        reshape3 = theano.function(
+                [x, y],
+                B.tensordot(x, y, axes=[(0, 3, 2), (1, 0, 2)])
+                )(tensor1, tensor3)
+        assert numpy.allclose(default3, reshape3)
 class test_size(unittest.TestCase):

--- a/theano/sandbox/cuda/tests/test_opt.py
+++ b/theano/sandbox/cuda/tests/test_opt.py
-import sys, time
+import sys, time, unittest
 import numpy
 # Skip test if cuda_ndarray is not available.
@@ -8,6 +8,8 @@ from theano.compile.pfunc import pfunc
 from theano import config, tensor
 import theano
+from theano.tests import unittest_tools as utt
 import theano.sandbox.cuda as cuda
 if cuda.cuda_available == False:
    raise SkipTest('Optional package cuda disabled')
@@ -246,6 +248,38 @@ def test_elemwise_fusion():
    f(theano._asarray(numpy.random.rand(*shape), dtype='float32'), theano._asarray(numpy.random.rand(*shape), dtype='float32'))
+class test_local_gpu_tensordot(unittest.TestCase):
+    def setUp(self):
+        self.rng = numpy.random.RandomState(utt.fetch_seed())
+    def test_transfer(self):
+        tensor1 = self.rng.rand(20, 10, 5, 8).astype('float32')
+        tensor2 = self.rng.rand(5, 8, 20).astype('float32')
+        tensor3 = self.rng.rand(8, 20, 5).astype('float32')
+        x = tensor.ftensor4('x')
+        y = tensor.ftensor3('y')
+        tdot1 = tensor.tensordot(x, y, 2)
+        f1 = theano.function([x, y], tdot1, mode=mode_with_gpu)
+        topo1 = f1.maker.env.toposort()
+        assert topo1[-1].op == cuda.host_from_gpu
+        # Let DebugMode debug
+        f1(tensor1, tensor2)
+        tdot2 = tensor.tensordot(x, y, axes=[(0, 3), (1, 0)])
+        f2 = theano.function([x, y], tdot2, mode=mode_with_gpu)
+        topo2 = f2.maker.env.toposort()
+        assert topo2[-1].op == cuda.host_from_gpu
+        f2(tensor1, tensor3)
+        tdot3 = tensor.tensordot(x, y, axes=[(0, 3, 2), (1, 0, 2)])
+        f3 = theano.function([x, y], tdot3, mode=mode_with_gpu)
+        topo3 = f3.maker.env.toposort()
+        assert topo3[-1].op == cuda.host_from_gpu
+        f3(tensor1, tensor3)
 if __name__ == '__main__':
    test_gpualloc()
    test_opt_gpujoin_onlyajoin()