Commit 44020007 authored by Pascal Lamblin

Optimization to move tensordot to GPU

parent 120ad7d7
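For context, a minimal usage sketch (not part of the commit) of the effect this optimization aims for, assuming a CUDA-enabled Theano setup with THEANO_FLAGS=device=gpu,floatX=float32:

import theano
import theano.tensor as T

x = T.ftensor4('x')
y = T.ftensor3('y')
z = T.tensordot(x, y, axes=2)
f = theano.function([x, y], z)
# With the optimization applied, the compiled graph performs the
# contraction on the GPU and ends with a single host_from_gpu transfer,
# instead of containing a CPU TensorDot node.
theano.printing.debugprint(f)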
@@ -746,6 +746,37 @@ def local_gpu_print_op(node):
            return [host_from_gpu(new_op(gpu_x))]
    return False
@register_opt()
@local_optimizer([tensor.TensorDot])
def local_gpu_tensordot(node):
    '''
    T.tensordot(host_from_gpu) -> basic_ops.tensordot(host_from_gpu)

    There is no CUDA Op for tensordot; however, we can build a chain of
    CPU Ops implementing tensordot, and these Ops all have a GPU
    equivalent.

    Note: applying this optimization at this stage is not ideal, because
    all blas-related optimizations have already been applied. However,
    if we wanted to apply it before the blas optimizations, we would not
    yet know which variables will end up on the GPU.
    '''
    if (isinstance(node.op, tensor.TensorDot) and
            node.outputs[0].dtype == 'float32'):
        x, y = node.inputs
        if ((x.owner and
             x.owner.op == host_from_gpu and
             y.dtype == 'float32') or
            (y.owner and
             y.owner.op == host_from_gpu and
             x.dtype == 'float32')):
            axes = node.op.axes
            out = tensordot(x, y, axes=axes)
            return [out]
def cast(x, dtype):
    stype = scal.Scalar(dtype)
    cast_op = theano.tensor.Elemwise(scal.Identity(scal.specific_out(stype)))
...
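As an aside, the decomposition that the docstring of local_gpu_tensordot relies on can be sketched in plain NumPy (an illustrative sketch, not the commit's implementation): tensordot reduces to a transpose (dimshuffle), a reshape to 2-D, one matrix product (dot), and a reshape back, and each of these steps has a GPU equivalent.

import numpy

def tensordot_as_dot(x, y, x_axes, y_axes):
    # Move the contracted axes of x to the end and those of y to the front.
    x_free = [a for a in range(x.ndim) if a not in x_axes]
    y_free = [a for a in range(y.ndim) if a not in y_axes]
    xt = x.transpose(x_free + list(x_axes))
    yt = y.transpose(list(y_axes) + y_free)
    # Collapse to 2-D so the whole contraction is one matrix product.
    k = int(numpy.prod([x.shape[a] for a in x_axes]))
    out = xt.reshape(-1, k).dot(yt.reshape(k, -1))
    return out.reshape([x.shape[a] for a in x_free] +
                       [y.shape[a] for a in y_free])

rng = numpy.random.RandomState(0)
a = rng.rand(20, 10, 5, 8).astype('float32')
b = rng.rand(5, 8, 20).astype('float32')
assert numpy.allclose(tensordot_as_dot(a, b, (2, 3), (0, 1)),
                      numpy.tensordot(a, b, axes=2), atol=1e-3)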
import sys, time, unittest
import numpy

# Skip test if cuda_ndarray is not available.
@@ -8,6 +8,8 @@ from theano.compile.pfunc import pfunc
from theano import config, tensor
import theano
from theano.tests import unittest_tools as utt
import theano.sandbox.cuda as cuda

if cuda.cuda_available == False:
    raise SkipTest('Optional package cuda disabled')
@@ -246,6 +248,38 @@ def test_elemwise_fusion():
    f(theano._asarray(numpy.random.rand(*shape), dtype='float32'), theano._asarray(numpy.random.rand(*shape), dtype='float32'))
class test_local_gpu_tensordot(unittest.TestCase):
    def setUp(self):
        self.rng = numpy.random.RandomState(utt.fetch_seed())

    def test_transfer(self):
        tensor1 = self.rng.rand(20, 10, 5, 8).astype('float32')
        tensor2 = self.rng.rand(5, 8, 20).astype('float32')
        tensor3 = self.rng.rand(8, 20, 5).astype('float32')
        x = tensor.ftensor4('x')
        y = tensor.ftensor3('y')

        tdot1 = tensor.tensordot(x, y, 2)
        f1 = theano.function([x, y], tdot1, mode=mode_with_gpu)
        topo1 = f1.maker.env.toposort()
        assert topo1[-1].op == cuda.host_from_gpu
        # Let DebugMode check the computed values
        f1(tensor1, tensor2)

        tdot2 = tensor.tensordot(x, y, axes=[(0, 3), (1, 0)])
        f2 = theano.function([x, y], tdot2, mode=mode_with_gpu)
        topo2 = f2.maker.env.toposort()
        assert topo2[-1].op == cuda.host_from_gpu
        f2(tensor1, tensor3)

        tdot3 = tensor.tensordot(x, y, axes=[(0, 3, 2), (1, 0, 2)])
        f3 = theano.function([x, y], tdot3, mode=mode_with_gpu)
        topo3 = f3.maker.env.toposort()
        assert topo3[-1].op == cuda.host_from_gpu
        f3(tensor1, tensor3)
if __name__ == '__main__':
    test_gpualloc()
    test_opt_gpujoin_onlyajoin()
...
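One possible follow-up (a hypothetical helper, not part of this commit): the toposort check repeated three times in test_transfer could be factored out and tightened to also assert that no CPU TensorDot node survives the optimization.

# Hypothetical helper, assuming the same imports as the test file above
# (theano.tensor as tensor, theano.sandbox.cuda as cuda).
def assert_tensordot_on_gpu(f):
    topo = f.maker.env.toposort()
    # The result should come back to the host exactly once, at the end.
    assert topo[-1].op == cuda.host_from_gpu
    # The CPU TensorDot op itself should have been optimized away.
    assert not any(isinstance(n.op, tensor.TensorDot) for n in topo)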