提交 1b773bb2 authored 作者: lamblin's avatar lamblin

Merge pull request #334 from benanne/gpu_tensordot

gpu accelerated tensordot by conversion to matrix product
...@@ -2059,6 +2059,33 @@ class GpuContiguous(Op): ...@@ -2059,6 +2059,33 @@ class GpuContiguous(Op):
gpu_contiguous = GpuContiguous()
def tensordot(a, b, axes=2):
    """
    Implementation of tensordot that reduces to a regular matrix product.
    This allows tensordot to be GPU accelerated, which isn't possible with
    the default Theano implementation (which is just a wrapper around
    numpy.tensordot).

    Based on code from Tijmen Tieleman's gnumpy,
    http://www.cs.toronto.edu/~tijmen/gnumpy.html

    :param a: symbolic tensor, the left operand.
    :param b: symbolic tensor, the right operand.
    :param axes: either a scalar number N of axes to sum over (the trailing
        N axes of `a` against the leading N axes of `b`), or a pair
        (a_axes, b_axes) where each element is an axis or list/tuple of
        axes, matched pairwise, as in numpy.tensordot.
    :returns: symbolic tensor holding the tensordot of `a` and `b`.
    :raises ValueError: if `axes` is neither a scalar nor a pair, or if
        the two axis lists have different lengths.
    """
    if numpy.isscalar(axes):
        # 'axes' is a number of axes to multiply and sum over (trailing axes
        # of a, leading axes of b): collapse each operand into a matrix with
        # reshape, use a single dot, then restore the remaining dimensions.
        outshape = tensor.concatenate([a.shape[:a.ndim - axes],
                                       b.shape[axes:]])
        outndim = a.ndim + b.ndim - 2 * axes
        a_reshaped = a.reshape((tensor.prod(a.shape[:a.ndim - axes]),
                                tensor.prod(a.shape[a.ndim - axes:])))
        b_reshaped = b.reshape((tensor.prod(b.shape[:axes]),
                                tensor.prod(b.shape[axes:])))
        return tensor.dot(a_reshaped, b_reshaped).reshape(outshape,
                                                          ndim=outndim)
    elif len(axes) == 2:
        # 'axes' is a pair of axis lists: shuffle the axes of a and b so
        # this reduces to the scalar case above (note the recursion).
        # Like numpy.tensordot, also accept scalar entries (e.g.
        # axes=[1, 0]) by promoting them to length-1 tuples.
        a_other = ((axes[0],) if numpy.isscalar(axes[0])
                   else tuple(axes[0]))
        b_other = ((axes[1],) if numpy.isscalar(axes[1])
                   else tuple(axes[1]))
        if len(a_other) != len(b_other):
            raise ValueError("The two axis lists/tuples must have the same "
                             "length.")
        num_axes = len(a_other)
        # Move the summed-over axes of a to the end and those of b to the
        # front, preserving the requested pairwise matching order.
        a_order = (tuple(x for x in xrange(a.ndim) if x not in a_other)
                   + a_other)
        b_order = (b_other
                   + tuple(x for x in xrange(b.ndim) if x not in b_other))
        a_shuffled = a.dimshuffle(a_order)
        b_shuffled = b.dimshuffle(b_order)
        return tensordot(a_shuffled, b_shuffled, num_axes)
    else:
        raise ValueError("Axes should be scalar valued or a list/tuple of len 2.")
# Those are predefined CudaNdarrayType as done in tensor.basic
# Useful mostly for test as the gpu op are inserted automatically...
......
...@@ -870,6 +870,36 @@ def test_shared_cudandarray(): ...@@ -870,6 +870,36 @@ def test_shared_cudandarray():
a = cuda.shared_constructor(cuda.CudaNdarray.zeros((2,3)))
assert isinstance(a.type, tcn.CudaNdarrayType)
def test_tensordot_reshape():
    '''Check that the tensordot implementation based on dimshuffle, reshape
    and dot produces the same results as the default (numpy) version.'''
    # Build small test tensors as outer products of 1-d ramps.
    a = numpy.arange(20, dtype=theano.config.floatX) / 20.0
    b = numpy.arange(10, dtype=theano.config.floatX) / 10.0
    c = numpy.arange(5, dtype=theano.config.floatX) / 5.0
    d = numpy.arange(8, dtype=theano.config.floatX) / 8.0
    tensor1 = numpy.tensordot(a, numpy.tensordot(b, numpy.tensordot(c, d, 0), 0), 0)
    tensor2 = numpy.tensordot(c, numpy.tensordot(d, a, 0), 0)
    tensor3 = tensor2.swapaxes(1, 2).swapaxes(0, 2)  # d, a, c

    x = T.tensor4('x')
    y = T.tensor3('y')

    def compare(axes, lhs, rhs):
        # Compile the reference (T) and reshape-based (B) graphs on the
        # same inputs and check that their outputs agree.
        expected = theano.function([x, y], T.tensordot(x, y, axes=axes))(lhs, rhs)
        obtained = theano.function([x, y], B.tensordot(x, y, axes=axes))(lhs, rhs)
        assert numpy.allclose(expected, obtained)

    # case 1: number of axes to sum over
    compare(2, tensor1, tensor2)
    # case 2: axis pairs
    compare([(0, 3), (1, 0)], tensor1, tensor3)
    compare([(0, 3, 2), (1, 0, 2)], tensor1, tensor3)
class test_size(unittest.TestCase):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论