Commit d13064f7 authored by lamblin

Merge pull request #986 from jlowin/dot_test

Ready for merge: Streamline dot & tensordot / Allow dot products of n-dimensional variables
@@ -1202,19 +1202,94 @@ Linear Algebra

    :return: vector-vector outer product

-.. function:: tensordot(X, Y, axes=2)
-
-    This is a symbolic standing for ``numpy.tensordot``.
-
-    :param X: left term
-    :param Y: right term
-    :param axes: sum out these axes from X and Y.
-    :type X: symbolic tensor
-    :type Y: symbolic tensor
-    :type axes: see numpy.tensordot
-    :return: tensor product
-
-    Theano's implementation is identical to NumPy's.
+.. function:: tensordot(a, b, axes=2)
+
+    Given two tensors a and b, tensordot computes a generalized dot product over
+    the provided axes. Theano's implementation reduces all expressions to
+    matrix or vector dot products and is based on code from Tijmen Tieleman's
+    gnumpy (http://www.cs.toronto.edu/~tijmen/gnumpy.html).
+
+    :param a: the first tensor variable
+    :type a: symbolic tensor
+    :param b: the second tensor variable
+    :type b: symbolic tensor
+    :param axes: an integer or array. If an integer, the number of axes
+                 to sum over. If an array, it must have two array
+                 elements containing the axes to sum over in each tensor.
+
+                 Note that the default value of 2 is not guaranteed to work
+                 for all values of a and b, and an error will be raised if
+                 that is the case. The reason for keeping the default is to
+                 maintain the same signature as numpy's tensordot function
+                 (and np.tensordot raises analogous errors for non-compatible
+                 inputs).
+
+                 If an integer i, it is converted to an array containing
+                 the last i dimensions of the first tensor and the first
+                 i dimensions of the second tensor:
+
+                     axes = [range(a.ndim - i, a.ndim), range(i)]
+
+                 If an array, its two elements must contain compatible axes
+                 of the two tensors. For example, [[1, 2], [2, 0]] means sum
+                 over the 2nd and 3rd axes of a and the 3rd and 1st axes of b.
+                 (Remember axes are zero-indexed!) The 2nd axis of a and the
+                 3rd axis of b must have the same shape; the same is true for
+                 the 3rd axis of a and the 1st axis of b.
+    :type axes: int or array-like of length 2
+    :returns: a tensor with shape equal to the concatenation of a's shape
+              (less any dimensions that were summed over) and b's shape
+              (less any dimensions that were summed over).
+    :rtype: symbolic tensor
+
+    It may be helpful to consider an example to see what tensordot does.
+    Here a has shape (2, 3, 4) and b has shape (5, 6, 4, 3). The axes to
+    sum over are [[1, 2], [3, 2]] -- note that a.shape[1] == b.shape[3] and
+    a.shape[2] == b.shape[2]; these axes are compatible. The resulting tensor
+    will have shape (2, 5, 6) -- the dimensions that are not being summed:
+
+        a = np.random.random((2, 3, 4))
+        b = np.random.random((5, 6, 4, 3))
+
+        # tensordot
+        c = np.tensordot(a, b, [[1, 2], [3, 2]])
+
+        # loop replicating tensordot
+        a0, a1, a2 = a.shape
+        b0, b1, _, _ = b.shape
+        cloop = np.zeros((a0, b0, b1))
+
+        # loop over non-summed indices -- these exist in the tensor product
+        for i in range(a0):
+            for j in range(b0):
+                for k in range(b1):
+                    # loop over summed indices -- these don't exist
+                    # in the tensor product
+                    for l in range(a1):
+                        for m in range(a2):
+                            cloop[i, j, k] += a[i, l, m] * b[j, k, m, l]
+
+        np.allclose(c, cloop)  # True
+
+    This specific implementation avoids a loop by transposing a and b such that
+    the summed axes of a are last and the summed axes of b are first. The
+    resulting arrays are reshaped to 2 dimensions (or left as vectors, if
+    appropriate) and a matrix or vector dot product is taken. The result is
+    reshaped back to the required output dimensions.
+
+    In an extreme case, no axes may be specified. The resulting tensor
+    will have shape equal to the concatenation of the shapes of a and b:
+
+        c = np.tensordot(a, b, 0)
+        print(a.shape)  # (2, 3, 4)
+        print(b.shape)  # (5, 6, 4, 3)
+        print(c.shape)  # (2, 3, 4, 5, 6, 4, 3)
+
+    See the documentation of numpy.tensordot for more examples.

.. function:: batched_dot(X, Y)
......
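To complement the NumPy examples in the new documentation above, the following is a small editorial sketch (not part of the diff) showing how the same kind of contraction is expressed with Theano symbolic variables and checked against numpy.tensordot; all variable names and shapes are illustrative only:

    # Editorial sketch: symbolic tensordot in Theano, checked against numpy.
    import numpy as np
    import theano
    import theano.tensor as T

    x = T.tensor3('x')  # will receive an array of shape (2, 3, 4)
    y = T.tensor3('y')  # will receive an array of shape (3, 4, 5)

    # Integer axes: sum over the last 2 axes of x and the first 2 axes of y.
    z_int = T.tensordot(x, y, axes=2)
    # Explicit axis pairs spelling out the same contraction.
    z_pairs = T.tensordot(x, y, axes=[(1, 2), (0, 1)])

    f = theano.function([x, y], [z_int, z_pairs])

    a = np.random.random((2, 3, 4)).astype(theano.config.floatX)
    b = np.random.random((3, 4, 5)).astype(theano.config.floatX)
    out_int, out_pairs = f(a, b)

    print(np.allclose(out_int, np.tensordot(a, b, axes=2)))                   # True
    print(np.allclose(out_pairs, np.tensordot(a, b, axes=[(1, 2), (0, 1)])))  # True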
@@ -243,8 +243,8 @@ class TestComputeTestValue(unittest.TestCase):
        except ValueError, e:
            # Get traceback
            tb = sys.exc_info()[2]
-            # Get frame info 3 layers up
-            frame_info = traceback.extract_tb(tb)[-4]
+            # Get frame info 4 layers up
+            frame_info = traceback.extract_tb(tb)[-5]
            # We should be in the "fx" function defined above
            assert os.path.split(frame_info[0])[1] == 'test_compute_test_value.py'
            assert frame_info[2] == 'fx'
......
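The hunk above bumps the traceback index from [-4] to [-5] because the call chain gained one layer. As a stand-alone illustration (editorial, not from the PR) of how traceback.extract_tb orders frames -- index -1 is the innermost frame, so each extra wrapper layer moves the frame of interest one entry further from the end:

    # Editorial illustration of traceback.extract_tb frame ordering.
    import sys
    import traceback

    def inner():
        raise ValueError("boom")

    def middle():
        inner()

    def outer():
        middle()

    try:
        outer()
    except ValueError:
        tb = sys.exc_info()[2]
        frames = traceback.extract_tb(tb)
        # frames[-1] is inner, frames[-2] is middle, frames[-3] is outer,
        # frames[-4] is the module-level try block.
        print([f[2] for f in frames])  # ['<module>', 'outer', 'middle', 'inner']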
@@ -2845,54 +2845,6 @@ class GpuContiguous(GpuOp):
gpu_contiguous = GpuContiguous()
def tensordot(a, b, axes=2):
    """
    Implementation of tensordot that reduces to a regular matrix product.
    This allows tensordot to be GPU accelerated, which isn't possible
    with the default Theano implementation (which is just a wrapper
    around numpy.tensordot). based on code from Tijmen Tieleman's gnumpy
    http://www.cs.toronto.edu/~tijmen/gnumpy.html
    """
    if numpy.isscalar(axes):
        # if 'axes' is a number of axes to multiply and sum over (trailing
        # axes of a, leading axes of b), we can just reshape and use dot.
        outshape = tensor.concatenate([a.shape[:a.ndim - axes],
                                       b.shape[axes:]])
        outndim = a.ndim + b.ndim - (2 * axes)
        a_reshaped = a.reshape((tensor.prod(a.shape[:a.ndim - axes]),
                                tensor.prod(a.shape[a.ndim - axes:])))
        b_reshaped = b.reshape((tensor.prod(b.shape[:axes]),
                                tensor.prod(b.shape[axes:])))
        assert a_reshaped.ndim == 2
        assert b_reshaped.ndim == 2
        # We use _dot22 here because:
        # - we know that the number of dimensions will be 2
        # - it makes it possible for the computation to be moved to GPU
        # When cuda.opt.local_gpu_tensordot is applied, it is too late
        # for the usual blas optimizations to take place.
        # This will change if we decide to get rid of tensor.tensordot,
        # and always use this version.
        return tensor.blas._dot22(a_reshaped, b_reshaped).reshape(
            outshape, ndim=outndim)
    elif len(axes) == 2:
        # if 'axes' is a pair of axis lists, we first shuffle the axes of a
        # and b to reduce this to the first case (note the recursion).
        a_other, b_other = tuple(axes[0]), tuple(axes[1])
        num_axes = len(a_other)
        a_order = (tuple(x for x in tuple(xrange(a.ndim)) if x not in a_other)
                   + a_other)
        b_order = (b_other
                   + tuple(x for x in tuple(xrange(b.ndim)) if x not in b_other))
        a_shuffled = a.dimshuffle(a_order)
        b_shuffled = b.dimshuffle(b_order)
        return tensordot(a_shuffled, b_shuffled, num_axes)
    else:
        raise ValueError(
            "Axes should be scalar valued or a list/tuple of len 2.",
            axes)
# Those are predifined CudaNdarrayType as done in tensor.basic
# Useful mostly for test as the gpu op are inserted automatically...
def scalar(name=None, dtype=None):
......
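The removed GPU tensordot above reduces the contraction to a single 2-D matrix product. The following NumPy-only sketch (editorial, not part of the diff) replays the same two steps -- reshape for integer axes, transpose-then-recurse for axis pairs -- and checks them against numpy.tensordot; shapes and names are illustrative:

    # Editorial NumPy sketch of the reshape-to-matrix-product strategy.
    import numpy as np

    a = np.random.random((2, 3, 4))
    b = np.random.random((3, 4, 5))
    axes = 2  # sum over the last 2 axes of a and the first 2 axes of b

    # Integer-axes case: collapse non-summed and summed axes of each operand
    # into one dimension each, so the contraction is a plain matrix product.
    a2 = a.reshape(int(np.prod(a.shape[:a.ndim - axes])),
                   int(np.prod(a.shape[a.ndim - axes:])))
    b2 = b.reshape(int(np.prod(b.shape[:axes])),
                   int(np.prod(b.shape[axes:])))
    out = np.dot(a2, b2).reshape(a.shape[:a.ndim - axes] + b.shape[axes:])
    print(np.allclose(out, np.tensordot(a, b, axes)))  # True

    # Axis-pairs case: transpose so the summed axes of a are last and the
    # summed axes of c are first, which reduces it to the integer-axes case.
    c = np.random.random((4, 2, 5))
    pairs = [(2, 0), (0, 1)]  # pair a's axes (2, 0) with c's axes (0, 1)
    a_order = [ax for ax in range(a.ndim) if ax not in pairs[0]] + list(pairs[0])
    c_order = list(pairs[1]) + [ax for ax in range(c.ndim) if ax not in pairs[1]]
    out2 = np.tensordot(a.transpose(a_order), c.transpose(c_order), len(pairs[0]))
    print(np.allclose(out2, np.tensordot(a, c, axes=pairs)))  # True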
@@ -891,35 +891,6 @@ def local_gpu_print_op(node):
    return False
@register_opt()
@local_optimizer([tensor.TensorDot])
def local_gpu_tensordot(node):
    '''
    T.tensordot(host_from_gpu) -> basic_ops.tensordot(host_from_gpu)

    There is no Cuda Op for tensordot, however we can build a chain of
    CPU Ops implementing tensordot. These Ops all have a GPU equivalent.

    Note: applying this optimization at that stage is not ideal, because
    all blas-related optimizations have already been applied.
    However, if we want to apply it before the blas optimizations, then
    we don't know which variables may end up on the GPU or not.
    '''
    if (isinstance(node.op, tensor.TensorDot) and
            node.outputs[0].dtype == 'float32'):
        x, y = node.inputs
        if ((x.owner and
             x.owner.op == host_from_gpu and
             y.dtype == 'float32') or
            (y.owner and
             y.owner.op == host_from_gpu and
             x.dtype == 'float32')):
            axes = node.op.axes
            out = tensordot(x, y, axes=axes)
            return [out]
def cast(x, dtype):
    stype = scal.Scalar(dtype)
    cast_op = theano.tensor.Elemwise(scal.Identity(scal.specific_out(stype)))
......
@@ -1129,54 +1129,6 @@ def test_shared_cudandarray():
    assert isinstance(a.type, tcn.CudaNdarrayType)
class test_tensordot_reshape(unittest.TestCase):
    '''Test alternative tensordot implementation.

    Test that the tensordot implementation using dimshuffle, reshape and dot
    gives the same results as the default (numpy) version.
    '''
    def setUp(self):
        self.rng = numpy.random.RandomState(utt.fetch_seed())

    def test1(self):
        # define some tensors
        tensor1 = self.rng.rand(20, 10, 5, 8).astype(theano.config.floatX)
        tensor2 = self.rng.rand(5, 8, 20).astype(theano.config.floatX)
        tensor3 = self.rng.rand(8, 20, 5).astype(theano.config.floatX)
        x = T.tensor4('x')
        y = T.tensor3('y')

        # case 1: number of axes to sum over
        default1 = theano.function([x, y], T.tensordot(x, y, 2))(
            tensor1, tensor2)
        reshape1 = theano.function([x, y], B.tensordot(x, y, 2))(
            tensor1, tensor2)
        assert numpy.allclose(default1, reshape1)

        # case 2: axis pairs
        default2 = theano.function(
            [x, y],
            T.tensordot(x, y, axes=[(0, 3), (1, 0)])
        )(tensor1, tensor3)
        reshape2 = theano.function(
            [x, y],
            B.tensordot(x, y, axes=[(0, 3), (1, 0)])
        )(tensor1, tensor3)
        assert numpy.allclose(default2, reshape2)

        default3 = theano.function(
            [x, y],
            T.tensordot(x, y, axes=[(0, 3, 2), (1, 0, 2)])
        )(tensor1, tensor3)
        reshape3 = theano.function(
            [x, y],
            B.tensordot(x, y, axes=[(0, 3, 2), (1, 0, 2)])
        )(tensor1, tensor3)
        assert numpy.allclose(default3, reshape3)
class test_size(unittest.TestCase):
    """
......
@@ -333,38 +333,6 @@ def test_elemwise_fusion():
        theano._asarray(numpy.random.rand(*shape), dtype='float32'))
class test_local_gpu_tensordot(unittest.TestCase):
    def setUp(self):
        self.rng = numpy.random.RandomState(utt.fetch_seed())

    def test_transfer(self):
        tensor1 = self.rng.rand(20, 10, 5, 8).astype('float32')
        tensor2 = self.rng.rand(5, 8, 20).astype('float32')
        tensor3 = self.rng.rand(8, 20, 5).astype('float32')
        x = tensor.ftensor4('x')
        y = tensor.ftensor3('y')

        tdot1 = tensor.tensordot(x, y, 2)
        f1 = theano.function([x, y], tdot1, mode=mode_with_gpu)
        topo1 = f1.maker.fgraph.toposort()
        assert topo1[-1].op == cuda.host_from_gpu
        # Let DebugMode debug
        f1(tensor1, tensor2)

        tdot2 = tensor.tensordot(x, y, axes=[(0, 3), (1, 0)])
        f2 = theano.function([x, y], tdot2, mode=mode_with_gpu)
        topo2 = f2.maker.fgraph.toposort()
        assert topo2[-1].op == cuda.host_from_gpu
        f2(tensor1, tensor3)

        tdot3 = tensor.tensordot(x, y, axes=[(0, 3, 2), (1, 0, 2)])
        f3 = theano.function([x, y], tdot3, mode=mode_with_gpu)
        topo3 = f3.maker.fgraph.toposort()
        assert topo3[-1].op == cuda.host_from_gpu
        f3(tensor1, tensor3)
import theano.tests.test_ifelse
......
Diff is collapsed.
@@ -1538,11 +1538,11 @@ class Dot22(GemmRelated):
_dot22 = Dot22()
-@local_optimizer([T.dot])
+@local_optimizer([T._dot])
def local_dot_to_dot22(node):
    # This works for tensor.outer too because basic.outer is a macro that
    # produces a dot(dimshuffle,dimshuffle) of form 4 below
-    if node.op != T.dot:
+    if not isinstance(node.op, T.Dot):
        return
    x, y = node.inputs
......
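The change above replaces an identity comparison with an isinstance check. A hedged illustration of the difference (editorial, not part of the diff), assuming the post-PR behaviour in which T.dot is a Python helper while graph nodes carry instances of the T.Dot Op:

    # Editorial sketch: identity check vs class check on a graph node's op.
    import theano
    import theano.tensor as T

    x = T.matrix('x')
    y = T.matrix('y')
    node = T.dot(x, y).owner

    print(node.op == T.dot)            # False: node.op is an Op instance, T.dot a function
    print(isinstance(node.op, T.Dot))  # True: the check used by the updated optimizers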
@@ -416,7 +416,8 @@ def local_lift_transpose_through_dot(node):
    if not (isinstance(node.op, T.DimShuffle)
            and node.op.new_order == (1, 0)):
        return False
-    if not (node.inputs[0].owner and node.inputs[0].owner.op == T.dot):
+    if not (node.inputs[0].owner
+            and isinstance(node.inputs[0].owner.op, T.Dot)):
        return False
    x, y = node.inputs[0].owner.inputs
......
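The local_lift_transpose_through_dot rewrite touched above appears to rely on the identity (XY)^T = Y^T X^T; a quick NumPy check (editorial, not part of the diff):

    # Editorial check: the transpose of a product is the product of
    # transposes in reverse order.
    import numpy as np

    X = np.random.random((3, 4))
    Y = np.random.random((4, 5))
    print(np.allclose(np.dot(X, Y).T, np.dot(Y.T, X.T)))  # True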
@@ -14,7 +14,6 @@ from numpy import (arange, array, common_type, complex64, complex128, float32,
from numpy.testing import assert_array_almost_equal
#from numpy.testing import dec
#from numpy.testing.noseclasses import KnownFailureTest
from theano.tensor.blas import (_dot22, _dot22scalar, res_is_a, _as_scalar,
                                _is_real_matrix, _gemm_canonicalize,
                                _factor_canonicalized, Gemm, Gemv,
@@ -479,7 +478,7 @@ def just_gemm(i, o, ishapes=[(4, 3), (3, 5), (4, 5), (), ()],
                       on_unused_input='ignore')
    nb_gemm = 0
    for node in f.maker.fgraph.apply_nodes:
-        if node.op == T.dot:
+        if isinstance(node.op, T.Dot):
            raise Failure('dot not changed to gemm_inplace in graph')
        if node.op == _dot22:
            raise Failure('_dot22 not changed to gemm_inplace in graph')
@@ -562,7 +561,7 @@ def test_gemm_opt_double_gemm():
    f = inplace_func([Param(ii, mutable=True) for ii in i], o,
                     mode='FAST_RUN', on_unused_input='ignore')
    for node in f.maker.fgraph.apply_nodes:
-        if node.op == T.dot:
+        if isinstance(node.op, T.Dot):
            raise Failure('dot in graph')
        if node.op == _dot22:
            raise Failure('_dot22 in graph')
@@ -857,7 +856,9 @@ def test_dot22():
            if dtype1 == dtype2:
                assert _dot22 in [x.op for x in topo], (dtype1, dtype2)
            else:
-                assert T.dot in [x.op for x in topo], (dtype1, dtype2)
+                check = [isinstance(x.op, T.Dot) for x in topo]
+                from theano.gof.python25 import any
+                assert any(check), (dtype1, dtype2)
    rng = numpy.random.RandomState(unittest_tools.fetch_seed())

    def cmp(a_shp, b_shp):
@@ -919,8 +920,8 @@ def test_dot22scalar():
                assert _dot22 in ops, (dtype1, dtype2,
                                       dtype3, dtype4)
            else:
-                assert T.dot in ops, (dtype1, dtype2,
-                                      dtype3, dtype4)
+                check = [isinstance(o, T.Dot) for o in ops]
+                assert any(check), (dtype1, dtype2, dtype3, dtype4)

    def cmp(a_shp, b_shp, c_shp, sqr_shp=(5, 5)):
        av = rng.uniform(size=a_shp).astype(dtype1)
......