Commit d13064f7, authored Jan 25, 2013 by lamblin

Merge pull request #986 from jlowin/dot_test

Ready for merge: Streamline dot & tensordot / Allow dot products of n-dimensional variables

Parents: 9880362c, 56bf2c46

Showing 11 changed files with 523 additions and 692 deletions (+523 −692).
doc/library/tensor/basic.txt                   +86   -11
theano/gof/tests/test_compute_test_value.py     +2    -2
theano/sandbox/cuda/basic_ops.py                +0   -48
theano/sandbox/cuda/opt.py                      +0   -29
theano/sandbox/cuda/tests/test_basic_ops.py     +0   -48
theano/sandbox/cuda/tests/test_opt.py           +0   -32
theano/tensor/basic.py                        +286  -250
theano/tensor/blas.py                           +2    -2
theano/tensor/opt.py                            +2    -1
theano/tensor/tests/test_basic.py             +138  -263
theano/tensor/tests/test_blas.py                +7    -6
doc/library/tensor/basic.txt
@@ -1202,19 +1202,94 @@ Linear Algebra

     :return: vector-vector outer product

-.. function:: tensordot(X, Y, axes=2)
+.. function:: tensordot(a, b, axes=2)

-    This is a symbolic standing for ``numpy.tensordot``.
+    Given two tensors a and b, tensordot computes a generalized dot product over
+    the provided axes. Theano's implementation reduces all expressions to
+    matrix or vector dot products and is based on code from Tijmen Tieleman's
+    gnumpy (http://www.cs.toronto.edu/~tijmen/gnumpy.html).

-    :param X: left term
-    :param Y: right term
-    :param axes: sum out these axes from X and Y.
-    :type X: symbolic tensor
-    :type Y: symbolic tensor
-    :type axes: see numpy.tensordot
-    :rtype: symbolic tensor
-    :return: tensor product
+    :param a: the first tensor variable
+    :type a: symbolic tensor
+    :param b: the second tensor variable
+    :type b: symbolic tensor
+    :param axes: an integer or array. If an integer, the number of axes
+                 to sum over. If an array, it must have two array
+                 elements containing the axes to sum over in each tensor.
+
+                 Note that the default value of 2 is not guaranteed to work
+                 for all values of a and b, and an error will be raised if
+                 that is the case. The reason for keeping the default is to
+                 maintain the same signature as numpy's tensordot function
+                 (and np.tensordot raises analogous errors for non-compatible
+                 inputs).
+
+                 If an integer i, it is converted to an array containing
+                 the last i dimensions of the first tensor and the first
+                 i dimensions of the second tensor:
+                     axes = [range(a.ndim - i, b.ndim), range(i)]
+
+                 If an array, its two elements must contain compatible axes
+                 of the two tensors. For example, [[1, 2], [2, 0]] means sum
+                 over the 2nd and 3rd axes of a and the 3rd and 1st axes of b.
+                 (Remember axes are zero-indexed!) The 2nd axis of a and the
+                 3rd axis of b must have the same shape; the same is true for
+                 the 3rd axis of a and the 1st axis of b.
+    :type axes: int or array-like of length 2
+    :returns: a tensor with shape equal to the concatenation of a's shape
+              (less any dimensions that were summed over) and b's shape
+              (less any dimensions that were summed over).
+    :rtype: symbolic tensor
+
+    It may be helpful to consider an example to see what tensordot does.
+    Theano's implementation is identical to NumPy's. Here a has shape (2, 3, 4)
+    and b has shape (5, 6, 4, 3). The axes to sum over are [[1, 2], [3, 2]] --
+    note that a.shape[1] == b.shape[3] and a.shape[2] == b.shape[2]; these axes
+    are compatible. The resulting tensor will have shape (2, 5, 6) -- the
+    dimensions that are not being summed:
+
+        a = np.random.random((2,3,4))
+        b = np.random.random((5,6,4,3))
+
+        #tensordot
+        c = np.tensordot(a, b, [[1,2],[3,2]])
+
+        #loop replicating tensordot
+        a0, a1, a2 = a.shape
+        b0, b1, _, _ = b.shape
+        cloop = np.zeros((a0,b0,b1))
+
+        #loop over non-summed indices -- these exist
+        #in the tensor product.
+        for i in range(a0):
+            for j in range(b0):
+                for k in range(b1):
+                    #loop over summed indices -- these don't exist
+                    #in the tensor product.
+                    for l in range(a1):
+                        for m in range(a2):
+                            cloop[i,j,k] += a[i,l,m] * b[j,k,m,l]
+
+        np.allclose(c, cloop) #true
+
+    This specific implementation avoids a loop by transposing a and b such that
+    the summed axes of a are last and the summed axes of b are first. The
+    resulting arrays are reshaped to 2 dimensions (or left as vectors, if
+    appropriate) and a matrix or vector dot product is taken. The result is
+    reshaped back to the required output dimensions.
+
+    In an extreme case, no axes may be specified. The resulting tensor
+    will have shape equal to the concatenation of the shapes of a and b:
+
+        c = np.tensordot(a, b, 0)
+        print(a.shape) #(2,3,4)
+        print(b.shape) #(5,6,4,3)
+        print(c.shape) #(2,3,4,5,6,4,3)
+
+    See the documentation of numpy.tensordot for more examples.

 .. function:: batched_dot(X, Y)
...
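To see the documented call from the symbolic side, here is a minimal sketch (assuming a standard Theano install with this interface; the shapes match the docstring's example):

    import numpy as np
    import theano
    import theano.tensor as T

    # symbolic inputs matching the documented shapes
    a = T.tensor3('a')   # will hold shape (2, 3, 4)
    b = T.tensor4('b')   # will hold shape (5, 6, 4, 3)
    c = T.tensordot(a, b, axes=[[1, 2], [3, 2]])

    f = theano.function([a, b], c)
    out = f(np.random.random((2, 3, 4)).astype(theano.config.floatX),
            np.random.random((5, 6, 4, 3)).astype(theano.config.floatX))
    print(out.shape)     # (2, 5, 6)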
theano/gof/tests/test_compute_test_value.py
@@ -243,8 +243,8 @@ class TestComputeTestValue(unittest.TestCase):

         except ValueError, e:
             # Get traceback
             tb = sys.exc_info()[2]
-            # Get frame info 3 layers up
-            frame_info = traceback.extract_tb(tb)[-4]
+            # Get frame info 4 layers up
+            frame_info = traceback.extract_tb(tb)[-5]
             # We should be in the "fx" function defined above
             assert os.path.split(frame_info[0])[1] == 'test_compute_test_value.py'
             assert frame_info[2] == 'fx'
...
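The negative index above selects how far the target frame sits from the point where the exception was raised. A standalone sketch of that mechanism (the function names here are illustrative, not the test's):

    import sys
    import traceback

    def inner():
        raise ValueError("boom")

    def outer():
        inner()

    try:
        outer()
    except ValueError:
        tb = sys.exc_info()[2]
        frames = traceback.extract_tb(tb)
        # frames[-1] is where the error was raised;
        # more-negative indices walk up toward the original call site.
        print(frames[-1][2])  # 'inner'
        print(frames[-2][2])  # 'outer'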
theano/sandbox/cuda/basic_ops.py
@@ -2845,54 +2845,6 @@ class GpuContiguous(GpuOp):

 gpu_contiguous = GpuContiguous()

-def tensordot(a, b, axes=2):
-    """
-    Implementation of tensordot that reduces to a regular matrix product.
-    This allows tensordot to be GPU accelerated, which isn't possible
-    with the default Theano implementation (which is just a wrapper
-    around numpy.tensordot). based on code from Tijmen Tieleman's gnumpy
-    http://www.cs.toronto.edu/~tijmen/gnumpy.html
-    """
-    if numpy.isscalar(axes):
-        # if 'axes' is a number of axes to multiply and sum over (trailing axes
-        # of a, leading axes of b), we can just reshape and use dot.
-        outshape = tensor.concatenate([a.shape[:a.ndim - axes], b.shape[axes:]])
-        outndim = a.ndim + b.ndim - (2 * axes)
-        a_reshaped = a.reshape((tensor.prod(a.shape[:a.ndim - axes]),
-                                tensor.prod(a.shape[a.ndim - axes:])))
-        b_reshaped = b.reshape((tensor.prod(b.shape[:axes]),
-                                tensor.prod(b.shape[axes:])))
-        assert a_reshaped.ndim == 2
-        assert b_reshaped.ndim == 2
-        # We use _dot22 here because:
-        # - we know that the number of dimensions will be 2
-        # - it makes it possible for the computation to be moved to GPU
-        # When cuda.opt.local_gpu_tensordot is applied, it is too late
-        # for the usual blas optimizations to take place.
-        # This will change if we decide to get rid of tensor.tensordot,
-        # and always use this version.
-        return tensor.blas._dot22(a_reshaped, b_reshaped).reshape(outshape,
-                                                                  ndim=outndim)
-    elif len(axes) == 2:
-        # if 'axes' is a pair of axis lists, we first shuffle the axes of a and
-        # b to reduce this to the first case (note the recursion).
-        a_other, b_other = tuple(axes[0]), tuple(axes[1])
-        num_axes = len(a_other)
-        a_order = (tuple(x for x in tuple(xrange(a.ndim)) if x not in a_other)
-                   + a_other)
-        b_order = (b_other
-                   + tuple(x for x in tuple(xrange(b.ndim)) if x not in b_other))
-        a_shuffled = a.dimshuffle(a_order)
-        b_shuffled = b.dimshuffle(b_order)
-        return tensordot(a_shuffled, b_shuffled, num_axes)
-    else:
-        raise ValueError("Axes should be scalar valued or a list/tuple of len 2.",
-                         axes)

 # Those are predifined CudaNdarrayType as done in tensor.basic
 # Useful mostly for test as the gpu op are inserted automatically...
 def scalar(name=None, dtype=None):
...
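The reduction used by the removed function above is worth seeing numerically. A NumPy-only sketch (shapes are illustrative): tensordot over the trailing `axes` dims of a and the leading `axes` dims of b collapses to a single matrix product.

    import numpy as np

    a = np.random.random((2, 3, 4))
    b = np.random.random((3, 4, 5))
    axes = 2

    # flatten the non-summed and summed dims of each operand
    a2 = a.reshape(int(np.prod(a.shape[:a.ndim - axes])),
                   int(np.prod(a.shape[a.ndim - axes:])))
    b2 = b.reshape(int(np.prod(b.shape[:axes])),
                   int(np.prod(b.shape[axes:])))

    # one matmul, then restore the output shape
    out = a2.dot(b2).reshape(a.shape[:a.ndim - axes] + b.shape[axes:])

    assert np.allclose(out, np.tensordot(a, b, axes))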
theano/sandbox/cuda/opt.py
@@ -891,35 +891,6 @@ def local_gpu_print_op(node):

     return False

-@register_opt()
-@local_optimizer([tensor.TensorDot])
-def local_gpu_tensordot(node):
-    '''
-    T.tensordot(host_from_gpu) -> basic_ops.tensordot(host_from_gpu)
-
-    There is no Cuda Op for tensordot, however we can build a chain of
-    CPU Ops implementing tensordot. These Ops all have a GPU equivalent.
-
-    Note: applying this optimization at that stage is not ideal, because
-    all blas-related optimizations have already been applied.
-    However, if we want to apply it before the blas optimizations, then
-    we don't know which variables may end up on the GPU or not.
-    '''
-    if (isinstance(node.op, tensor.TensorDot) and
-        node.outputs[0].dtype == 'float32'):
-        x, y = node.inputs
-        if ((x.owner and x.owner.op == host_from_gpu and
-             y.dtype == 'float32')
-            or
-            (y.owner and y.owner.op == host_from_gpu and
-             x.dtype == 'float32')):
-            axes = node.op.axes
-            out = tensordot(x, y, axes=axes)
-            return [out]

 def cast(x, dtype):
     stype = scal.Scalar(dtype)
     cast_op = theano.tensor.Elemwise(scal.Identity(scal.specific_out(stype)))
...
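The `x.owner and x.owner.op == host_from_gpu` test above is the standard Theano graph-pattern check: a variable produced by an op records that op on its `owner` field. A minimal sketch of the idea (no GPU needed; the printed op name is illustrative):

    import theano.tensor as T

    x = T.matrix('x')
    y = x.sum()

    print(y.owner.op)   # the reduction op that produced y, e.g. Sum{...}
    print(x.owner)      # None -- graph inputs have no owner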
theano/sandbox/cuda/tests/test_basic_ops.py
@@ -1129,54 +1129,6 @@ def test_shared_cudandarray():

     assert isinstance(a.type, tcn.CudaNdarrayType)

-class test_tensordot_reshape(unittest.TestCase):
-    '''Test alternative tensordot implementation.
-
-    Test that the tensordot implementation using dimshuffle, reshape and dot
-    gives the same results as the default (numpy) version.
-    '''
-    def setUp(self):
-        self.rng = numpy.random.RandomState(utt.fetch_seed())
-
-    def test1(self):
-        # define some tensors
-        tensor1 = self.rng.rand(20, 10, 5, 8).astype(theano.config.floatX)
-        tensor2 = self.rng.rand(5, 8, 20).astype(theano.config.floatX)
-        tensor3 = self.rng.rand(8, 20, 5).astype(theano.config.floatX)
-
-        x = T.tensor4('x')
-        y = T.tensor3('y')
-
-        # case 1: number of axes to sum over
-        default1 = theano.function([x, y], T.tensordot(x, y, 2))(tensor1, tensor2)
-        reshape1 = theano.function([x, y], B.tensordot(x, y, 2))(tensor1, tensor2)
-        assert numpy.allclose(default1, reshape1)
-
-        # case 2: axis pairs
-        default2 = theano.function([x, y], T.tensordot(x, y,
-            axes=[(0, 3), (1, 0)]))(tensor1, tensor3)
-        reshape2 = theano.function([x, y], B.tensordot(x, y,
-            axes=[(0, 3), (1, 0)]))(tensor1, tensor3)
-        assert numpy.allclose(default2, reshape2)
-
-        default3 = theano.function([x, y], T.tensordot(x, y,
-            axes=[(0, 3, 2), (1, 0, 2)]))(tensor1, tensor3)
-        reshape3 = theano.function([x, y], B.tensordot(x, y,
-            axes=[(0, 3, 2), (1, 0, 2)]))(tensor1, tensor3)
-        assert numpy.allclose(default3, reshape3)

 class test_size(unittest.TestCase):
     """
...
theano/sandbox/cuda/tests/test_opt.py
@@ -333,38 +333,6 @@ def test_elemwise_fusion():

     theano._asarray(numpy.random.rand(*shape), dtype='float32'))

-class test_local_gpu_tensordot(unittest.TestCase):
-    def setUp(self):
-        self.rng = numpy.random.RandomState(utt.fetch_seed())
-
-    def test_transfer(self):
-        tensor1 = self.rng.rand(20, 10, 5, 8).astype('float32')
-        tensor2 = self.rng.rand(5, 8, 20).astype('float32')
-        tensor3 = self.rng.rand(8, 20, 5).astype('float32')
-
-        x = tensor.ftensor4('x')
-        y = tensor.ftensor3('y')
-
-        tdot1 = tensor.tensordot(x, y, 2)
-        f1 = theano.function([x, y], tdot1, mode=mode_with_gpu)
-        topo1 = f1.maker.fgraph.toposort()
-        assert topo1[-1].op == cuda.host_from_gpu
-        # Let DebugMode debug
-        f1(tensor1, tensor2)
-
-        tdot2 = tensor.tensordot(x, y, axes=[(0, 3), (1, 0)])
-        f2 = theano.function([x, y], tdot2, mode=mode_with_gpu)
-        topo2 = f2.maker.fgraph.toposort()
-        assert topo2[-1].op == cuda.host_from_gpu
-        f2(tensor1, tensor3)
-
-        tdot3 = tensor.tensordot(x, y, axes=[(0, 3, 2), (1, 0, 2)])
-        f3 = theano.function([x, y], tdot3, mode=mode_with_gpu)
-        topo3 = f3.maker.fgraph.toposort()
-        assert topo3[-1].op == cuda.host_from_gpu
-        f3(tensor1, tensor3)

 import theano.tests.test_ifelse
...
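The `toposort()` check pattern from the removed test can be exercised without a GPU; a minimal CPU-only sketch (the op name in the comment is illustrative of a typical optimized graph):

    import theano
    import theano.tensor as T

    x = T.fmatrix('x')
    y = T.fmatrix('y')
    f = theano.function([x, y], T.dot(x, y))

    # inspect which ops survived optimization, in execution order
    topo = f.maker.fgraph.toposort()
    print([type(node.op).__name__ for node in topo])  # e.g. ['Dot22']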
theano/tensor/basic.py
@@ -6896,14 +6896,19 @@ def take(a, indices, axis=None, mode='raise'):

 class Dot(Op):
-    """Compute matrix-matrix, matrix-vector products and vector inner-products.
-
-    :note: matrix-matrix products are sometimes optimized to Dot22 or Gemm ops.
-        (see tensor.blas)
-
-    :note: non matrix-matrix products (including matrix-vector
-        products) are handled by numpy. Ensure that you have linked numpy
-        with a fast BLAS.
+    """
+    Computes the dot product of two variables. For two matrices, this is
+    equivalent to matrix multiplication. For two vectors, this is the inner
+    product.
+
+    :note: matrix-matrix products are sometimes optimized to Dot22 ops
+        (see tensor.blas)
+
+    :note: vector-vector products are sometimes optimized to Ger or CGer. (see
+        tensor.blas)
+
+    :note: matrix-vector products are sometimes optimized to Gemv, CGemv (see
+        tensor.blas)
     """
...
@@ -6919,51 +6924,27 @@ class Dot(Op):
     def make_node(self, *inputs):
         inputs = map(as_tensor_variable, inputs)
-        numpy_semantics = 0
-        if numpy_semantics:
-            # numpy defines dot for tensor pairs with any rank
-            if len(inputs) != 2:
-                raise TypeError(
-                    "Wrong number of inputs for %s (got %i, expected 2)" %
-                    self)
-            i_broadcastables = [input.type.broadcastable for input in inputs]
-            bx, by = i_broadcastables
-            if len(bx) == 0:     # x is a scalar
-                bz = by
-            else:
-                if len(by) >= 2:  # y is a matrix or tensor
-                    bz = bx[:-1] + by[:-2] + by[-1:]
-                elif len(by) == 1:  # y is vector
-                    bz = bx[:-1]
-                else:  # y is a scalar
-                    bz = bx
-        else:
-            if len(inputs) != 2:
-                raise TypeError(
-                    'theanor.tensor.Dot: 2 arguments required, %d given ' %
-                    len(inputs))
-            x, y = inputs
-            nx = x.type.ndim
-            ny = y.type.ndim
-            if nx not in (1, 2):
-                raise TypeError(
-                    ('dot supports matrix and vector args: email theano-dev '
-                     'about enabling numpy dot semantics if you want them'),
-                    x)
-            if ny not in (1, 2):
-                raise TypeError(
-                    ('dot supports matrix and vector args: email theano-dev '
-                     'about enabling numpy dot semantics if you want them'),
-                    y)
-            if nx == 2 and ny == 2:
-                bz = [x.type.broadcastable[0], y.type.broadcastable[1]]
-            elif nx == 1 and ny == 2:
-                bz = [y.type.broadcastable[1]]
-            elif nx == 2 and ny == 1:
-                bz = [x.type.broadcastable[0]]
-            else:
-                bz = []
+
+        if len(inputs) != 2:
+            raise TypeError(
+                'theano.tensor.Dot: 2 arguments required, %d given ' %
+                len(inputs))
+        if inputs[0].ndim not in (1, 2):
+            raise TypeError(
+                'theano.tensor.Dot: input 0 (0-indexed) must have ndim of '
+                '1 or 2, %d given. Consider calling theano.tensor.dot '
+                'instead.' % inputs[0].ndim)
+        if inputs[1].ndim not in (1, 2):
+            raise TypeError(
+                'theano.tensor.Dot: input 1 (0-indexed) must have ndim of '
+                '1 or 2, %d given. Consider calling theano.tensor.dot '
+                'instead.' % inputs[1].ndim)
+
+        i_broadcastables = [input.type.broadcastable for input in inputs]
+        bx, by = i_broadcastables
+        if len(by) == 2:  # y is a matrix
+            bz = bx[:-1] + by[-1:]
+        elif len(by) == 1:  # y is vector
+            bz = bx[:-1]

         i_dtypes = [input.type.dtype for input in inputs]
         outputs = [tensor(scal.upcast(*i_dtypes), bz)]
...
@@ -6995,14 +6976,29 @@ class Dot(Op):
         x, y = inp
         gz, = grads
-        if gz.type.ndim == 0:
-            rval = gz * y, gz * x
-        elif x.type.ndim == 1 and y.type.ndim > 1:
-            rval = dot(gz, y.T), outer(x.T, gz)
-        elif x.type.ndim > 1 and y.type.ndim == 1:
-            rval = outer(gz, y.T), dot(x.T, gz)
-        else:
-            rval = dot(gz, y.T), dot(x.T, gz)
+        xdim, ydim, gdim = x.type.ndim, y.type.ndim, gz.type.ndim
+
+        #grad is scalar, so x is vector and y is vector
+        if gdim == 0:
+            xgrad = gz * y
+            ygrad = gz * x
+
+        #x is vector, y is matrix, grad is vector
+        elif xdim == 1 and ydim == 2:
+            xgrad = dot(gz, y.T)
+            ygrad = outer(x.T, gz)
+
+        #x is matrix, y is vector, grad is vector
+        elif xdim == 2 and ydim == 1:
+            xgrad = outer(gz, y.T)
+            ygrad = dot(x.T, gz)
+
+        #x is matrix, y is matrix, grad is matrix
+        elif xdim == ydim == 2:
+            xgrad = dot(gz, y.T)
+            ygrad = dot(x.T, gz)
+
+        rval = xgrad, ygrad

         for elem in rval:
             assert elem.dtype.find('float') != -1
...
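The matrix-matrix branch above encodes the usual result: for z = x.dot(y) with upstream gradient gz, the gradients are gz.dot(y.T) and x.T.dot(gz). A NumPy sketch with illustrative shapes:

    import numpy as np

    x = np.random.random((4, 5))
    y = np.random.random((5, 3))
    gz = np.random.random((4, 3))   # same shape as x.dot(y)

    xgrad = gz.dot(y.T)             # shape (4, 5), matches x
    ygrad = x.T.dot(gz)             # shape (5, 3), matches y
    assert xgrad.shape == x.shape and ygrad.shape == y.shape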
@@ -7070,224 +7066,264 @@ class Dot(Op):

     def infer_shape(self, node, shapes):
         xshp, yshp = shapes
         x, y = node.inputs
-        if x.ndim == 2 and y.ndim == 2:
-            return [(xshp[0], yshp[1])]
-        if x.ndim == 1 and y.ndim == 2:
-            return [(yshp[1],)]
-        if x.ndim == 2 and y.ndim == 1:
-            return [(xshp[0],)]
+
+        # vector / vector
         if x.ndim == 1 and y.ndim == 1:
             return [()]
+        # matrix / vector
+        if x.ndim == 2 and y.ndim == 1:
+            return [xshp[:-1]]
+        # vector / matrix
+        if x.ndim == 1 and y.ndim == 2:
+            return [yshp[-1:]]
+        # matrix / matrix
+        if x.ndim == 2 and y.ndim == 2:
+            return [xshp[:-1] + yshp[-1:]]
         raise NotImplementedError()

     def __str__(self):
         return "dot"

-dot = Dot()
-pprint.assign(dot, printing.OperatorPrinter(printing.special['middle_dot'],
-              -1, 'left'))
+_dot = Dot()
+pprint.assign(_dot, printing.OperatorPrinter(printing.special['middle_dot'],
+              -1, 'left'))
+
+def dot(a, b):
+    """
+    Computes the dot product of two variables. For two matrices, this is
+    equivalent to matrix multiplication. For two vectors, this is the inner
+    product. When one variable is a scalar, this is like elementwise
+    multiplication. For N dimensions, this is a sum product over the last axis
+    of the first array and the second-to-last axis of the second array:
+
+        dot(a, b)[i,j,k,m] = sum(a[i,j,:] * b[k,:,m])
+
+    Note that this dot function does one of three things, in the following
+    sequence:
+
+        1.  If either a or b is scalar, it returns the elementwise product
+            without calling the Theano Dot op.
+
+        2.  If either a or b has more than 2 dimensions, it calls Theano's
+            tensordot function with appropriate axes. The tensordot function
+            expresses high-dimensional dot products in terms of 2D matrix
+            multiplications, so it may be possible to further optimize for
+            performance.
+
+        3.  If both a and b have either 1 or 2 dimensions, it calls Theano's
+            Dot op on a and b.
+
+    :note: matrix-matrix products are sometimes optimized to Dot22 or Gemm ops.
+        (see tensor.blas)
+
+    :note: vector-vector products are sometimes optimized to Ger or CGer. (see
+        tensor.blas)
+
+    :note: matrix-vector products are sometimes optimized to Gemv, CGemv (see
+        tensor.blas)
+    """
+    a, b = as_tensor_variable(a), as_tensor_variable(b)
+
+    if a.ndim == 0 or b.ndim == 0:
+        return a * b
+    elif a.ndim > 2 or b.ndim > 2:
+        return tensordot(a, b, [[a.ndim - 1], [numpy.maximum(0, b.ndim - 2)]])
+    else:
+        return _dot(a, b)

-#########################
-# Linalg : TensorDot
-#########################
-class TensorDotGrad(Op):
-    def __init__(self, axes):
-        self.axes = TensorDot.parse_axes(axes)
-        if isinstance(self.axes, (tuple, list)) and len(self.axes) == 2:
-            # The current perform don't implement correctly those cases
-            for i in range(len(self.axes[0]) - 1):
-                if self.axes[0][i] > self.axes[0][i + 1]:
-                    raise NotImplementedError()
-                if self.axes[1][i] > self.axes[1][i + 1]:
-                    raise NotImplementedError()
-
-    def __eq__(self, other):
-        return type(self) == type(other) and self.axes == other.axes
-
-    def __hash__(self):
-        return hashtype(self) ^ hash(self.axes) ^ 89234
-
-    def make_node(self, x, y, gz):
-        assert isinstance(x, Variable)
-        assert isinstance(y, Variable)
-        assert isinstance(gz, Variable)
-        gx = tensor(dtype=scal.upcast(gz.dtype, y.dtype),
-                    broadcastable=x.broadcastable)
-        gy = tensor(dtype=scal.upcast(x.dtype, gz.dtype),
-                    broadcastable=y.broadcastable)
-        op = self
-        if isinstance(self.axes, int):
-            axes = [range(x.ndim - self.axes, x.ndim), range(self.axes)]
-            op = TensorDotGrad(axes)
-        return Apply(op, [x, y, gz], [gx, gy])
-
-    def perform(self, node, inp, out):
-        x, y, gz = inp
-        gx, gy = out
-        sum_over_y = range(y.ndim)
-        [sum_over_y.remove(q) for q in self.axes[1]]
-        sum_over_x = range(x.ndim)
-        [sum_over_x.remove(q) for q in self.axes[0]]
-        tdot_axes = [range(x.ndim - len(self.axes[0]), gz.ndim), sum_over_y]
-        _gx = numpy.tensordot(gz, y, tdot_axes)
-        idx = numpy.hstack((sum_over_x, self.axes[0]))
-        newshapex = numpy.zeros(x.ndim)
-        newshapex[[newpos for newpos in idx]] = range(x.ndim)
-        gx[0] = numpy.transpose(_gx, newshapex)
-        tdot_axes = [sum_over_x, range(x.ndim - len(self.axes[0]))]
-        _gy = numpy.tensordot(x, gz, tdot_axes)
-        idy = numpy.hstack((self.axes[1], sum_over_y))
-        newshapey = numpy.zeros(y.ndim)
-        newshapey[[newpos for newpos in idy]] = range(y.ndim)
-        gy[0] = numpy.transpose(_gy, newshapey)
-        assert gy[0].shape == y.shape
-        assert gx[0].shape == x.shape
-
-    def infer_shape(self, node, in_shapes):
-        return in_shapes[:2]
-
-tensordot_grad = TensorDotGrad
-
-class TensorDot(Op):
-    """Compute tensor-tensor products over the given axes.
-    See numpy documentation for details.
-    (http://docs.scipy.org/doc/numpy/reference/generated/numpy.tensordot.html)
-    """
-
-    @classmethod
-    def parse_axes(cls, axes):
-        if not numpy.isscalar(axes) and len(axes) != 2:
-            raise ValueError("Axes should be scalar valued or a list/tuple of "
-                             "len 2.")
-        if isinstance(axes, (list, tuple)):
-            axes_out = []
-            # cast axes[0] and axes[1] to tuples
-            for i, a in enumerate(axes):
-                if numpy.isscalar(a):
-                    axes_out.append((a,))
-                else:
-                    axes_out.append(tuple(a))
-            # these should be of same length
-            if len(axes_out[0]) != len(axes_out[1]):
-                raise ValueError("Elements of the axes list/tuple need to be "
-                                 "of the same size.")
-            axes = tuple(axes_out)
-        return axes
-
-    def __init__(self, axes):
-        self.axes = self.parse_axes(axes)
-
-    def __eq__(self, other):
-        return type(self) == type(other) and self.axes == other.axes
-
-    def __hash__(self):
-        return hashtype(self) ^ hash(self.axes) ^ 89234
-
-    def make_node(self, x, y):
-        op = self
-        if isinstance(self.axes, int):
-            axes = [range(x.ndim - self.axes, x.ndim), range(self.axes)]
-            op = TensorDot(axes)
-        axesdim = numpy.size(op.axes) / 2
-
-        x, y = map(as_tensor_variable, [x, y])
-
-        if axesdim > x.type.ndim or axesdim > y.type.ndim:
-            raise TypeError('Cannot sum over more dimensions than input. '
-                            '%i > %i, %i' %
-                            (axesdim, x.type.ndim, y.type.ndim))
-        outdim = x.type.ndim + y.type.ndim - 2 * axesdim
-        output = tensor(dtype=scal.upcast(x.dtype, y.dtype),
-                        broadcastable=[False] * outdim)
-        return Apply(op, inputs=[x, y], outputs=[output, ])
-
-    def perform(self, node, inp, out):
-        x, y = inp
-        z, = out
-        try:
-            z[0] = numpy.asarray(numpy.tensordot(x, y, self.axes))
-        except ValueError, e:
-            # The error raised by numpy has no shape information, we mean to
-            # add that.
-            e.args = e.args + (x.shape, y.shape, self.axes)
-            raise
-
-    def infer_shape(self, node, in_shapes):
-        shape_x, shape_y = in_shapes
-        out_shape = []
-        if isinstance(self.axes, (list, tuple)):
-            iter = (i for i in range(len(shape_x)) if i not in self.axes[0])
-            for i in iter:
-                out_shape.append(shape_x[i])
-            iter = (i for i in range(len(shape_y)) if i not in self.axes[1])
-            for i in iter:
-                out_shape.append(shape_y[i])
-        else:
-            out_shape = list(shape_x)[shape_x.ndim - self.axes] + \
-                list(shape_y)[shape_y.ndim - self.axes, shape_y.ndim]
-        return [out_shape]
-
-    def grad(self, inp, grads):
-        x, y = inp
-        gz, = grads
-        gx, gy = tensordot_grad(self.axes)(x, y, gz)
-        return [gx, gy]
-
-    def __str__(self):
-        return "tensordot"
-
-def tensordot(x, y=None, axes=2):
-    if y is None:
-        raise NotImplementedError(
-            'The interface to tensordot has changed from '
-            'tensor.tensordot(axes)(x,y) to tensor.tensordot(x,y,axes). '
-            'Please modify your code accordingly.')
-
-    if x.ndim == 0 or y.ndim == 0:
-        raise ValueError('Cannot perform tensordot of 0-d inputs.')
-
-    axes = TensorDot.parse_axes(axes)
-
-    # check whether axes is valid given the dimensions of x and y
-    if numpy.isscalar(axes):
-        if axes >= x.ndim or axes >= y.ndim:
-            raise ValueError('axes should be smaller than the dimension of ' \
-                             'x and y (x.ndim=%i, y.ndim=%i)' %
-                             (x.ndim, y.ndim))
-    elif isinstance(axes, (list, tuple)):
-        if isinstance(axes[0], (list, tuple)) and \
-           (len(axes[0]) > x.ndim or (numpy.array(axes[0]) >= x.ndim).any()):
-            raise ValueError('axes[0] should be array_like, of length smaller' \
-                             ' than the dimension of x (x.ndim=%i, len(axes[0])=%i).' %
-                             (x.ndim, len(axes[0])))
-        if isinstance(axes[1], (list, tuple)) and \
-           (len(axes[1]) > y.ndim or (numpy.array(axes[1]) >= y.ndim).any()):
-            raise ValueError('axes[1] should be array_like, of length smaller' \
-                             'than the dimension of y (y.ndim=%i, len(axes[1])=%i).' %
-                             (y.ndim, len(axes[1])))
-
-    if not hasattr(tensordot, 'op'):
-        tensordot.op = {}
-    if axes not in tensordot.op:
-        tensordot.op[axes] = TensorDot(axes)
-    return tensordot.op[axes](x, y)
-    # TODO: tensordot should be function as described in rst docs.
+
+#########################
+# Linalg : TensorDot
+#########################
+
+def tensordot(a, b, axes=2):
+    """
+    [docstring identical to the tensordot entry reproduced from
+    doc/library/tensor/basic.txt above]
+    """
+    a, b = as_tensor_variable(a), as_tensor_variable(b)
+
+    # axes must be a scalar or list/tuple of length 2
+    if not numpy.isscalar(axes) and len(axes) != 2:
+        raise ValueError('Axes should be an integer or a '
+                         'list/tuple of len 2 (%s was provided)' % repr(axes))
+
+    # if 'axes' is a number of axes to multiply and sum over (trailing axes
+    # of a, leading axes of b), we can just reshape and use dot.
+    elif numpy.isscalar(axes):
+        axes = int(axes)
+
+        # check if axes is valid given the dimension of a and b
+        if axes > a.ndim:
+            raise ValueError('axes can not be larger than the dimension of '
+                             'a (a.ndim=%i, axes=%i)' % (a.ndim, axes))
+        if axes > b.ndim:
+            raise ValueError('axes can not be larger than than the dimension '
+                             'of b (b.ndim=%i, axes=%i)' % (b.ndim, axes))
+
+        outshape = concatenate([a.shape[:a.ndim - axes], b.shape[axes:]])
+        outndim = a.ndim + b.ndim - (2 * axes)
+
+        a_shape_0 = b_shape_0 = a_shape_1 = b_shape_1 = 1
+        for s0 in range(a.ndim - axes):
+            a_shape_0 *= a.shape[s0]
+        for s0 in range(axes):
+            b_shape_0 *= b.shape[s0]
+        for s1 in range(a.ndim - axes, a.ndim):
+            a_shape_1 *= a.shape[s1]
+        for s1 in range(axes, b.ndim):
+            b_shape_1 *= b.shape[s1]
+
+        a_reshaped = a.reshape((a_shape_0, a_shape_1), ndim=2)
+        b_reshaped = b.reshape((b_shape_0, b_shape_1), ndim=2)
+
+        return _dot(a_reshaped, b_reshaped).reshape(outshape, outndim)
+
+    # if 'axes' is a list, transpose a and b such that the summed axes of a
+    # are last and the summed axes of b are first.
+    else:
+        #get first axis element as a tuple
+        try:
+            a_axes = tuple(axes[0])
+        except TypeError:
+            a_axes = tuple([axes[0]])
+
+        #get second axis element as a tuple
+        try:
+            b_axes = tuple(axes[1])
+        except TypeError:
+            b_axes = tuple([axes[1]])
+
+        # the two axes lists must have the same length
+        if len(a_axes) != len(b_axes):
+            raise ValueError('Axes elements must have the same length.')
+
+        # check that there aren't more axes than a has dimensions
+        if len(a_axes) > a.ndim:
+            raise ValueError('axes[0] should be array_like with length '
+                             'less than the dimensions of a '
+                             '(a.ndim=%i, len(axes[0])=%i).' %
+                             (a.ndim, len(a_axes)))
+
+        # check that a_axes doesn't contain an axis greater than or equal to
+        # a's dimensions. also check if len > 0 so numpy.max won't raise an
+        # error.
+        if len(a_axes) > 0 and numpy.max(numpy.array(a_axes)) >= a.ndim:
+            raise ValueError('axes[0] contains dimensions greater than or '
+                             'equal to a.ndim (a.ndim=%i, max(axes[0])=%i).' %
+                             (a.ndim, numpy.max(numpy.array(a_axes))))
+
+        # check that there aren't more axes than b has dimensions
+        if len(b_axes) > b.ndim:
+            raise ValueError('axes[1] should be array_like, of length '
+                             'smaller than the dimension of b '
+                             '(a.ndim=%i, len(axes[0])=%i).' %
+                             (b.ndim, len(b_axes)))
+
+        # check that b_axes doesn't contain an axis greater than or equal to
+        # b's dimensions. also check if len > 0 so numpy.max won't raise an
+        # error.
+        if len(b_axes) > 0 and numpy.max(numpy.array(b_axes)) >= b.ndim:
+            raise ValueError('axes[1] contains dimensions greater than or '
+                             'equal to b.ndim (b.ndim=%i, max(axes[1])=%i).' %
+                             (b.ndim, numpy.max(numpy.array(b_axes))))
+
+        a_order = (tuple(x for x in tuple(xrange(a.ndim)) if x not in a_axes)
+                   + a_axes)
+        b_order = (b_axes
+                   + tuple(x for x in tuple(xrange(b.ndim)) if x not in b_axes))
+
+        a_shuffled = a.dimshuffle(a_order)
+        b_shuffled = b.dimshuffle(b_order)
+
+        # now that a and b are in the right order, call tensordot recursively
+        return tensordot(a_shuffled, b_shuffled, len(a_axes))

 def outer(x, y):
...
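To illustrate the three-way dispatch in the new dot() above, here is a minimal symbolic sketch (assuming a Theano build that includes this commit; the variable choices are illustrative):

    import theano.tensor as T

    s = T.dscalar('s')
    v = T.dvector('v')
    m = T.dmatrix('m')
    t = T.dtensor3('t')

    r1 = T.dot(s, m)   # case 1: a scalar operand -> elementwise product
    r2 = T.dot(t, m)   # case 2: ndim > 2 -> routed through tensordot
    r3 = T.dot(m, v)   # case 3: both operands 1-d or 2-d -> the Dot op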
theano/tensor/blas.py
@@ -1538,11 +1538,11 @@ class Dot22(GemmRelated):

 _dot22 = Dot22()

-@local_optimizer([T.dot])
+@local_optimizer([T._dot])
 def local_dot_to_dot22(node):
     # This works for tensor.outer too because basic.outer is a macro that
     # produces a dot(dimshuffle,dimshuffle) of form 4 below
-    if node.op != T.dot:
+    if not isinstance(node.op, T.Dot):
         return
     x, y = node.inputs
...
theano/tensor/opt.py
@@ -416,7 +416,8 @@ def local_lift_transpose_through_dot(node):

     if not (isinstance(node.op, T.DimShuffle)
             and node.op.new_order == (1, 0)):
         return False
-    if not (node.inputs[0].owner and node.inputs[0].owner.op == T.dot):
+    if not (node.inputs[0].owner and
+            isinstance(node.inputs[0].owner.op, T.Dot)):
         return False
     x, y = node.inputs[0].owner.inputs
...
theano/tensor/tests/test_basic.py
@@ -32,11 +32,11 @@ from theano.tensor import (_shared, wvector, bvector, autocast_float_as,
     tensor4, permute_row_elements, Flatten, fmatrix, fscalars, grad,
     inplace, iscalar, matrix, minimum, matrices, maximum, mul, neq,
     Reshape, row, scalar, scalars, second, smallest, stack, sub, Tensor,
-    tensor_copy, tensordot, tensordot_grad, TensorType, unbroadcast,
+    tensor_copy, tensordot, TensorType, unbroadcast,
     var, Join, shape, MaxAndArgmax, lscalar, zvector, exp,
     get_scalar_constant_value, ivector, reshape, scalar_from_tensor, scal,
     iscalars, arange, dscalars, fvector, imatrix, numeric_grad,
-    opt, ComplexError, TensorDot, lvector, true_div, max, min, Split, roll,
+    opt, ComplexError, lvector, true_div, max, min, Split, roll,
     tile, patternbroadcast, Eye, Shape, Default, Dot, PermuteRowElements,
     ScalarFromTensor, TensorFromScalar, dtensor4, Rebroadcast, Alloc,
     dtensor3, SpecifyShape, Mean, IncSubtensor, AdvancedIncSubtensor1,
...
@@ -4274,20 +4274,59 @@ class t_dot(unittest.TestCase):
         self.assertTrue(tz.shape == nz.shape)
         self.assertTrue(_approx_eq(nz, tz))

-    #def test_dot_0d_0d(self): self.cmp_dot(1.1, 2.2)
-    #def test_dot_0d_1d(self): self.cmp_dot(1.1, rand(5))
-    #def test_dot_0d_2d(self): self.cmp_dot(3.0, rand(6,7))
-    #def test_dot_0d_3d(self): self.cmp_dot(3.0, rand(8,6,7))
-    #def test_dot_1d_0d(self): self.cmp_dot(rand(5), 1.1 )
+    def test_Op_dims(self):
+        # _dot is a Dot op instance
+        _dot = theano.tensor.basic._dot
+        d0 = scalar()
+        d1 = vector()
+        d2 = matrix()
+        d3 = tensor3()
+
+        self.assertRaises(TypeError, _dot, d0, d0)
+        self.assertRaises(TypeError, _dot, d0, d1)
+        self.assertRaises(TypeError, _dot, d0, d2)
+        self.assertRaises(TypeError, _dot, d0, d3)
+        self.assertRaises(TypeError, _dot, d1, d0)
+        _dot(d1, d1)
+        _dot(d1, d2)
+        self.assertRaises(TypeError, _dot, d1, d3)
+        self.assertRaises(TypeError, _dot, d2, d0)
+        _dot(d2, d1)
+        _dot(d2, d2)
+        self.assertRaises(TypeError, _dot, d2, d3)
+        self.assertRaises(TypeError, _dot, d3, d0)
+        self.assertRaises(TypeError, _dot, d3, d1)
+        self.assertRaises(TypeError, _dot, d3, d2)
+        self.assertRaises(TypeError, _dot, d3, d3)
+
+    def test_dot_0d_0d(self):
+        self.cmp_dot(1.1, 2.2)
+
+    def test_dot_0d_1d(self):
+        self.cmp_dot(1.1, rand(5))
+
+    def test_dot_0d_2d(self):
+        self.cmp_dot(3.0, rand(6, 7))
+
+    def test_dot_0d_3d(self):
+        self.cmp_dot(3.0, rand(8, 6, 7))
+
+    def test_dot_1d_0d(self):
+        self.cmp_dot(rand(5), 1.1)

     def test_dot_1d_1d(self):
         self.cmp_dot(rand(5), rand(5))

     def test_dot_1d0_1d0(self):
         self.cmp_dot(rand(0), rand(0))

     #numpy return matrix not aligned...
-    #def test_dot_1d_1d0(self): self.cmp_dot(rand(5), rand(0))
+    def test_dot_1d_1d0(self):
+        self.assertRaises(ValueError, self.cmp_dot, rand(5), rand(0))

     #numpy return matrix not aligned...
-    #def test_dot_1d0_1d(self): self.cmp_dot(rand(0), rand(5))
+    def test_dot_1d0_1d(self):
+        self.assertRaises(ValueError, self.cmp_dot, rand(0), rand(5))

     def test_dot_1d_2d(self):
         self.cmp_dot(rand(6), rand(6, 7))
...
@@ -4300,8 +4339,12 @@ class t_dot(unittest.TestCase):

     def test_dot_1d0_2d0(self):
         self.cmp_dot(rand(0), rand(0, 0))

-    #def test_dot_1d_3d(self): self.cmp_dot(rand(6), rand(8,6,7))
-    #def test_dot_2d_0d(self): self.cmp_dot(rand(5,6), 1.0)
+    def test_dot_1d_3d(self):
+        self.cmp_dot(rand(6), rand(8, 6, 7))
+
+    def test_dot_2d_0d(self):
+        self.cmp_dot(rand(5, 6), 1.0)

     def test_dot_2d_1d(self):
         self.cmp_dot(rand(5, 6), rand(6))
...
@@ -4332,11 +4375,21 @@ class t_dot(unittest.TestCase):

     def test_dot_2d0_0_2d0(self):
         self.cmp_dot(rand(0, 6), rand(6, 0))

-    #def test_dot_2d_3d(self): self.cmp_dot(rand(5,6), rand(8,6,7))
-    #def test_dot_3d_0d(self): self.cmp_dot(rand(4,5,6), 1.0)
-    #def test_dot_3d_1d(self): self.cmp_dot(rand(4,5,6), rand(6))
-    #def test_dot_3d_2d(self): self.cmp_dot(rand(4,5,6), rand(6,7))
-    #def test_dot_3d_3d(self): self.cmp_dot(rand(4,5,6), rand(8,6,7))
+    def test_dot_2d_3d(self):
+        self.cmp_dot(rand(5, 6), rand(8, 6, 7))
+
+    def test_dot_3d_0d(self):
+        self.cmp_dot(rand(4, 5, 6), 1.0)
+
+    def test_dot_3d_1d(self):
+        self.cmp_dot(rand(4, 5, 6), rand(6))
+
+    def test_dot_3d_2d(self):
+        self.cmp_dot(rand(4, 5, 6), rand(6, 7))
+
+    def test_dot_3d_3d(self):
+        self.cmp_dot(rand(4, 5, 6), rand(8, 6, 7))

     def not_aligned(self, x, y):
         ctv_backup = config.compute_test_value
...
@@ -4376,39 +4429,53 @@ class t_dot(unittest.TestCase):
     def test_align_1_2(self):
         self.not_aligned(rand(5), rand(6, 4))

-    #def test_align_1_3(self): self.not_aligned(rand(5), rand(6,4,7))
+    def test_align_1_3(self):
+        self.not_aligned(rand(5), rand(6, 4, 7))

     def test_align_2_1(self):
         self.not_aligned(rand(5, 4), rand(6))

-    def test_align_2_1(self):
+    def test_align_2_2(self):
         self.not_aligned(rand(5, 4), rand(6, 7))

-    #def test_align_2_3(self): self.not_aligned(rand(5,4), rand(6,7,8))
-    #def test_align_3_1(self): self.not_aligned(rand(5,4,3), rand(6))
-    #def test_align_3_2(self): self.not_aligned(rand(5,4,3), rand(6,7))
-    #def test_align_3_3(self): self.not_aligned(rand(5,4,3), rand(6,7,8))
+    def test_align_2_3(self):
+        self.not_aligned(rand(5, 4), rand(6, 7, 8))
+
+    def test_align_3_1(self):
+        self.not_aligned(rand(5, 4, 3), rand(6))
+
+    def test_align_3_2(self):
+        self.not_aligned(rand(5, 4, 3), rand(6, 7))
+
+    def test_align_3_3(self):
+        self.not_aligned(rand(5, 4, 3), rand(6, 7, 8))

     def test_grad(self):
-        #utt.verify_grad(dot, [rand(2,3,4), rand(4)])
         utt.verify_grad(dot, [rand(2, 3), rand(3, 2)])
         utt.verify_grad(dot, [rand(2), rand(2, 3)])
         utt.verify_grad(dot, [rand(3, 2), rand(2)])
         utt.verify_grad(dot, [rand(2), rand(2)])
-        #utt.verify_grad(dot, [rand(), rand(2)])
-        #utt.verify_grad(dot, [rand(), rand(2,5)])
+        utt.verify_grad(dot, [rand(), rand(2)])
+        utt.verify_grad(dot, [rand(), rand(2, 5)])
+        utt.verify_grad(dot, [rand(2), rand()])
+        utt.verify_grad(dot, [rand(2, 5), rand()])
+        utt.verify_grad(dot, [rand(2, 3, 4), rand(4)])
+        utt.verify_grad(dot, [rand(3), rand(2, 3, 4)])
+        utt.verify_grad(dot, [rand(4, 3), rand(2, 3, 4)])
+        utt.verify_grad(dot, [rand(2, 3, 4), rand(4, 5)])
+        utt.verify_grad(dot, [rand(2, 3, 4), rand(3, 4, 5)])

     def test_broadcastable_patterns(self):
         #
-        # These examples hsould all work because we broadcastable or no, all dimensions of all
+        # These examples should all work because we broadcastable or no, all dimensions of all
         # results have size 1.
         #
         def val_for(r):
             if r.dtype.startswith('complex'):
                 # We want to test complex at the same time, so we give a value
                 # To the imaginary component.
-                # This stange way to doing thing is the only way that worked on
+                # This strange way of doing things is the only way that worked on
                 # numpy 1.4.1
                 if r.ndim == 0:
                     return numpy.asarray(numpy.complex(1.1, 2.1), dtype=r.dtype)
...
@@ -4626,7 +4693,8 @@ class T_op_cache(unittest.TestCase):
         utt.seed_rng()

     def test0(self):
-        """trigger bug in ticket #162"""
+        """trigger bug in ticket #162
+        """
         lr = constant(0.011)
         v = matrix()
         v.name = 'v'
...
@@ -5367,6 +5435,13 @@ class TestPermuteRowElements(unittest.TestCase):
 class test_tensordot(unittest.TestCase):
+    def TensorDot(self, axes):
+        """
+        Since tensordot is no longer an op, mimic the old op signature
+        to allow easy use of verify_grad.
+        """
+        return lambda a, b: tensordot(a, b, axes)
+
     def setUp(self):
         utt.seed_rng()
...
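The helper above exists because verify_grad wants a callable of the tensor inputs only, so `axes` is bound in a closure. A standalone sketch of the same pattern (assuming this commit's tensordot; shapes are illustrative):

    import numpy
    import theano.tensor as T
    from theano.tests import unittest_tools as utt

    def make_tensordot(axes):
        # bind axes; verify_grad will call the lambda with numeric points
        return lambda a, b: T.tensordot(a, b, axes)

    utt.verify_grad(make_tensordot(1), [numpy.random.rand(4, 5),
                                        numpy.random.rand(5, 3)])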
@@ -5382,7 +5457,7 @@ class test_tensordot(unittest.TestCase):

         bval = rand(5)
         self.assertTrue(numpy.tensordot(aval, bval, axes) == \
                         f1(aval, bval))
-        utt.verify_grad(TensorDot(axes), [aval, bval])
+        utt.verify_grad(self.TensorDot(axes), [aval, bval])

         # Test matrix-vector
         bmat = matrix()
...
@@ -5393,7 +5468,7 @@ class test_tensordot(unittest.TestCase):

         bval = rand(8, 5)
         self.assertTrue(numpy.allclose(numpy.tensordot(aval, bval, axes),
                                        f2(aval, bval)))
-        utt.verify_grad(TensorDot(axes), [aval, bval])
+        utt.verify_grad(self.TensorDot(axes), [aval, bval])

         # Test matrix-matrix
         amat = matrix()
...
@@ -5412,7 +5487,7 @@ class test_tensordot(unittest.TestCase):

         bval = rand(*shps[1])
         self.assertTrue(numpy.allclose(numpy.tensordot(aval, bval, axes),
                                        f3(aval, bval)))
-        utt.verify_grad(TensorDot(axes), [aval, bval])
+        utt.verify_grad(self.TensorDot(axes), [aval, bval])

         # Test ndarray-matrix, sum over one dim of matrix
         for axes, shps in [[((2,), (1,)), [(1, 2, 3, 4), (2, 3)]],
...
@@ -5430,7 +5505,7 @@ class test_tensordot(unittest.TestCase):

         bval = rand(*shps[1])
         self.assertTrue(numpy.allclose(numpy.tensordot(aval, bval, axes),
                                        f4(aval, bval)))
-        utt.verify_grad(TensorDot(axes), [aval, bval])
+        utt.verify_grad(self.TensorDot(axes), [aval, bval])

         # Test ndarray-ndarray
         atens = tensor4()
...
@@ -5442,14 +5517,14 @@ class test_tensordot(unittest.TestCase):

         bval = rand(3, 4, 2)
         self.assertTrue(numpy.allclose(numpy.tensordot(aval, bval, axes),
                                        f5(aval, bval)))
-        utt.verify_grad(TensorDot(axes), [aval, bval])
+        utt.verify_grad(self.TensorDot(axes), [aval, bval])

         axes = (axes[1], axes[0])
         c = tensordot(btens, atens, axes)
         f6 = inplace_func([btens, atens], c)
         self.assertTrue(numpy.allclose(numpy.tensordot(bval, aval, axes),
                                        f6(bval, aval)))
-        utt.verify_grad(TensorDot(axes), [bval, aval])
+        utt.verify_grad(self.TensorDot(axes), [bval, aval])

     def test_raise_error(self):
         amat = matrix()
...
@@ -5457,52 +5532,38 @@ class test_tensordot(unittest.TestCase):
         bvec = vector()

         # Test invalid length for axes
-        try:
-            c = tensordot(amat, bmat, (0, 1, 2))
-            assert False
-        except ValueError:
-            pass
+        self.assertRaises(ValueError, tensordot, amat, bmat, (0, 1, 2))

         # Test axes of uneven length
-        try:
-            c = tensordot(amat, bmat, ((0, 1), (0)))
-            assert False
-        except ValueError:
-            pass
+        self.assertRaises(ValueError, tensordot, amat, bmat, ((0, 1), (0)))

         # Test invalid len(axes) given inputs are matrices
-        try:
-            c = tensordot(amat, bmat, ((0, 1, 2), (0, 1, 2)))
-            assert False
-        except ValueError:
-            pass
+        self.assertRaises(ValueError, tensordot, amat, bmat,
+                          ((0, 1, 2), (0, 1, 2)))

         # Test invalid axes[1] given that y is a vector
-        try:
-            c = tensordot(amat, bvec, (0, 1))
-            assert False
-        except ValueError:
-            pass
+        self.assertRaises(ValueError, tensordot, amat, bvec, (0, 1))

         # Test invalid scalar axes given inputs are matrices
-        try:
-            c = tensordot(amat, bvec, 2)
-            assert False
-        except ValueError:
-            pass
+        self.assertRaises(ValueError, tensordot, amat, bvec, 2)

     def test_weird_valid_axes(self):
         # Test matrix-matrix
         amat = matrix()
         bmat = matrix()
-        for axes in 0, (1, 0), [1, 0], (1, (0, )), ((1, ), 0), ([1], [0]):
+        for axes in [0, (1, 0), [1, 0], (1, (0, )), ((1, ), 0), ([1], [0]),
+                     ([], [])]:
             c = tensordot(amat, bmat, axes)
             f3 = inplace_func([amat, bmat], c)
             aval = rand(4, 7)
             bval = rand(7, 9)
             self.assertTrue(numpy.allclose(numpy.tensordot(aval, bval, axes),
                                            f3(aval, bval)))
-            utt.verify_grad(TensorDot(axes), [aval, bval])
+            utt.verify_grad(self.TensorDot(axes), [aval, bval])

     def test_scalar_axes(self):
         # Test matrix-matrix
...
@@ -5516,7 +5577,7 @@ class test_tensordot(unittest.TestCase):

         f3 = inplace_func([amat, bmat], c)
         self.assertTrue(numpy.allclose(numpy.tensordot(aval, bval, axes),
                                        f3(aval, bval)))
-        utt.verify_grad(TensorDot(axes), [aval, bval])
+        utt.verify_grad(self.TensorDot(axes), [aval, bval])

         # Test tensor-tensor
         amat = tensor3()
...
@@ -5528,7 +5589,7 @@ class test_tensordot(unittest.TestCase):

         f3 = inplace_func([amat, bmat], c)
         self.assertTrue(numpy.allclose(numpy.tensordot(aval, bval, axes),
                                        f3(aval, bval)))
-        utt.verify_grad(TensorDot(axes), [aval, bval])
+        utt.verify_grad(self.TensorDot(axes), [aval, bval])

     def test_scalar0(self):
         # Test tensor-tensor
...
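The rewrite in test_raise_error above is a standard simplification: assertRaises replaces the manual try/except-else-fail pattern. A self-contained sketch of the equivalence:

    import unittest

    class Demo(unittest.TestCase):
        def test_old_style(self):
            # manual pattern: fail unless the expected error is raised
            try:
                int('not a number')
                assert False
            except ValueError:
                pass

        def test_new_style(self):
            # one-line equivalent
            self.assertRaises(ValueError, int, 'not a number')

    if __name__ == '__main__':
        unittest.main()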
@@ -5541,26 +5602,7 @@ class test_tensordot(unittest.TestCase):

         f3 = inplace_func([amat, bmat], c)
         self.assertTrue(numpy.allclose(numpy.tensordot(aval, bval, axes),
                                        f3(aval, bval)))
-        utt.verify_grad(TensorDot(axes), [aval, bval])
-
-    def test_tensordot_grad(self):
-        # We test it manually as we recreate the op in the make_node
-        amat = matrix()
-        bmat = matrix()
-        gzmat = matrix()
-        axes = 1
-        aval = rand(4, 5)
-        bval = rand(5, 3)
-        gzval = rand(4, 3)
-        f1 = inplace_func([amat, bmat, gzmat],
-                          tensordot_grad(axes)(amat, bmat, gzmat))
-        f2 = inplace_func([amat, bmat, gzmat],
-                          tensordot_grad(((1,), (0,)))(amat, bmat, gzmat))
-        o1 = f1(aval, bval, gzval)
-        o2 = f2(aval, bval, gzval)
-        self.assertTrue(numpy.allclose(o1[0], o2[0]))
-        self.assertTrue(numpy.allclose(o1[1], o2[1]))
+        utt.verify_grad(self.TensorDot(axes), [aval, bval])

 def test_smallest_stack():
...
@@ -6390,180 +6432,6 @@ class TestInferShape(utt.InferShapeTester):
def
test_infer_shape
(
self
):
def
test_infer_shape
(
self
):
# tensordot_grad
admat
=
dmatrix
()
bdmat
=
dmatrix
()
gzdmat
=
dmatrix
()
admat_val
=
rand
(
4
,
5
)
bdmat_val
=
rand
(
5
,
3
)
gzdmat_val
=
rand
(
4
,
3
)
axes
=
1
self
.
_compile_and_check
([
admat
,
bdmat
,
gzdmat
],
tensordot_grad
(
axes
)(
admat
,
bdmat
,
gzdmat
),
[
admat_val
,
bdmat_val
,
gzdmat_val
],
tensordot_grad
)
admat_val
=
rand
(
5
,
4
)
bdmat_val
=
rand
(
5
,
4
)
gzdscal
=
dscalar
()
gzdscal_val
=
rand
()
axes
=
2
self
.
_compile_and_check
([
admat
,
bdmat
,
gzdscal
],
tensordot_grad
(
axes
)(
admat
,
bdmat
,
gzdscal
),
[
admat_val
,
bdmat_val
,
gzdscal_val
],
tensordot_grad
)
admat_val
=
rand
(
4
,
5
)
bdmat_val
=
rand
(
5
,
3
)
gzdmat_val
=
rand
(
4
,
3
)
axes
=
((
1
,
),
(
0
,
))
self
.
_compile_and_check
([
admat
,
bdmat
,
gzdmat
],
tensordot_grad
(
axes
)(
admat
,
bdmat
,
gzdmat
),
[
admat_val
,
bdmat_val
,
gzdmat_val
],
tensordot_grad
)
axes
=
((
1
,
0
))
self
.
_compile_and_check
([
admat
,
bdmat
,
gzdmat
],
tensordot_grad
(
axes
)(
admat
,
bdmat
,
gzdmat
),
[
admat_val
,
bdmat_val
,
gzdmat_val
],
tensordot_grad
)
admat_val
=
rand
(
4
,
5
)
bdmat_val
=
rand
(
3
,
4
)
gzdmat_val
=
rand
(
5
,
3
)
axes
=
((
0
,
),
(
1
,
))
self
.
_compile_and_check
([
admat
,
bdmat
,
gzdmat
],
tensordot_grad
(
axes
)(
admat
,
bdmat
,
gzdmat
),
[
admat_val
,
bdmat_val
,
gzdmat_val
],
tensordot_grad
)
gzdscal
=
dscalar
()
admat_val
=
rand
(
5
,
4
)
bdmat_val
=
rand
(
5
,
4
)
gzdscal_val
=
rand
()
axes
=
((
0
,
1
),
(
0
,
1
))
self
.
_compile_and_check
([
admat
,
bdmat
,
gzdscal
],
tensordot_grad
(
axes
)(
admat
,
bdmat
,
gzdscal
),
[
admat_val
,
bdmat_val
,
gzdscal_val
],
tensordot_grad
)
# tensordot_grad currently do not support not ordered axes
"""
gzdscal = dscalar()
admat_val = rand(5, 4)
bdmat_val = rand(4, 5)
gzdscal_val = rand()
axes = ((0, 1), (1, 0))
self._compile_and_check([admat, bdmat, gzdscal],
tensordot_grad(axes)(admat, bdmat, gzdscal),
[admat_val, bdmat_val, gzdscal_val], tensordot_grad)
gzdscal = dscalar()
admat_val = rand(5, 4)
bdmat_val = rand(5, 4)
gzdscal_val = rand()
axes = ((1, 0 ), (1, 0))
self._compile_and_check([admat, bdmat, gzdscal],
tensordot_grad(axes)(admat, bdmat, gzdscal),
[admat_val, bdmat_val, gzdscal_val], tensordot_grad)
"""
# tensordot
admat
=
dmatrix
()
bdmat
=
dmatrix
()
admat_val
=
rand
(
4
,
5
)
bdmat_val
=
rand
(
5
,
3
)
axes
=
1
self
.
_compile_and_check
([
admat
,
bdmat
],
[
TensorDot
(
axes
)(
admat
,
bdmat
)],
[
admat_val
,
bdmat_val
],
TensorDot
)
admat_val
=
rand
(
5
,
4
)
bdmat_val
=
rand
(
5
,
4
)
axes
=
2
self
.
_compile_and_check
([
admat
,
bdmat
],
[
TensorDot
(
axes
)(
admat
,
bdmat
)],
[
admat_val
,
bdmat_val
],
TensorDot
)
admat_val
=
rand
(
4
,
5
)
bdmat_val
=
rand
(
5
,
3
)
axes
=
((
1
,
),
(
0
,
))
self
.
_compile_and_check
([
admat
,
bdmat
],
[
TensorDot
(
axes
)(
admat
,
bdmat
)],
[
admat_val
,
bdmat_val
],
TensorDot
)
axes
=
((
1
,
0
))
self
.
_compile_and_check
([
admat
,
bdmat
],
[
TensorDot
(
axes
)(
admat
,
bdmat
)],
[
admat_val
,
bdmat_val
],
TensorDot
)
admat_val
=
rand
(
4
,
5
)
bdmat_val
=
rand
(
3
,
4
)
axes
=
((
0
,
),
(
1
,
))
self
.
_compile_and_check
([
admat
,
bdmat
],
[
TensorDot
(
axes
)(
admat
,
bdmat
)],
[
admat_val
,
bdmat_val
],
TensorDot
)
axes
=
((
0
,
1
))
self
.
_compile_and_check
([
admat
,
bdmat
],
[
TensorDot
(
axes
)(
admat
,
bdmat
)],
[
admat_val
,
bdmat_val
],
TensorDot
)
admat_val
=
rand
(
5
,
4
)
bdmat_val
=
rand
(
4
,
5
)
axes
=
((
1
,),
(
0
,))
self
.
_compile_and_check
([
admat
,
bdmat
],
[
TensorDot
(
axes
)(
admat
,
bdmat
)],
[
admat_val
,
bdmat_val
],
TensorDot
)
axes
=
((
0
,
1
),
(
1
,
0
))
self
.
_compile_and_check
([
admat
,
bdmat
],
[
TensorDot
(
axes
)(
admat
,
bdmat
)],
[
admat_val
,
bdmat_val
],
TensorDot
)
admat_val
=
rand
(
5
,
4
)
bdmat_val
=
rand
(
5
,
4
)
axes
=
((
0
,
1
),
(
0
,
1
))
self
.
_compile_and_check
([
admat
,
bdmat
],
[
TensorDot
(
axes
)(
admat
,
bdmat
)],
[
admat_val
,
bdmat_val
],
TensorDot
)
admat_val
=
rand
(
5
,
4
)
bdmat_val
=
rand
(
4
,
5
)
axes
=
((
1
,
0
),
(
0
,
1
))
self
.
_compile_and_check
([
admat
,
bdmat
],
[
TensorDot
(
axes
)(
admat
,
bdmat
)],
[
admat_val
,
bdmat_val
],
TensorDot
)
adtens3
=
dtensor3
()
admat_val
=
rand
(
5
,
4
)
adtens3_val
=
rand
(
5
,
4
,
3
)
axes
=
2
self
.
_compile_and_check
([
admat
,
adtens3
],
[
TensorDot
(
axes
)(
admat
,
adtens3
)],
[
admat_val
,
adtens3_val
],
TensorDot
)
adtens3_val
=
rand
(
4
,
5
,
3
)
axes
=
((
1
,
0
),
(
0
,
1
))
self
.
_compile_and_check
([
admat
,
adtens3
],
[
TensorDot
(
axes
)(
admat
,
adtens3
)],
[
admat_val
,
adtens3_val
],
TensorDot
)
adtens3_val
=
rand
(
4
,
3
,
5
)
axes
=
((
1
,
0
),
(
0
,
2
))
self
.
_compile_and_check
([
admat
,
adtens3
],
[
TensorDot
(
axes
)(
admat
,
adtens3
)],
[
admat_val
,
adtens3_val
],
TensorDot
)
adtens4
=
dtensor4
()
admat_val
=
rand
(
5
,
4
)
adtens4_val
=
rand
(
5
,
4
,
3
,
2
)
axes
=
2
self
.
_compile_and_check
([
admat
,
adtens4
],
[
TensorDot
(
axes
)(
admat
,
adtens4
)],
[
admat_val
,
adtens4_val
],
TensorDot
)
adtens4_val
=
rand
(
4
,
3
,
2
,
5
)
axes
=
((
1
,
0
),
(
0
,
3
))
self
.
_compile_and_check
([
admat
,
adtens4
],
[
TensorDot
(
axes
)(
admat
,
adtens4
)],
[
admat_val
,
adtens4_val
],
TensorDot
)
# Flatten
# Flatten
atens3
=
tensor3
()
atens3
=
tensor3
()
atens3_val
=
rand
(
4
,
5
,
3
)
atens3_val
=
rand
(
4
,
5
,
3
)
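The removed TensorDot infer-shape cases (matrix against tensor3/tensor4 with paired axes) are still expressible through the function-based tensordot that replaces the op. A sketch of one of the deleted cases rewritten against the new interface and checked against numpy; the values mirror the deleted test data, but the rewrite itself is illustrative, not part of the patch:

import numpy
import theano
import theano.tensor as T

amat = T.dmatrix('amat')
atens3 = T.dtensor3('atens3')
# Contract axis 1 of amat with axis 0 of atens3, and axis 0 of amat with
# axis 1 of atens3 -- the removed axes = ((1, 0), (0, 1)) case.
out = T.tensordot(amat, atens3, axes=[[1, 0], [0, 1]])
f = theano.function([amat, atens3], out)

amat_val = numpy.random.rand(5, 4)
atens3_val = numpy.random.rand(4, 5, 3)
res = f(amat_val, atens3_val)
assert res.shape == (3,)  # both axes of amat are summed away
assert numpy.allclose(res,
                      numpy.tensordot(amat_val, atens3_val,
                                      axes=[[1, 0], [0, 1]]))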
 ...
@@ -6640,6 +6508,8 @@ class TestInferShape(utt.InferShapeTester):
                                 [adtens_val], (opt.MakeVector, Shape))
 
         # Dot
+        #vec/vec
         advec = dvector()
         bdvec = dvector()
         advec_val = rand(4)
 ...
@@ -6649,6 +6519,9 @@ class TestInferShape(utt.InferShapeTester):
                                 [advec_val, bdvec_val],
                                 (Dot, tensor.blas.Gemv, tensor.blas_c.CGemv))
 
+        #mat/mat
+        admat = dmatrix()
+        bdmat = dmatrix()
         admat_val = rand(4, 5)
         bdmat_val = rand(5, 3)
         self._compile_and_check([admat, bdmat],
 ...
@@ -6656,18 +6529,20 @@ class TestInferShape(utt.InferShapeTester):
                                 [admat_val, bdmat_val],
                                 (Dot, tensor.blas.Dot22))
 
-        admat_val = rand(5, 4)
-        self._compile_and_check([admat, advec],
-                                [Dot()(admat, advec)],
-                                [admat_val, advec_val],
-                                (Dot, tensor.blas.Gemv, tensor.blas_c.CGemv))
+        #vec/mat
         bdmat_val = rand(4, 5)
         self._compile_and_check([advec, bdmat],
                                 [Dot()(advec, bdmat)],
                                 [advec_val, bdmat_val],
                                 (Dot, tensor.blas.Gemv, tensor.blas_c.CGemv))
 
+        #mat/vec
+        admat_val = rand(5, 4)
+        self._compile_and_check([admat, bdvec],
+                                [Dot()(admat, bdvec)],
+                                [admat_val, bdvec_val],
+                                (Dot, tensor.blas.Gemv, tensor.blas_c.CGemv))
 
         # Split
         aivec = ivector()
         adtens_val = rand(4, 10, 3)
 ...
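These _compile_and_check calls assert two things at once: that the shape Theano infers for the Dot expression matches the shape of the actually computed output, and that only the listed ops (Dot plus the BLAS replacements Gemv/CGemv/Dot22) remain in the optimized graph. A rough stand-alone version of the shape half, assuming default optimizations (`_compile_and_check` itself is the test suite's helper, not public API):

import numpy
import theano
import theano.tensor as T

x = T.dmatrix('x')
y = T.dmatrix('y')
z = T.dot(x, y)
# Compiling z.shape lets Theano's shape inference produce the output
# shape symbolically, without (in general) computing z itself.
shape_f = theano.function([x, y], z.shape)

xv = numpy.random.rand(4, 5)
yv = numpy.random.rand(5, 3)
assert tuple(shape_f(xv, yv)) == numpy.dot(xv, yv).shape  # (4, 3)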
theano/tensor/tests/test_blas.py
View file @ d13064f7
 ...
@@ -14,7 +14,6 @@ from numpy import (arange, array, common_type, complex64, complex128, float32,
 from numpy.testing import assert_array_almost_equal
 #from numpy.testing import dec
 #from numpy.testing.noseclasses import KnownFailureTest
 from theano.tensor.blas import (_dot22, _dot22scalar, res_is_a, _as_scalar,
                                 _is_real_matrix, _gemm_canonicalize,
                                 _factor_canonicalized, Gemm, Gemv,
 ...
@@ -479,7 +478,7 @@ def just_gemm(i, o, ishapes=[(4, 3), (3, 5), (4, 5), (), ()],
                       on_unused_input='ignore')
     nb_gemm = 0
     for node in f.maker.fgraph.apply_nodes:
-        if node.op == T.dot:
+        if isinstance(node.op, T.Dot):
             raise Failure('dot not changed to gemm_inplace in graph')
         if node.op == _dot22:
             raise Failure('_dot22 not changed to gemm_inplace in graph')
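The `node.op == T.dot` comparisons had to change because, after this merge, `T.dot` is no longer itself the single op instance that appears in compiled graphs: dot is dispatched through the `Dot` class (the change that allows n-dimensional operands), so two dot nodes need not share one op object and the tests now ask whether a node's op is an *instance* of `T.Dot`. A sketch of that pattern, assuming an unoptimized compile mode so the Dot node survives BLAS rewriting:

import theano
import theano.tensor as T

x = T.dmatrix('x')
y = T.dmatrix('y')
f = theano.function([x, y], T.dot(x, y), mode='FAST_COMPILE')

# Identity comparison against T.dot no longer works; check the op's class.
dot_nodes = [n for n in f.maker.fgraph.apply_nodes
             if isinstance(n.op, T.Dot)]
assert len(dot_nodes) == 1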
 ...
@@ -562,7 +561,7 @@ def test_gemm_opt_double_gemm():
     f = inplace_func([Param(ii, mutable=True) for ii in i], o,
                      mode='FAST_RUN', on_unused_input='ignore')
     for node in f.maker.fgraph.apply_nodes:
-        if node.op == T.dot:
+        if isinstance(node.op, T.Dot):
             raise Failure('dot in graph')
         if node.op == _dot22:
             raise Failure('_dot22 in graph')
 ...
@@ -857,7 +856,9 @@ def test_dot22():
     if dtype1 == dtype2:
         assert _dot22 in [x.op for x in topo], (dtype1, dtype2)
     else:
-        assert T.dot in [x.op for x in topo], (dtype1, dtype2)
+        check = [isinstance(x.op, T.Dot) for x in topo]
+        from theano.gof.python25 import any
+        assert any(check), (dtype1, dtype2)
 
     rng = numpy.random.RandomState(unittest_tools.fetch_seed())
 
     def cmp(a_shp, b_shp):
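The extra `from theano.gof.python25 import any` is a compatibility shim, presumably so this check still runs on Python 2.4, where `any` is not yet a builtin; on 2.5 and later the shim behaves like the builtin.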
 ...
@@ -919,8 +920,8 @@ def test_dot22scalar():
         assert _dot22 in ops, (dtype1, dtype2,
                                dtype3, dtype4)
     else:
-        assert T.dot in ops, (dtype1, dtype2,
-                              dtype3, dtype4)
+        check = [isinstance(o, T.Dot) for o in ops]
+        assert any(check), (dtype1, dtype2, dtype3, dtype4)
 
     def cmp(a_shp, b_shp, c_shp, sqr_shp=(5, 5)):
         av = rng.uniform(size=a_shp).astype(dtype1)
 ...