Remove the GPU-specific tensordot implementation and the local_gpu_tensordot optimization, along with their tests; they are no longer needed.

1ba75ec3 · Jeremiah Lowin · f654e792 · 1ba75ec3 · 1ba75ec3 · 1ba75ec3
--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -2837,54 +2837,6 @@ class GpuContiguous(GpuOp):
 gpu_contiguous = GpuContiguous()


-def tensordot(a, b, axes=2):
-    """
-    Implementation of tensordot that reduces to a regular matrix product.
-
-    This allows tensordot to be GPU accelerated, which isn't possible
-    with the default Theano implementation (which is just a wrapper
-    around numpy.tensordot). based on code from Tijmen Tieleman's gnumpy
-    http://www.cs.toronto.edu/~tijmen/gnumpy.html
-    """
-    if numpy.isscalar(axes):
-        # if 'axes' is a number of axes to multiply and sum over (trailing axes
-        # of a, leading axes of b), we can just reshape and use dot.
-        outshape = tensor.concatenate([a.shape[:a.ndim - axes],
-                                      b.shape[axes:]])
-        outndim = a.ndim + b.ndim - (2 * axes)
-        a_reshaped = a.reshape((tensor.prod(a.shape[:a.ndim - axes]),
-                                tensor.prod(a.shape[a.ndim - axes:])))
-        b_reshaped = b.reshape((tensor.prod(b.shape[:axes]),
-                                tensor.prod(b.shape[axes:])))
-        assert a_reshaped.ndim == 2
-        assert b_reshaped.ndim == 2
-        # We use _dot22 here because:
-        #   - we know that the number of dimensions will be 2
-        #   - it makes it possible for the computation to be moved to GPU
-        # When cuda.opt.local_gpu_tensordot is applied, it is too late
-        # for the usual blas optimizations to take place.
-        # This will change if we decide to get rid of tensor.tensordot,
-        # and always use this version.
-        return tensor.blas._dot22(a_reshaped, b_reshaped).reshape(
-                outshape, ndim=outndim)
-    elif len(axes) == 2:
-        # if 'axes' is a pair of axis lists, we first shuffle the axes of a and
-        # b to reduce this to the first case (note the recursion).
-        a_other, b_other = tuple(axes[0]), tuple(axes[1])
-        num_axes = len(a_other)
-        a_order = (tuple(x for x in tuple(xrange(a.ndim)) if x not in a_other)
-                + a_other)
-        b_order = (b_other
-                + tuple(x for x in tuple(xrange(b.ndim)) if x not in b_other))
-        a_shuffled = a.dimshuffle(a_order)
-        b_shuffled = b.dimshuffle(b_order)
-        return tensordot(a_shuffled, b_shuffled, num_axes)
-    else:
-        raise ValueError(
-            "Axes should be scalar valued or a list/tuple of len 2.",
-            axes)
-
-
 # Those are predifined CudaNdarrayType as done in tensor.basic
 # Useful mostly for test as the gpu op are inserted automatically...
 def scalar(name=None, dtype=None):

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -891,35 +891,6 @@ def local_gpu_print_op(node):
    return False


-@register_opt()
-@local_optimizer([tensor.TensorDot])
-def local_gpu_tensordot(node):
-    '''
-    T.tensordot(host_from_gpu) -> basic_ops.tensordot(host_from_gpu)
-
-    There is no Cuda Op for tensordot, however we can build a chain of
-    CPU Ops implementing tensordot. These Ops all have a GPU equivalent.
-
-    Note: applying this optimization at that stage is not ideal, because
-    all blas-related optimizations have already been applied.
-    However, if we want to apply it before the blas optimizations, then
-    we don't know which variables may end up on the GPU or not.
-    '''
-    if (isinstance(node.op, tensor.TensorDot) and
-            node.outputs[0].dtype == 'float32'):
-        x, y = node.inputs
-        if ((x.owner and
-                x.owner.op == host_from_gpu and
-                y.dtype == 'float32') or
-            (y.owner and
-                y.owner.op == host_from_gpu and
-                x.dtype == 'float32')):
-
-            axes = node.op.axes
-            out = tensordot(x, y, axes=axes)
-            return [out]
-
-
 def cast(x, dtype):
    stype = scal.Scalar(dtype)
    cast_op = theano.tensor.Elemwise(scal.Identity(scal.specific_out(stype)))

--- a/theano/sandbox/cuda/tests/test_basic_ops.py
+++ b/theano/sandbox/cuda/tests/test_basic_ops.py
@@ -1129,54 +1129,6 @@ def test_shared_cudandarray():
    assert isinstance(a.type, tcn.CudaNdarrayType)


-class test_tensordot_reshape(unittest.TestCase):
-    '''Test alternative tensordot implementation.
-
-    Test that the tensordot implementation using dimshuffle, reshape and dot
-    gives the same results as the default (numpy) version.
-    '''
-
-    def setUp(self):
-        self.rng = numpy.random.RandomState(utt.fetch_seed())
-
-    def test1(self):
-        # define some tensors
-        tensor1 = self.rng.rand(20, 10, 5, 8).astype(theano.config.floatX)
-        tensor2 = self.rng.rand(5, 8, 20).astype(theano.config.floatX)
-        tensor3 = self.rng.rand(8, 20, 5).astype(theano.config.floatX)
-
-        x = T.tensor4('x')
-        y = T.tensor3('y')
-
-        # case 1: number of axes to sum over
-        default1 = theano.function([x, y], T.tensordot(x, y, 2))(
-                tensor1, tensor2)
-        reshape1 = theano.function([x, y], B.tensordot(x, y, 2))(
-                tensor1, tensor2)
-        assert numpy.allclose(default1, reshape1)
-
-        # case 2: axis pairs
-        default2 = theano.function(
-                [x, y],
-                T.tensordot(x, y, axes=[(0, 3), (1, 0)])
-                )(tensor1, tensor3)
-        reshape2 = theano.function(
-                [x, y],
-                B.tensordot(x, y, axes=[(0, 3), (1, 0)])
-                )(tensor1, tensor3)
-        assert numpy.allclose(default2, reshape2)
-
-        default3 = theano.function(
-                [x, y],
-                T.tensordot(x, y, axes=[(0, 3, 2), (1, 0, 2)])
-                )(tensor1, tensor3)
-        reshape3 = theano.function(
-                [x, y],
-                B.tensordot(x, y, axes=[(0, 3, 2), (1, 0, 2)])
-                )(tensor1, tensor3)
-        assert numpy.allclose(default3, reshape3)
-
-
 class test_size(unittest.TestCase):

    """

--- a/theano/sandbox/cuda/tests/test_opt.py
+++ b/theano/sandbox/cuda/tests/test_opt.py
@@ -333,38 +333,6 @@ def test_elemwise_fusion():
      theano._asarray(numpy.random.rand(*shape), dtype='float32'))


-class test_local_gpu_tensordot(unittest.TestCase):
-    def setUp(self):
-        self.rng = numpy.random.RandomState(utt.fetch_seed())
-
-    def test_transfer(self):
-        tensor1 = self.rng.rand(20, 10, 5, 8).astype('float32')
-        tensor2 = self.rng.rand(5, 8, 20).astype('float32')
-        tensor3 = self.rng.rand(8, 20, 5).astype('float32')
-
-        x = tensor.ftensor4('x')
-        y = tensor.ftensor3('y')
-
-        tdot1 = tensor.tensordot(x, y, 2)
-        f1 = theano.function([x, y], tdot1, mode=mode_with_gpu)
-        topo1 = f1.maker.fgraph.toposort()
-        assert topo1[-1].op == cuda.host_from_gpu
-        # Let DebugMode debug
-        f1(tensor1, tensor2)
-
-        tdot2 = tensor.tensordot(x, y, axes=[(0, 3), (1, 0)])
-        f2 = theano.function([x, y], tdot2, mode=mode_with_gpu)
-        topo2 = f2.maker.fgraph.toposort()
-        assert topo2[-1].op == cuda.host_from_gpu
-        f2(tensor1, tensor3)
-
-        tdot3 = tensor.tensordot(x, y, axes=[(0, 3, 2), (1, 0, 2)])
-        f3 = theano.function([x, y], tdot3, mode=mode_with_gpu)
-        topo3 = f3.maker.fgraph.toposort()
-        assert topo3[-1].op == cuda.host_from_gpu
-        f3(tensor1, tensor3)
-
-
 import theano.tests.test_ifelse