Remove tentacles in tensor.

1f34a482 · Arnaud Bergeron · 1b22389f · 1f34a482 · 1f34a482 · 1f34a482
--- a/theano/gpuarray/tests/test_others.py
+++ b/theano/gpuarray/tests/test_others.py
-from .config import test_ctx_name
+from .config import test_ctx_name, mode_with_gpu
-from ..type import get_context, GpuArrayType, GpuArraySharedVariable
+from ..type import (get_context, GpuArrayType, GpuArraySharedVariable,
+                    gpuarray_shared_constructor)
 import pygpu
 import numpy as np
 from theano.misc.tests.test_may_share_memory import may_share_memory_core
 from theano.misc.pkl_utils import dump, load
+from theano.tensor.tests.test_opt import test_fusion as t_fusion
+class test_fusion(t_fusion):
+    """Re-run the tensor elemwise fusion tests with the GPU mode and GPU shared variables."""
+    mode = mode_with_gpu
+    # The base class looks up `self._shared`; `staticmethod` keeps the
+    # constructor from being bound to the test instance on attribute access.
+    _shared = staticmethod(gpuarray_shared_constructor)
 def test_may_share_memory():
    ctx = get_context(test_ctx_name)

--- a/theano/gpuarray/tests/test_type.py
+++ b/theano/gpuarray/tests/test_type.py
@@ -9,6 +9,8 @@ from theano import config
 from theano.compile import DeepCopyOp
 from theano.misc.pkl_utils import CompatUnpickler
+from theano.tensor.tests.test_sharedvar import makeSharedTester
 from .config import test_ctx_name
 from .test_basic_ops import rand_gpuarray
 from ..type import GpuArrayType, gpuarray_shared_constructor
@@ -76,3 +78,41 @@ def test_unpickle_gpuarray_as_numpy_ndarray_flag0():
            assert np.asarray(mat)[0] == -42.0
    finally:
        config.experimental.unpickle_gpu_on_cpu = oldflag
+# Shared-variable contract tests for GpuArray shared variables, with the
+# borrow=True aliasing flags enabled.
+# NOTE(review): `theano`, `pygpu` and `get_context` are not among the imports
+# visible in this hunk -- confirm this module imports them.
+test_shared_options = makeSharedTester(
+    shared_constructor_=gpuarray_shared_constructor,
+    dtype_=theano.config.floatX,
+    get_value_borrow_true_alias_=True,
+    shared_borrow_true_alias_=True,
+    set_value_borrow_true_alias_=True,
+    set_value_inplace_=True,
+    set_cast_value_inplace_=False,
+    shared_constructor_accept_ndarray_=True,
+    internal_type_=lambda v: pygpu.array(v, context=get_context(test_ctx_name),
+                                         cls=pygpu._array.ndgpuarray),
+    test_internal_type_=lambda a: isinstance(a, pygpu.gpuarray.GpuArray),
+    theano_fct_=theano.tensor.exp,
+    # Use the `np` alias already used elsewhere in this test module.
+    ref_fct_=np.exp,
+    cast_value_=lambda v: pygpu.asarray(v, context=get_context(test_ctx_name),
+                                        cls=pygpu._array.ndgpuarray),
+    name='test_shared_options')
+# Shared-variable contract tests for GpuArray shared variables, with the
+# borrow=True aliasing flags disabled.
+# NOTE(review): `theano`, `pygpu` and `get_context` are not among the imports
+# visible in this hunk -- confirm this module imports them.
+test_shared_options2 = makeSharedTester(
+    shared_constructor_=gpuarray_shared_constructor,
+    dtype_=theano.config.floatX,
+    get_value_borrow_true_alias_=False,
+    shared_borrow_true_alias_=False,
+    set_value_borrow_true_alias_=False,
+    set_value_inplace_=True,
+    set_cast_value_inplace_=True,
+    shared_constructor_accept_ndarray_=True,
+    internal_type_=lambda v: pygpu.array(v, context=get_context(test_ctx_name),
+                                         cls=pygpu._array.ndgpuarray),
+    test_internal_type_=lambda a: isinstance(a, pygpu.gpuarray.GpuArray),
+    theano_fct_=theano.tensor.exp,
+    # Use the `np` alias already used elsewhere in this test module.
+    ref_fct_=np.exp,
+    cast_value_=lambda v: pygpu.asarray(v, context=get_context(test_ctx_name),
+                                        cls=pygpu._array.ndgpuarray),
+    name='test_shared_options2')
--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -107,22 +107,6 @@ def __oplist_tag(thing, tag):
    thing.__oplist_tags = tags
-if 0:
-    # this starts to feel like we're enumerating all the types
-    # the one place where this is used we should also allow for sparse
-    # variables
-    # - JB 20100226
-    def as_cuda_or_tensor_variable(x, name=None, ndim=None):
-        """
-        Do the same as_tensor_variable,
-        but do not transfer the value on the gpu.
-        """
-        if hasattr(x, '_as_CudaNdarrayVariable'):
-            # TODO: pass name and ndim arguments
-            return x._as_CudaNdarrayVariable()
-        return as_tensor_variable(x, name, ndim)
 def as_tensor_variable(x, name=None, ndim=None):
    """Return `x`, transformed into a `TensorType`.

--- a/theano/tensor/blas.py
+++ b/theano/tensor/blas.py
@@ -15,7 +15,7 @@ There are four kinds of BLAS Ops in Theano:
    - Python implementations (this file)
    - SciPy-based (blas_scipy)
    - C-based (blas_c)
-    - CUDA-based (theano.sandbox.cuda.blas)
+    - GPU-based (theano.gpuarray)
 Notes
 -----

--- a/theano/tensor/elemwise.py
+++ b/theano/tensor/elemwise.py
 from __future__ import absolute_import, print_function, division
-import sys
 from copy import copy
 import numpy as np

--- a/theano/tensor/nnet/Conv3D.py
+++ b/theano/tensor/nnet/Conv3D.py
@@ -573,10 +573,7 @@ def conv3D(V, W, b, d):
    The order of dimensions does not correspond to the one in `conv2d`.
    This is for optimization.
-    The GPU implementation is very slow. You should use
+    Please use nnet.conv3d instead of this for a faster GPU implementation.
-    :func:`conv3d2d <theano.tensor.nnet.conv3d2d.conv3d>` or
-    :func:`conv3d_fft <theano.sandbox.cuda.fftconv.conv3d_fft>` for a
-    GPU graph instead.
    See Also
    --------

--- a/theano/tensor/nnet/conv.py
+++ b/theano/tensor/nnet/conv.py
@@ -903,7 +903,6 @@ class ConvOp(OpenMPOp):
        newin = inputs.dimshuffle((1, 0, 2, 3))
        newgz = gz.dimshuffle((1, 0, 2, 3))
-        un_p = self.unroll_patch
        if self.out_mode == 'valid':
            (img, filters) = (newin, newgz)
            kshp_logical = self.fulloutshp
@@ -912,8 +911,6 @@ class ConvOp(OpenMPOp):
            (bsize, nkern) = (self.imshp[0], self.nkern)
            imshp = (self.bsize, self.imshp[1], self.imshp[2])
            kshp = self.outshp
-            un_b = self.unroll_batch
-            un_k = self.unroll_kern
        elif self.out_mode == 'full':
            (img, filters) = (newgz, newin)
            kshp_logical = None
@@ -924,8 +921,6 @@ class ConvOp(OpenMPOp):
            (bsize, nkern) = (self.nkern, self.imshp[0])
            imshp = (self.bsize, self.outshp[0], self.outshp[1])
            kshp = self.imshp[1:]
-            un_b = self.unroll_kern
-            un_k = self.unroll_batch
        else:
            raise NotImplementedError(
                'Only [full,valid] modes are currently supported.')

--- a/theano/tensor/nnet/conv3d2d.py
+++ b/theano/tensor/nnet/conv3d2d.py
@@ -4,7 +4,6 @@ from theano.gradient import DisconnectedType
 from theano.gof import Op, Apply, TopoOptimizer
 from theano.gof.opt import copy_stack_trace
 from theano import tensor
-import theano.sandbox.cuda as cuda
 def get_diagonal_subtensor_view(x, i0, i1):
@@ -16,7 +15,7 @@ def get_diagonal_subtensor_view(x, i0, i1):
    It returns a partial view of x, not a partial copy.
    """
-    # We have to cast i0 and i0 to int because python 2.4 (and maybe later)
+    # We have to cast i0 and i1 to int because python
    # do not support indexing with 0-dim, 'int*' ndarrays.
    i0 = int(i0)
    i1 = int(i1)
@@ -198,8 +197,7 @@ def conv3d(signals, filters,
    Another way to define signals: (batch,  time, in channel, row, column)
    Another way to define filters: (out channel,time,in channel, row, column)
-    For the GPU, you can use this implementation or
+    For the GPU, use nnet.conv3d.
-    :func:`conv3d_fft <theano.sandbox.cuda.fftconv.conv3d_fft>`.
    See Also
    --------
@@ -295,67 +293,6 @@ def conv3d(signals, filters,
    return out_5d
-def make_gpu_optimizer(op, to_gpu):
-    """
-    This function create optimizer that move some inputs to the GPU
-    for op that work on both CPU and GPU.
-    The op object is created by calling op(), so good default value
-    are needed.
-    We suppose the same op work with CPU and GPU inputs.
-    Parameters
-    ----------
-    op
-        The op that support GPU inputs.
-    to_gpu
-        A list of op inputs that are moved to the GPU.
-    """
-    @theano.gof.local_optimizer([op, cuda.gpu_from_host])
-    def local_to_gpu(node):
-        """
-        op(host_from_gpu()) -> host_from_gpu(op)
-        gpu_from_host(op) -> op(gpu_from_host)
-        """
-        if isinstance(node.op, op):
-            # op(host_from_gpu()) -> host_from_gpu(op)
-            # If any of the input that go on the GPU are on the GPU,
-            # move the op to the gpu.
-            if any(node.inputs[idx].owner and
-                   isinstance(node.inputs[idx].owner.op, cuda.HostFromGpu)
-                   for idx in to_gpu):
-                new_inp = list(node.inputs)
-                for idx in to_gpu:
-                    new_inp[idx] = cuda.gpu_from_host(new_inp[idx])
-                result_node = op()(*new_inp)
-                copy_stack_trace(node.outputs[0], result_node)
-                transfer_node = result_node.transfer('cpu')
-                copy_stack_trace(node.outputs[0], transfer_node)
-                return [transfer_node]
-        if node.op == cuda.gpu_from_host:
-            # gpu_from_host(op) -> op(gpu_from_host)
-            host_input = node.inputs[0]
-            if host_input.owner and isinstance(host_input.owner.op,
-                                               op):
-                op_node = host_input.owner
-                new_inp = list(op_node.inputs)
-                for idx in to_gpu:
-                    new_inp[idx] = cuda.gpu_from_host(new_inp[idx])
-                new_node = op()(*new_inp)
-                copy_stack_trace(host_input, new_node)
-                return [new_node]
-        return False
-    local_to_gpu.__name__ = "local_to_gpu_" + op.__name__
-    cuda.opt.register_opt()(local_to_gpu)
-if cuda.cuda_available:
-    make_gpu_optimizer(DiagonalSubtensor, [0])
-    make_gpu_optimizer(IncDiagonalSubtensor, [0, 3])
 @theano.gof.local_optimizer([DiagonalSubtensor, IncDiagonalSubtensor])
 def local_inplace_DiagonalSubtensor(node):
    """Also work for IncDiagonalSubtensor."""

--- a/theano/tensor/nnet/tests/test_conv3d2d.py
+++ b/theano/tensor/nnet/tests/test_conv3d2d.py
@@ -16,12 +16,6 @@ from theano.tensor.nnet.conv3d2d import conv3d, get_diagonal_subtensor_view, Dia
 import theano.tests.unittest_tools as utt
-if theano.config.mode == 'FAST_COMPILE':
-    mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
-else:
-    mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
 def test_get_diagonal_subtensor_view(wrap=lambda a: a):
    x = numpy.arange(20).reshape(5, 4).astype('float32')
    x = wrap(x)
@@ -106,17 +100,11 @@ def check_diagonal_subtensor_view_traces(fn):
 @parameterized.expand(('valid', 'full', 'half'), utt.custom_name_func)
 def test_conv3d(border_mode):
-    check_conv3d(border_mode=border_mode,
-                 mode=mode_without_gpu,
-                 shared=theano.tensor._shared)
-# This function will also be used in theano/sandbox/cuda/tests/test_tensor_op.py,
-# which is not possible if it is decorated by @parameterized.expand
-def check_conv3d(border_mode, mode=mode_without_gpu, shared=theano.tensor._shared):
    if ndimage is None or not theano.config.cxx:
        raise SkipTest("conv3d2d tests need SciPy and a c++ compiler")
+    shared = theano.tensor._shared
    Ns, Ts, C, Hs, Ws = 3, 10, 3, 32, 32
    Nf, Tf, C, Hf, Wf = 32, 5, 3, 5, 5
@@ -137,8 +125,7 @@ def check_conv3d(border_mode, mode=mode_without_gpu, shared=theano.tensor._share
                 border_mode=border_mode)
    newconv3d = theano.function([], [],
-                                updates={s_output: out},
+                                updates={s_output: out})
-                                mode=mode)
    check_diagonal_subtensor_view_traces(newconv3d)
    t0 = time.time()
@@ -149,7 +136,6 @@ def check_conv3d(border_mode, mode=mode_without_gpu, shared=theano.tensor._share
    gnewconv3d = theano.function([], [],
                                 updates=[(s_filters, gfilters),
                                          (s_signals, gsignals)],
-                                 mode=mode,
                                 name='grad')
    check_diagonal_subtensor_view_traces(gnewconv3d)
@@ -163,7 +149,7 @@ def check_conv3d(border_mode, mode=mode_without_gpu, shared=theano.tensor._share
    signals = numpy.random.rand(Ns, Ts, C, Hs, Ws).astype('float32')
    filters = numpy.random.rand(Nf, Tf, C, Hf, Wf).astype('float32')
    utt.verify_grad(lambda s, f: conv3d(s, f, border_mode=border_mode),
-                    [signals, filters], eps=1e-1, mode=mode)
+                    [signals, filters], eps=1e-1)
    # Additional Test that covers the case of patched implementation for filter with Tf=1
    Ns, Ts, C, Hs, Ws = 3, 10, 3, 32, 32
@@ -186,8 +172,7 @@ def check_conv3d(border_mode, mode=mode_without_gpu, shared=theano.tensor._share
                 border_mode=border_mode)
    newconv3d = theano.function([], [],
-                                updates={s_output: out},
+                                updates={s_output: out})
-                                mode=mode)
    t0 = time.time()
    newconv3d()
@@ -197,7 +182,6 @@ def check_conv3d(border_mode, mode=mode_without_gpu, shared=theano.tensor._share
    gnewconv3d = theano.function([], [],
                                 updates=[(s_filters, gfilters),
                                          (s_signals, gsignals)],
-                                 mode=mode,
                                 name='grad')
    t0 = time.time()
@@ -210,4 +194,4 @@ def check_conv3d(border_mode, mode=mode_without_gpu, shared=theano.tensor._share
    signals = numpy.random.rand(Ns, Ts, C, Hs, Ws).astype('float32')
    filters = numpy.random.rand(Nf, Tf, C, Hf, Wf).astype('float32')
    utt.verify_grad(lambda s, f: conv3d(s, f, border_mode=border_mode),
-                    [signals, filters], eps=1e-1, mode=mode)
+                    [signals, filters], eps=1e-1)
--- a/theano/tensor/subtensor.py
+++ b/theano/tensor/subtensor.py
@@ -640,7 +640,7 @@ class Subtensor(Op):
                      strides_mul=None):
        """
        The parameters c_prefix are there to allow reusing this
-        function on PyArray and CudaNdarray object.
+        function on PyArray and GpuArray object.
        This fct take as input the x.
@@ -1373,7 +1373,7 @@ class IncSubtensor(Op):
        # but subclasses may override the helper methods
        # to change the particulars, e.g. GpuIncSubtensor
        # turns the view/copy operations on numpy arrays
-        # into the same operations on cuda arrays.
+        # into the same operations on gpu arrays.
        self.do_type_checking(node)

--- a/theano/tensor/tests/shape_opt_cycle.pkl
+++ b/theano/tensor/tests/shape_opt_cycle.pkl
--- a/theano/tensor/tests/test_opt.py
+++ b/theano/tensor/tests/test_opt.py
@@ -2,8 +2,6 @@ from __future__ import absolute_import, print_function, division
 import copy
 import logging
-import os
-import sys
 import time
 import unittest
@@ -14,7 +12,7 @@ from nose.tools import assert_raises, assert_true
 import theano
 import theano.scalar as scal
-from six import PY3, StringIO
+from six import StringIO
 from theano import compile
 from theano.compile import deep_copy_op, DeepCopyOp
 from theano.compile import get_mode
@@ -909,7 +907,10 @@ def test_const_type_in_mul_canonizer():
 class test_fusion(unittest.TestCase):
-    def do(self, mode, shared_fn, shp, gpu=False, nb_repeat=1, assert_len_topo=True, slice=None):
+    # Compilation mode and shared-variable constructor used by the tests;
+    # subclasses override these to run the same tests on another backend.
+    mode = copy.copy(compile.mode.get_default_mode())
+    # `staticmethod` prevents `self._shared` from becoming a bound method,
+    # which would pass the test instance as the constructor's first argument.
+    _shared = staticmethod(shared)
+    def do(self, mode, shared_fn, shp, nb_repeat=1, assert_len_topo=True, slice=None):
        """
        param shared_fn: if None, will use compile.function
        verify that the elemwise fusion work
@@ -1103,14 +1104,9 @@ class test_fusion(unittest.TestCase):
                 nb_elemwise, answer, out_dtype] in enumerate(cases):
            if isinstance(out_dtype, dict):
                out_dtype = out_dtype[config.cast_policy]
-            if (gpu and (out_dtype != 'float32' or
-                         any(i.dtype != 'float32' for i in g.owner.inputs))):
-                print("Skip test %d as the gpu code currently supports only float32" % id)
-                continue
            print("new cases", id)
            if shared_fn is None:
-                assert gpu is False
                f = compile.function(list(sym_inputs), g, mode=mode)
                for x in xrange(nb_repeat):
                    out = f(*val_inputs)
@@ -1139,17 +1135,7 @@ class test_fusion(unittest.TestCase):
                print(out)
                print(answer * nb_repeat)
            topo = f.maker.fgraph.toposort()
-            if gpu:
+            topo_ = topo
-                import theano.sandbox.cuda as cuda
-                topo_ = [x for x in topo if not isinstance(
-                    x.op, (cuda.basic_ops.GpuFromHost, cuda.basic_ops.HostFromGpu))]
-                gpu_ = [x for x in topo
-                        if isinstance(x.op, cuda.basic_ops.GpuFromHost)]
-                if not len(gpu_) == len(sym_inputs):
-                    fail2.append((id, gpu_, sym_inputs))
-            else:
-                topo_ = topo
            if assert_len_topo:
                if not len(topo_) == nb_elemwise:
                    fail3.append((id, topo_, nb_elemwise))
@@ -1177,62 +1163,24 @@ class test_fusion(unittest.TestCase):
    def test_elemwise_fusion(self):
        shp = (5, 5)
-        mode = copy.copy(compile.mode.get_default_mode())
+        mode = copy.copy(self.mode)
        # we need the optimisation enabled and the canonicalize.
        # the canonicalize is needed to merge multiplication/addition by constant.
        mode._optimizer = mode._optimizer.including(
            'local_elemwise_fusion', 'composite_elemwise_fusion',
            'canonicalize')
-        self.do(mode, shared, shp)
+        self.do(mode, self._shared, shp)
    @attr('slow')
    def test_elemwise_fusion_4d(self):
        shp = (3, 3, 3, 3)
-        mode = copy.copy(compile.mode.get_default_mode())
+        mode = copy.copy(self.mode)
        # we need the optimisation enabled and the canonicalize.
        # the canonicalize is needed to merge multiplication/addition by constant.
        mode._optimizer = mode._optimizer.including(
            'local_elemwise_fusion', 'composite_elemwise_fusion',
            'canonicalize')
-        self.do(mode, shared, shp)
+        self.do(mode, self._shared, shp)
-    def test_gpu_fusion(self):
-        shp = (5, 5)
-        # we need the optimisation enabled, debug do this.
-        if theano.config.mode == "FAST_COMPILE":
-            mode = theano.compile.mode.get_mode("FAST_RUN").including(
-                'local_elemwise_fusion', 'composite_elemwise_fusion',
-                'canonicalize', 'gpu')
-        else:
-            mode = theano.compile.mode.get_default_mode().including(
-                'local_elemwise_fusion', 'composite_elemwise_fusion',
-                'canonicalize', 'gpu')
-        import theano.sandbox.cuda as cuda
-        if not cuda.cuda_available:
-            raise SkipTest("cuda not available")
-        self.do(mode, cuda.float32_shared_constructor, shp, gpu=True)
-    @attr('slow')
-    def test_gpu_fusion_Xd(self):
-        # we need the optimisation enabled, debug do this.
-        if theano.config.mode == "FAST_COMPILE":
-            mode = theano.compile.mode.get_mode("FAST_RUN").including(
-                'local_elemwise_fusion', 'composite_elemwise_fusion',
-                'canonicalize', 'gpu')
-        else:
-            mode = theano.compile.mode.get_default_mode().including(
-                'local_elemwise_fusion', 'composite_elemwise_fusion',
-                'canonicalize', 'gpu')
-        import theano.sandbox.cuda as cuda
-        if not cuda.cuda_available:
-            raise SkipTest("cuda not available")
-        sizes = cuda.opt.get_device_type_sizes()
-        if sizes['int_size'] == 4:
-            shp = (5, 5, 5, 5)
-        else:
-            shp = (5, 5, 5)
-        self.do(mode, cuda.float32_shared_constructor, shp, gpu=True)
    def test_fusion_35inputs(self):
        # Make sure a fused graph with more than 35 inputs does not segfault
@@ -1244,7 +1192,7 @@ class test_fusion(unittest.TestCase):
        for idx in xrange(1, 35):
            out = tensor.sin(inpts[idx] + out)
-        f = function(inpts, out)
+        f = function(inpts, out, mode=self.mode)
        # Test it on some dummy values
        f(*[list(range(i, 4 + i)) for i in xrange(35)])
@@ -1280,7 +1228,7 @@ class test_fusion(unittest.TestCase):
        dlogp = function(vars, [theano.grad(logp, v) for v in vars])
        dlogp(2, np.random.rand(n))
-    def speed_fusion(self, shared_fn=shared, gpu=False, s=None):
+    def speed_fusion(self, s=None):
        """
        param type s: a slice object
        param s: a slice to apply to the case to execute. If None, exec all case.
@@ -1292,18 +1240,18 @@ class test_fusion(unittest.TestCase):
        # linker=gof.CLinker
        # linker=gof.OpWiseCLinker
-        mode1 = copy.copy(compile.get_default_mode())
+        mode1 = copy.copy(self.mode)
        mode1._optimizer = mode1._optimizer.including('local_elemwise_fusion')
        # TODO:clinker is much faster... but use to much memory
        # Possible cause: as their is do deletion of intermediate value when we don't keep the fct.
        # More plausible cause: we keep a link to the output data?
        # Follow up. Clinker do the same... second cause?
-        mode2 = copy.copy(compile.get_default_mode())
+        mode2 = copy.copy(self.mode)
        mode2._optimizer = mode2._optimizer.excluding('local_elemwise_fusion')
        print("test with linker", str(mode1.linker))
-        times1 = self.do(mode1, shared_fn, shp, gpu=gpu, nb_repeat=nb_repeat,
+        times1 = self.do(mode1, self._shared, shp, nb_repeat=nb_repeat,
                         assert_len_topo=False, slice=s)
-        times2 = self.do(mode2, shared_fn, shp, gpu=gpu, nb_repeat=nb_repeat,
+        times2 = self.do(mode2, self._shared, shp, nb_repeat=nb_repeat,
                         assert_len_topo=False, slice=s)
        print("times1 with local_elemwise_fusion")
        print(times1, times1.min(), times1.max(), times1.sum())
@@ -1317,7 +1265,7 @@ class test_fusion(unittest.TestCase):
              "mean", d.mean(), "std", d.std())
    def test_fusion_inplace(self):
-        mode = copy.copy(compile.mode.get_default_mode())
+        mode = copy.copy(self.mode)
        # we need the optimisation enabled and the canonicalize.
        # the canonicalize is needed to merge multiplication/addition by constant.
        mode._optimizer = mode._optimizer.including(
@@ -1332,14 +1280,9 @@ class test_fusion(unittest.TestCase):
        f(np.random.random((5, 5)), np.random.random((5, 5)),
            np.random.random((5, 5)))
-    def speed_fusion_gpu(self):
-        import theano.sandbox.cuda as cuda
-        self.speed_fusion(shared_fn=cuda.float32_shared_constructor,
-                          gpu=True, s=slice(0, 15))
    def speed_log_exp(self):
        s = slice(31, 36)
-        print("time", self.do(None, shared, shp=(1000, 1000), gpu=False,
+        print("time", self.do(self.mode, self._shared, shp=(1000, 1000),
                              assert_len_topo=False, slice=s, nb_repeat=100))
    def tes_memory_leak(self, mode=compile.mode.Mode('c', 'merge'),
@@ -1505,27 +1448,6 @@ class TestCompositeCodegen(unittest.TestCase):
        fval = f([1, 2, 3])
        assert np.all(fval == [6, 12, 18])
-    def test_nested_gpu(self):
-        import theano.sandbox.cuda as cuda
-        if not cuda.cuda_available:
-            raise SkipTest("cuda not available")
-        import theano.sandbox.cuda.opt
-        y = self.times_2(self.x)
-        z = self.times_3(y)
-        f = theano.function(
-            [self.x], cuda.gpu_from_host(z),
-            mode=theano.compile.mode.get_default_mode().including('gpu'))
-        topo = f.maker.fgraph.toposort()
-        if config.mode != "FAST_COMPILE":
-            assert len(topo) == 2
-            assert topo[1].op == cuda.gpu_from_host
-        # topo1 is doing the composite work on the CPU. Auto-generation of
-        # GPU code for ops with support code is not possible.
-        fval = np.asarray(f([1, 2, 3]))
-        assert np.all(fval == [6, 12, 18]), fval
    def test_local_useless_composite(self):
        x = theano.scalar.float32()
        c = theano.scalar.Composite([x], [x + 1, x - 1])
@@ -4213,31 +4135,6 @@ class test_shapeoptimizer(unittest.TestCase):
        f = theano.function([X], expr, mode=mode)
        print(f([[1, 2], [2, 3]]))
-    def test_no_cycle(self):
-        # Optimizing this graph resulted in a cycle, see gh-1549
-        # This test depends on cuda
-        import theano.sandbox.cuda as cuda
-        if not cuda.cuda_available:
-            raise SkipTest("cuda not available")
-        if sys.version_info[:2] < (2, 5):
-            raise SkipTest("Test skipped due to a too old python")
-        # This pickle file has undergone manual surgery due to changes
-        # in scan and may or may not run correctly.  It does passes
-        # the test below.
-        pkl_filename = os.path.join(os.path.dirname(theano.__file__),
-                                    'tensor', 'tests', 'shape_opt_cycle.pkl')
-        # Due to incompatibilities between python 2 and 3 in the format
-        # of pickled numpy ndarray, we have to force an encoding
-        from theano.misc.pkl_utils import CompatUnpickler
-        with open(pkl_filename, "rb") as pkl_file:
-            if PY3:
-                u = CompatUnpickler(pkl_file, encoding="latin1")
-            else:
-                u = CompatUnpickler(pkl_file)
-            fn_args = u.load()
-            theano.function(**fn_args)
 class test_assert(utt.InferShapeTester):

--- a/theano/tensor/tests/test_sharedvar.py
+++ b/theano/tensor/tests/test_sharedvar.py
@@ -25,7 +25,6 @@ def makeSharedTester(shared_constructor_,
                     theano_fct_,
                     ref_fct_,
                     cast_value_=np.asarray,
-                     op_by_matrix_=False,
                     name=None,
                     ):
    """
@@ -49,7 +48,6 @@ def makeSharedTester(shared_constructor_,
    :param theano_fct_: A theano op that will be used to do some computation on the shared variable
    :param ref_fct_: A reference function that should return the same value as the theano_fct_
    :param cast_value_: A callable that cast an ndarray into the internal shared variable representation
-    :param op_by_matrix_: When we do inplace operation on the an internal type object, should we do it with a scalar or a matrix of the same value.
    :param name: This string is used to set the returned class' __name__
                 attribute. This is needed for nosetests to properly tag the
                 test with its correct name, rather than use the generic
@@ -75,7 +73,6 @@ def makeSharedTester(shared_constructor_,
        set_cast_value_inplace = set_cast_value_inplace_
        shared_constructor_accept_ndarray = shared_constructor_accept_ndarray_
        cast_value = staticmethod(cast_value_)
-        op_by_matrix = op_by_matrix_
        def test_shared_dont_alias(self):
            dtype = self.dtype
@@ -96,11 +93,7 @@ def makeSharedTester(shared_constructor_,
            assert np.allclose(self.ref_fct(x), total_val)
-            values_to_div = .5
+            x /= .5
-            if self.op_by_matrix:
-                values_to_div = self.internal_type(np.ones(x.shape, dtype=dtype)/2)  # supported for cudandarray, but not ndarray.
-                assert self.test_internal_type(values_to_div)
-            x /= values_to_div
            total_val_2 = total_func()
            # value used to construct should not alias with internal
@@ -108,7 +101,7 @@ def makeSharedTester(shared_constructor_,
            x = x_shared.get_value(borrow=False)
-            x /= values_to_div
+            x /= .5
            total_val_3 = total_func()
@@ -117,7 +110,7 @@ def makeSharedTester(shared_constructor_,
            # in this case we can alias
            x = x_shared.get_value(borrow=True)
-            x /= values_to_div
+            x /= .5
            # this is not required by the contract but it is a feature we've
            # implemented for some type of SharedVariable.
@@ -189,12 +182,7 @@ def makeSharedTester(shared_constructor_,
            x = x_shared.get_value(borrow=True, return_internal_type=True)
            assert self.test_internal_type(x)
-            values_to_div = .5
+            x /= .5
-            if self.op_by_matrix:
-                # supported for cudandarray, but not ndarray.
-                values_to_div = self.internal_type(
-                    np.ones(x.shape, dtype=dtype)/2)
-            x /= values_to_div  # supported by ndarray and CudaNdarray
            # this is not required by the contract but it is a feature we can
            # implement for some type of SharedVariable.
@@ -203,7 +191,7 @@ def makeSharedTester(shared_constructor_,
            x = x_shared.get_value(borrow=False, return_internal_type=True)
            assert self.test_internal_type(x)
            assert x is not x_shared.container.value
-            x /= values_to_div  # supported by ndarray and CudaNdarray
+            x /= .5
            # this is required by the contract
            assert not np.allclose(self.ref_fct(x), total_func())
@@ -244,16 +232,10 @@ def makeSharedTester(shared_constructor_,
            total_func = theano.function([], total)
            total_func()
-            values_to_div = .5
-            if self.op_by_matrix:
-                # supported for cudandarray, but not ndarray.
-                values_to_div = self.internal_type(np.ones(x.shape, dtype=dtype)/2)
-                assert self.test_internal_type(values_to_div)
            # test if that theano shared variable optimize set_value(borrow=True)
            get_x = x_shared.get_value(borrow=True)
            assert get_x is not x_orig  # borrow=False to shared_constructor
-            get_x /= values_to_div
+            get_x /= .5
            x_shared.set_value(get_x, borrow=True)
            x = x_shared.get_value(borrow=True)
            if self.set_value_borrow_true_alias:
@@ -267,7 +249,7 @@ def makeSharedTester(shared_constructor_,
            assert get_x is not x_orig  # borrow=False to shared_constructor
            assert self.test_internal_type(get_x)
-            get_x /= values_to_div  # supported by ndarray and CudaNdarray
+            get_x /= .5
            assert self.test_internal_type(get_x)
            x_shared.set_value(get_x, borrow=True)
            x = x_shared.get_value(borrow=True, return_internal_type=True)
@@ -295,12 +277,7 @@ def makeSharedTester(shared_constructor_,
            assert np.allclose(self.ref_fct(x), total_val)
-            values_to_div = .5
+            x /= .5
-            if self.op_by_matrix:
-                # supported for cudandarray, but not ndarray.
-                values_to_div = self.internal_type(np.ones(x.shape, dtype=dtype)/2)
-                assert self.test_internal_type(values_to_div)
-            x /= values_to_div
            # not required by the contract but it is a feature we've implemented
            if self.shared_borrow_true_alias:
@@ -345,9 +322,9 @@ def makeSharedTester(shared_constructor_,
            if x.__class__.__name__ != 'csr_matrix':
                # sparse matrix don't support inplace affectation
                nd += 1
-                # THIS DON't DO WHAT WE EXPECT the contain of a is not updated for CudaNdarray, but it is for ndarray
+                # THIS DOESN'T DO WHAT WE EXPECT: the content of a is
+                # not updated for GpuArray, but it is for ndarray
                x_shared.get_value(borrow=True)[:] = nd
-                #assert (np.asarray(x_shared.get_value(borrow=True))!=nd).all()
                assert may_share_memory(old_data, x_shared.container.storage[0])
                x_shared.get_value(borrow=True)
@@ -617,7 +594,6 @@ test_shared_options = makeSharedTester(
    theano_fct_=lambda a: a*2,
    ref_fct_=lambda a: np.asarray((a*2)),
    cast_value_=np.asarray,
-    op_by_matrix_=False,
    name='test_shared_options')

--- a/theano/tensor/type.py
+++ b/theano/tensor/type.py
@@ -203,10 +203,9 @@ class TensorType(Type):
        """
        Convert a symbolic Variable into a TensorType, if compatible.
-        For the moment, only a TensorType, GpuArrayType and
+        For the moment, only a TensorType and GpuArrayType will be
-        CudaNdarrayType will be
+        converted, provided they have the same number of dimensions
-        converted, provided they have the same number of dimensions and
+        and dtype and have "compatible" broadcastable pattern.
-        dtype and have "compatible" broadcastable pattern.
        """
        if hasattr(other, '_as_TensorVariable'):