Commit adc97e87 authored by abergeron

Merge pull request #2247 from nouiz/opt

Opt
......@@ -72,13 +72,33 @@ and use directly the optimized graph from the pickled file.
Faster Theano function
----------------------
You can set the Theano flag `allow_gc` to `False` to get a speed-up by using
You can set the Theano flag ``allow_gc`` to ``False`` to get a speed-up by using
more memory. By default, Theano frees intermediate results as soon as
they are no longer needed, which prevents that memory from being
reused. Disabling garbage collection keeps the memory of all
intermediate results so it can be reused during the next call to the
same Theano function, provided the results have the correct shape. The
shape can change if the shapes of the inputs change.
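A minimal sketch of enabling this, assuming the flag is passed through the ``THEANO_FLAGS`` environment variable (which must be set before ``import theano``; the flag can also live in ``~/.theanorc``):

```python
import os

# Disable Theano's garbage collection of intermediate results.
# THEANO_FLAGS must be set before `import theano`.
os.environ["THEANO_FLAGS"] = "allow_gc=False"

# After importing theano, theano.config.allow_gc would then be False
# and intermediate buffers are kept between calls for reuse.
assert "allow_gc=False" in os.environ["THEANO_FLAGS"]
```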
.. _unsafe_optimization:
Unsafe optimization
===================
Some Theano optimizations assume that the user inputs are
valid. This means that if the user provides invalid values (such as
incompatible shapes or out-of-bounds indexing values) and
those optimizations are applied, the user's error will get lost. Most of
the time the inputs are indeed valid, so it is good to apply the
optimization, but losing the error is bad.
Newer optimizations in Theano that make such assumptions add an
assertion in the graph to preserve the user's error message. Computing
these assertions can take some time. If you are sure everything is valid
in your graph and want the fastest possible Theano, you can enable an
optimization that removes those assertions with:
``optimizer_including=local_remove_all_assert``
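A small sketch of composing this flag with others; the flag names come from the text above, and the comma-joined key=value form is the standard ``THEANO_FLAGS`` syntax:

```python
import os

# THEANO_FLAGS is a comma-separated list of key=value pairs; it must be
# set before `import theano`.
flags = {
    "optimizer_including": "local_remove_all_assert",  # drop safety asserts
    "allow_gc": "False",  # from the section above; optional
}
os.environ["THEANO_FLAGS"] = ",".join(
    "%s=%s" % (k, v) for k, v in sorted(flags.items()))
print(os.environ["THEANO_FLAGS"])
# allow_gc=False,optimizer_including=local_remove_all_assert
```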
Faster Small Theano function
----------------------------
......
......@@ -460,7 +460,9 @@ import theano and print the config variable, as in:
Default: '-lblas'
Link arguments to link against a (Fortran) level-3 blas implementation.
Link arguments to link against a (Fortran) level-3 blas
implementation. The default will test whether '-lblas' works. If not,
we will disable our C code for BLAS.
.. attribute:: config.experimental.local_alloc_elemwise_assert
......
......@@ -51,13 +51,13 @@ Convolution Ops
===============
.. automodule:: theano.sandbox.cuda.dnn
:members: GpuDnnConvDesc, GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI,
:members: GpuDnnConvDesc, GpuDnnConv, GpuDnnConvGradW, GpuDnnConvGradI
Pooling Ops
===========
.. automodule:: theano.sandbox.cuda.dnn
:members: GpuDnnPoolDesc, GpuDnnPool, GpuDnnPoolGrad,
:members: GpuDnnPoolDesc, GpuDnnPool, GpuDnnPoolGrad
Softmax Ops
===========
......
......@@ -164,8 +164,6 @@ TODO: Give examples on how to use these things! They are pretty complicated.
.. autofunction:: theano.tensor.nnet.conv.conv2d
.. autofunction:: theano.sandbox.cuda.fftconv.conv2d_fft
.. autofunction:: theano.sandbox.cuda.blas.GpuCorrMM
.. autofunction:: theano.sandbox.cuda.dnn.dnn_conv
.. autofunction:: theano.tensor.nnet.Conv3D.conv3D
.. autofunction:: theano.sandbox.cuda.fftconv.conv3d_fft
.. autofunction:: theano.tensor.nnet.conv3d2d.conv3d
......@@ -12,4 +12,4 @@ Proposals for new/revised features
noupdates
opt_patterns2
graphical_models
complex_gradient
......@@ -117,6 +117,7 @@ An op has to implement some methods defined in the the interface of
:func:`perform` method defines the Python implementation of an op.
It takes several arguments:
- ``node`` is a reference to an Apply node which was previously
obtained via the ``Op``'s :func:`make_node` method. It is typically not
used in simple ops, but it contains symbolic information that
......@@ -149,6 +150,7 @@ An op has to implement some methods defined in the the interface of
It returns a thunk. A thunk is defined as a zero-arguments
function which encapsulates the computation to be performed by an
op on the arguments of its corresponding node. It takes several parameters:
- ``node`` is the Apply instance for which a thunk is requested,
- ``storage_map`` is a dict of lists which maps variables to a one-element
lists holding the variable's current value. The one-element list acts as
......
......@@ -2,6 +2,7 @@ import atexit
import copy
import os
import time
import warnings
import theano
from theano.gof.link import WrapLinker
......@@ -98,6 +99,10 @@ class Profile_Maker(FunctionMaker):
# Lazy import to avoid compilation when importing theano.
from theano.gof.cutils import run_cthunk
warnings.warn(
"DEPRECATION WARNING: The ProfileMode is deprecated. Use the Theano"
" flags/parameter to theano.function 'profile=True' instead"
" of 'mode=ProfileMode'")
return ret
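The deprecation path above relies on Python's standard ``warnings`` machinery; a self-contained sketch of how such a warning surfaces (and can be caught in tests):

```python
import warnings

def use_profile_mode():
    # Same message as the hunk above; warnings.warn defaults to
    # UserWarning when no category is given.
    warnings.warn(
        "DEPRECATION WARNING: The ProfileMode is deprecated. Use the Theano"
        " flags/parameter to theano.function 'profile=True' instead"
        " of 'mode=ProfileMode'")

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    use_profile_mode()

assert len(caught) == 1
assert "ProfileMode" in str(caught[0].message)
```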
......
......@@ -951,12 +951,12 @@ CLazyLinker_set_allow_gc(CLazyLinker *self, PyObject *value, void *closure)
}
static PyGetSetDef CLazyLinker_getset[] = {
{"allow_gc",
(getter)CLazyLinker_get_allow_gc,
(setter)CLazyLinker_set_allow_gc,
"do this function support allow_gc",
NULL},
{NULL, NULL, NULL, NULL} /* Sentinel */
{(char*)"allow_gc",
(getter)CLazyLinker_get_allow_gc,
(setter)CLazyLinker_set_allow_gc,
(char*)"do this function support allow_gc",
NULL},
{NULL, NULL, NULL, NULL} /* Sentinel */
};
static PyMemberDef CLazyLinker_members[] = {
{(char*)"nodes", T_OBJECT_EX, offsetof(CLazyLinker, nodes), 0,
......
......@@ -32,7 +32,21 @@ class DB(object):
self.name = None # will be reset by register
#(via obj.name by the thing doing the registering)
def register(self, name, obj, *tags):
def register(self, name, obj, *tags, **kwargs):
"""
:param name: the name of the optimizer.
:param obj: the optimizer to register.
:param tags: tag names that allow selecting the optimizer.
:param kwargs: if non-empty, it must contain only
    use_db_name_as_tag=False.
    By default, all optimizations registered in an EquilibriumDB
    are selected when the EquilibriumDB name is used as a
    tag. We do not want this behavior for some optimizers, such as
    local_remove_all_assert. Passing use_db_name_as_tag=False removes
    that behavior, so only the optimizer name and the explicitly
    specified tags will enable that optimization.
"""
# N.B. obj is not an instance of class Optimizer.
# It is an instance of a DB. In the tests, for example,
# this is not always the case.
......@@ -42,9 +56,12 @@ class DB(object):
raise ValueError('The name of the object cannot be an existing'
' tag or the name of another existing object.',
obj, name)
if self.name is not None:
tags = tags + (self.name,)
if kwargs:
assert "use_db_name_as_tag" in kwargs
assert kwargs["use_db_name_as_tag"] is False
else:
if self.name is not None:
tags = tags + (self.name,)
obj.name = name
# This restriction is there because in many place we suppose that
# something in the DB is there only once.
......@@ -155,6 +172,10 @@ class Query(object):
if isinstance(self.exclude, (list, tuple)):
self.exclude = OrderedSet(self.exclude)
def __str__(self):
return "Query{inc=%s,ex=%s,require=%s,subquery=%s,position_cutoff=%d}" % (
self.include, self.exclude, self.require, self.subquery, self.position_cutoff)
# add all optimizations with this tag
def including(self, *tags):
return Query(self.include.union(tags),
......
......@@ -728,9 +728,8 @@ class VM_Linker(link.LocalLinker):
if self.use_cloop and config.profile_memory:
warnings.warn(
'CVM does not support memory profile, using Stack VM.')
deps = None
if self.allow_gc:
deps = self.compute_gc_dependencies(storage_map)
# Needed when allow_gc=True and profiling
deps = self.compute_gc_dependencies(storage_map)
vm = Stack(
nodes, thunks, pre_call_clear,
storage_map, compute_map,
......@@ -765,13 +764,11 @@ class VM_Linker(link.LocalLinker):
assert type(storage_map_list[0]) is list
assert type(compute_map_list[0]) is list
if self.allow_gc:
dependency_map = self.compute_gc_dependencies(storage_map)
dependency_map_list = [
[vars_idx[d] for d in dependency_map[vars_idx_inv[i]]]
for i in xrange(len(vars_idx_inv))]
else:
dependency_map_list = None
# Needed when allow_gc=True and profiling
dependency_map = self.compute_gc_dependencies(storage_map)
dependency_map_list = [
[vars_idx[d] for d in dependency_map[vars_idx_inv[i]]]
for i in xrange(len(vars_idx_inv))]
# build the pointers to node inputs and offsets
base_input_output_list = []
......@@ -869,9 +866,8 @@ class VM_Linker(link.LocalLinker):
thunks,
pre_call_clear)
else:
deps = None
if self.allow_gc:
deps = self.compute_gc_dependencies(storage_map)
# Needed when allow_gc=True and profiling
deps = self.compute_gc_dependencies(storage_map)
vm = Stack(
nodes, thunks, pre_call_clear,
storage_map, compute_map,
......
......@@ -561,6 +561,7 @@ class GpuCAReduce(GpuOp):
self.pre_scalar_op = None
def make_node(self, x):
x = as_cuda_ndarray_variable(x)
if (x.type.ndim != len(self.reduce_mask)):
raise TypeError("x must have rank %i" % len(self.reduce_mask))
o_broadcast = [x.type.broadcastable[i] for i
......
......@@ -801,6 +801,26 @@ class BaseGpuCorrMM(GpuOp):
class GpuCorrMM(BaseGpuCorrMM):
"""GPU correlation implementation using Matrix Multiplication.
:param border_mode: currently supports "valid" only; "full" can be
simulated by setting `pad="full"` (at the cost of performance), or
by using `GpuCorrMM_gradInputs`
:param subsample: the subsample operation applied to each output image.
Should be a tuple with 2 elements.
`(sv, sh)` is equivalent to `GpuCorrMM(...)(...)[:,:,::sv, ::sh]`,
but faster.
Set to `(1, 1)` to disable subsampling.
:param pad: the width of a border of implicit zeros to pad the input
image with. Should be a tuple with 2 elements giving the numbers of
rows and columns to pad on each side, or "half" to set the padding
to `(kernel_rows // 2, kernel_columns // 2)`, or "full" to set the
padding to `(kernel_rows - 1, kernel_columns - 1)` at runtime.
Set to `(0, 0)` to disable padding.
:note: Currently, the Op requires the inputs, filters and outputs to be
C-contiguous. Use :func:`gpu_contiguous
<theano.sandbox.cuda.basic_ops.gpu_contiguous>` on these arguments
if needed.
:note: You can either enable the Theano flag `optimizer_including=conv_gemm`
to automatically replace all convolution operations with `GpuCorrMM`
or one of its gradients, or you can use it as a replacement for
......@@ -819,29 +839,8 @@ class GpuCorrMM(BaseGpuCorrMM):
batchsize or number of filters) may also work around the CUBLAS bug.
"""
def __init__(self, border_mode="valid",
subsample=(1, 1),
pad=(0, 0)):
"""
:param border_mode: currently supports "valid" only; "full" can be
simulated by setting `pad="full"` (at the cost of performance), or
by using `GpuCorrMM_gradInputs`
:param subsample: the subsample operation applied to each output image.
Should be a tuple with 2 elements.
`(sv, sh)` is equivalent to `GpuCorrMM(...)(...)[:,:,::sv, ::sh]`,
but faster.
Set to `(1, 1)` to disable subsampling.
:param pad: the width of a border of implicit zeros to pad the input
image with. Should be a tuple with 2 elements giving the numbers of
rows and columns to pad on each side, or "half" to set the padding
to `(kernel_rows // 2, kernel_columns // 2)`, or "full" to set the
padding to `(kernel_rows - 1, kernel_columns - 1)` at runtime.
Set to `(0, 0)` to disable padding.
:note: Currently, the Op requires the inputs, filters and outputs to be
C-contiguous. Use :func:`gpu_contiguous
<theano.sandbox.cuda.basic_ops.gpu_contiguous>` on these arguments
if needed.
"""
subsample=(1, 1),
pad=(0, 0)):
super(GpuCorrMM, self).__init__(border_mode, subsample, pad)
def make_node(self, img, kern):
......
......@@ -7,8 +7,7 @@ from theano.gof.type import CDataType
from theano.compat import PY3
from theano.tensor.nnet import SoftmaxGrad
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import (GpuOp, cuda_available, active_device_number,
device_properties)
from theano.sandbox.cuda import (GpuOp, cuda_available)
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
gpu_contiguous, HostFromGpu)
from theano.sandbox.cuda.blas import (GpuConv, GpuDownsampleFactorMax,
......@@ -21,8 +20,8 @@ from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
def dnn_available():
if dnn_available.avail is None:
dev = active_device_number()
if device_properties(dev)['major'] < 3:
dev = theano.sandbox.cuda.active_device_number()
if theano.sandbox.cuda.device_properties(dev)['major'] < 3:
dnn_available.msg = "Device not supported by cuDNN"
dnn_available.avail = False
else:
......@@ -295,9 +294,9 @@ if ((err%(id)d = cudnnCreateFilterDescriptor(&kerns%(id)d)) != CUDNN_STATUS_SUCC
def c_cleanup_code_struct(self, node, struct_id):
return """
cudnnDestroyTensor4dDescriptor(input%(id)d);
cudnnDestroyTensor4dDescriptor(output%(id)d);
cudnnDestroyFilterDescriptor(kerns%(id)d);
if (input%(id)d != NULL) {cudnnDestroyTensor4dDescriptor(input%(id)d);}
if (output%(id)d != NULL) {cudnnDestroyTensor4dDescriptor(output%(id)d);}
if (kerns%(id)d != NULL) {cudnnDestroyFilterDescriptor(kerns%(id)d);}
""" % dict(id=struct_id)
def c_set_filter(self, var, desc, err, fail):
......@@ -400,7 +399,7 @@ if (err%(name)s != CUDNN_STATUS_SUCCESS) {
method=self.conv_op, path=self.path_flag)
def c_code_cache_version(self):
return (7,)
return (8,)
class GpuDnnConv(GpuDnnConvBase):
......
......@@ -48,6 +48,12 @@ class ScikitsCudaOp(GpuOp):
return theano.Apply(self, [inp], [self.output_type(inp)()])
def make_thunk(self, node, storage_map, _, _2):
if not scikits_cuda_available:
raise RuntimeError(
"scikits.cuda is needed for all GPU fft implementation,"
" including fftconv.")
class CuFFTOp(ScikitsCudaOp):
def output_type(self, inp):
......@@ -56,6 +62,8 @@ class CuFFTOp(ScikitsCudaOp):
broadcastable=[False] * (inp.type.ndim + 1))
def make_thunk(self, node, storage_map, _, _2):
super(CuFFTOp, self).make_thunk(node, storage_map, _, _2)
from theano.misc.pycuda_utils import to_gpuarray
inputs = [storage_map[v] for v in node.inputs]
outputs = [storage_map[v] for v in node.outputs]
......@@ -111,6 +119,8 @@ class CuIFFTOp(ScikitsCudaOp):
broadcastable=[False] * (inp.type.ndim - 1))
def make_thunk(self, node, storage_map, _, _2):
super(CuIFFTOp, self).make_thunk(node, storage_map, _, _2)
from theano.misc.pycuda_utils import to_gpuarray
inputs = [storage_map[v] for v in node.inputs]
outputs = [storage_map[v] for v in node.outputs]
......@@ -300,6 +310,8 @@ class BatchedComplexDotOp(ScikitsCudaOp):
return CudaNdarrayType(broadcastable=[False] * inp.type.ndim)
def make_thunk(self, node, storage_map, _, _2):
super(BatchedComplexDotOp, self).make_thunk(node, storage_map, _, _2)
inputs = [storage_map[v] for v in node.inputs]
outputs = [storage_map[v] for v in node.outputs]
......
......@@ -14,11 +14,13 @@ if cuda.cuda_available == False:
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
mode_without_gpu = theano.compile.mode.get_mode(
'FAST_RUN').excluding('gpu')
# We should not exclude the 'gpu' tag, as some CPU optimizations are
# tagged as GPU so that they run in fast_compile with the GPU.
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN')
else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu')
mode_without_gpu = theano.compile.mode.get_default_mode()
def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
......
......@@ -1171,14 +1171,22 @@ class ShapeFeature(object):
self.set_shape_i(v, ii, new_r)
self.shape_of_reverse_index[r] = set()
def same_shape(self, x, y):
def same_shape(self, x, y, dim_x=None, dim_y=None):
"""Return True if we are able to assert that x and y have the
same shape
same shape.
dim_x and dim_y are optional. If given, each is a dimension index,
and only that single dimension of x or y is compared.
"""
sx = self.shape_of[x]
sy = self.shape_of[y]
if sx is None or sy is None:
return False
if dim_x is not None:
sx = [sx[dim_x]]
if dim_y is not None:
sy = [sy[dim_y]]
assert len(sx) == len(sy)
for dx, dy in zip(sx, sy):
......@@ -1449,6 +1457,29 @@ def local_alloc_unary(node):
return [T.alloc(T.cast(v, node.outputs[0].dtype), *shp)]
@register_canonicalize
@register_specialize
@gof.local_optimizer([T.Elemwise])
def local_cast_cast(node):
"""cast(cast(x, dtype1), dtype2)
when those contrain:
dtype1 == dtype2
TODO: the base dtype is the same (int, uint, float, complex)
and the first cast cause an upcast.
"""
if (not isinstance(node.op, T.Elemwise) or
not isinstance(node.op.scalar_op, scalar.Cast)):
return
x = node.inputs[0]
if (not x.owner or
not isinstance(x.owner.op, T.Elemwise) or
not isinstance(x.owner.op.scalar_op, scalar.Cast)):
return
if node.op.scalar_op.o_type == x.owner.op.scalar_op.o_type:
return [x]
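A toy, Theano-free illustration of the rewrite (nested tuples stand in for the graph; the names here are hypothetical):

```python
def simplify_cast(expr):
    """Collapse cast(cast(x, d1), d2) to cast(x, d1) when d1 == d2."""
    # expr is either a leaf (e.g. "x") or a tuple ("cast", dtype, inner).
    if isinstance(expr, tuple) and expr[0] == "cast":
        inner = expr[2]
        if (isinstance(inner, tuple) and inner[0] == "cast"
                and inner[1] == expr[1]):
            return inner  # the outer cast is redundant
    return expr

e = ("cast", "float64", ("cast", "float64", "x"))
print(simplify_cast(e))   # ('cast', 'float64', 'x')

e2 = ("cast", "float32", ("cast", "float64", "x"))
print(simplify_cast(e2))  # unchanged: the dtypes differ
```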
class Assert(T.Op):
"""
Implements assertion in a computational graph.
......@@ -1551,9 +1582,32 @@ def local_remove_useless_assert(node):
return [assert_(node.inputs[0], *cond)]
@gof.local_optimizer([Assert])
def local_remove_all_assert(node):
"""An optimization disabled by default that removes all asserts from
the graph.
:note: See the :ref:`unsafe_optimization` section to know how to enable it.
"""
if not isinstance(node.op, Assert):
return
return [node.inputs[0]]
# Disabled by default
compile.optdb['canonicalize'].register('local_remove_all_assert',
local_remove_all_assert,
use_db_name_as_tag=False)
compile.optdb['stabilize'].register('local_remove_all_assert',
local_remove_all_assert,
use_db_name_as_tag=False)
compile.optdb['specialize'].register('local_remove_all_assert',
local_remove_all_assert,
use_db_name_as_tag=False)
@register_specialize
@gof.local_optimizer([T.Elemwise])
def local_alloc_elemwise(node):
def local_elemwise_alloc(node):
"""
elemwise(alloc(x, shp), ..., y.TensorType(BROADCAST CONDITION))
-> elemwise(x, y.TensorType(BROADCAST CONDITION))
......@@ -1692,12 +1746,14 @@ theano.configparser.AddConfigVar('experimental.local_alloc_elemwise',
is_valid=lambda x: x
),
in_c_key=False)
#This version if faster but not as safe.
theano.configparser.AddConfigVar('experimental.local_alloc_elemwise_assert',
"If False enable the experimental optimization local_alloc_elemwise"
" but WITHOUT assert into the graph!",
theano.configparser.BoolParam(True),
in_c_key=False)
# Setting this to False makes the graph faster but less safe.
theano.configparser.AddConfigVar(
'experimental.local_alloc_elemwise_assert',
"When the local_alloc_elemwise is applied, add"
" an assert to highlight shape errors.",
theano.configparser.BoolParam(True),
in_c_key=False)
############################
# Constant Canonicalization
......@@ -2452,6 +2508,48 @@ def local_setsubtensor_of_constants(node):
return False
@register_canonicalize
@register_stabilize
@gof.local_optimizer([AdvancedSubtensor1])
def local_adv_sub1_adv_inc_sub1(node):
"""Optimize the possible AdvSub1(AdvIncSub1(...), ...)
AdvancedSubtensor1(AdvancedIncSubtensor1(0s, y, idx), idx) -> y
AdvancedSubtensor1(AdvancedSetSubtensor1(x, y, idx), idx) -> y
:note: This opt adds an AssertOp. Otherwise, it would remove shape and
index errors. If you want to get rid of them, see the
:ref:`unsafe_optimization` section.
"""
if not isinstance(node.op, AdvancedSubtensor1):
return
inp = node.inputs[0]
if (not inp.owner or
not isinstance(inp.owner.op, AdvancedIncSubtensor1)):
return
idx = node.inputs[1]
idx2 = inp.owner.inputs[2]
x = inp.owner.inputs[0]
y = inp.owner.inputs[1]
if idx is not idx2:
return
if (not inp.owner.op.set_instead_of_inc and
T.extract_constant(x) != 0):
return
cond = [T.all(T.and_(T.lt(idx, x.shape[0]),
T.ge(idx, -x.shape[0])))]
if not node.fgraph.shape_feature.same_shape(idx, y, 0, 0):
cond.append(T.eq(idx.shape[0], y.shape[0]))
y = Assert("Bad indexing or shapes in a AdvancedIncSubtensor1 that was optimized away")(y, *cond)
if y.dtype == node.outputs[0].dtype:
return [y]
# It is possible that y is upcast or downcast to x.dtype.
# In all cases, as we set or add with 0, we can just cast y.
return [T.cast(y, node.outputs[0].dtype)]
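The identity this rewrite exploits, illustrated with plain NumPy (assuming the indices are in bounds and, for the increment case, unique):

```python
import numpy

x = numpy.zeros((4, 5))
y = numpy.random.rand(2, 5)
idx = numpy.array([1, 3])

# AdvancedSubtensor1(AdvancedSetSubtensor1(x, y, idx), idx) -> y:
# reading back the rows just set at the same indices returns y.
z = x.copy()
z[idx] = y
assert numpy.allclose(z[idx], y)

# AdvancedSubtensor1(AdvancedIncSubtensor1(0s, y, idx), idx) -> y:
# incrementing a zero tensor then reading back also returns y
# (numpy.add.at performs the unbuffered increment).
z = numpy.zeros((4, 5))
numpy.add.at(z, idx, y)
assert numpy.allclose(z[idx], y)
```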
####################
# Rebroadcast opts #
####################
......
......@@ -2417,6 +2417,84 @@ class test_local_subtensor_merge(unittest.TestCase):
f(x_val, *i_val)
class test_local_adv_sub1_adv_inc_sub1(unittest.TestCase):
def setUp(self):
utt.seed_rng()
mode = theano.compile.mode.get_default_mode()
self.mode = mode.including("local_adv_sub1_adv_inc_sub1").excluding("fusion")
self.mode_no_assert = self.mode.including("local_remove_all_assert")
def test0(self):
for dtype1, dtype2 in [("float32", "float32"),
("float32", "float64"),
("float64", "float32"),
("float64", "float64")]:
x = tensor.matrix(dtype=dtype1)
y = tensor.matrix(dtype=dtype2)
idx = tensor.ivector()
dx = numpy.random.rand(4, 5).astype(dtype1)
dy = numpy.random.rand(2, 5).astype(dtype2)
didx = numpy.asarray([1, 3], "int32")
# set_subtensor
inc = tensor.set_subtensor(x[idx], y)
o = inc[idx]
f = theano.function([x, y, idx], o, self.mode_no_assert)
res = f(dx, dy, didx)
assert numpy.allclose(dy, res)
topo = f.maker.fgraph.toposort()
# The opt is enabled in self.mode_no_assert, so the graph
# reduces to a single node.
assert len(topo) == 1
assert isinstance(topo[0].op, (compile.DeepCopyOp, T.Elemwise))
# inc_subtensor(data[idx], y)
inc = tensor.inc_subtensor(x[idx], y)
o = inc[idx]
f = theano.function([x, y, idx], o, self.mode_no_assert)
res = f(dx, dy, didx)
assert numpy.allclose((dx[didx] + dy), res)
topo = f.maker.fgraph.toposort()
assert len(topo) == 2
# inc_subtensor(0[idx], y)
inc = tensor.inc_subtensor(x.zeros_like()[idx], y)
o = inc[idx]
f = theano.function([x, y, idx], o, self.mode_no_assert)
res = f(dx, dy, didx)
assert numpy.allclose(dy, res)
topo = f.maker.fgraph.toposort()
# As above, the opt is enabled, so a single node remains.
assert len(topo) == 1
assert isinstance(topo[0].op, (compile.DeepCopyOp, T.Elemwise))
def test_assert(self):
x = tensor.matrix("x")
y = tensor.matrix("y")
idx = tensor.ivector()
dx = numpy.random.rand(4, 5).astype(config.floatX)
dy = numpy.random.rand(2, 5).astype(config.floatX)
didx = numpy.asarray([1, 3], "int32")
# set_subtensor
inc = tensor.set_subtensor(x[idx], y)
o = inc[idx]
f = theano.function([x, y, idx], o, self.mode)
# test wrong index
for i in [dx.shape[0], -dx.shape[0] - 1]:
self.assertRaises(AssertionError, f, dx, dy, [i, i])
# test wrong shape
self.assertRaises(AssertionError, f, dx, dy, [1])
class Test_alloc_zero(unittest.TestCase):
def setUp(self):
mode = theano.compile.mode.get_default_mode()
......@@ -2653,7 +2731,7 @@ def test_local_subtensor_of_dot():
assert test_equality(f(d1, d2, 1), numpy.dot(d1, d2)[1:4,:,1:,1])
class Test_local_alloc_elemwise(unittest.TestCase):
class Test_local_elemwise_alloc(unittest.TestCase):
dtype = config.floatX
def setUp(self):
......@@ -3166,8 +3244,8 @@ class test_assert(utt.InferShapeTester):
f(1, 1)
self.assertRaises(AssertionError, f, 1, 0)
def test1(self):
#remove assert that are always true
def test_local_remove_useless_assert1(self):
# remove asserts that are always true
mode = theano.config.mode
if mode == 'FAST_COMPILE':
mode = 'FAST_RUN'
......@@ -3181,8 +3259,8 @@ class test_assert(utt.InferShapeTester):
assert len(topo) == 1
assert topo[0].op == deep_copy_op
def test2(self):
#remove assert condition that are always true
def test_local_remove_useless_assert2(self):
# remove assert conditions that are always true
mode = theano.config.mode
if mode == 'FAST_COMPILE':
mode = 'FAST_RUN'
......@@ -3199,8 +3277,8 @@ class test_assert(utt.InferShapeTester):
assert len(topo[0].inputs) == 2
assert topo[1].op == deep_copy_op
def test3(self):
#don't remove assert condition that are always false
def test_local_remove_useless_assert3(self):
# don't remove assert conditions that are always false
mode = theano.config.mode
if mode == 'FAST_COMPILE':
mode = 'FAST_RUN'
......@@ -3216,6 +3294,22 @@ class test_assert(utt.InferShapeTester):
assert len(topo[0].inputs) == 3
assert topo[1].op == deep_copy_op
def test_local_remove_all_assert1(self):
# remove assert conditions that are unknown
mode = theano.config.mode
if mode == 'FAST_COMPILE':
mode = 'FAST_RUN'
mode = compile.mode.get_mode(mode).including('local_remove_all_assert')
x = T.scalar()
y = T.scalar()
f = theano.function([x, y], theano.tensor.opt.assert_op(x, y),
mode=mode)
f(1, 0) # Without opt, it should fail.
topo = f.maker.fgraph.toposort()
assert len(topo) == 1, topo
assert topo[0].op == deep_copy_op, topo
def test_infer_shape(self):
adscal = dscalar()
......@@ -3541,6 +3635,31 @@ class T_useless_elemwise(unittest.TestCase):
assert topo[0].op == deep_copy_op
class T_cast_cast(unittest.TestCase):
def setUp(self):
mode = theano.compile.get_default_mode()
self.mode = mode.including('local_cast_cast')
def test(self):
x = T.fmatrix()
o = T.Elemwise(scal.Cast(scal.Scalar("float64")))(x.astype("float64"))
f = theano.function([x], o, mode=self.mode)
dx = numpy.random.rand(5, 4).astype("float32")
f(dx)
topo = f.maker.fgraph.toposort()
assert len(topo) == 1
assert isinstance(topo[0].op, T.Elemwise)
x = T.dmatrix()
o = T.Elemwise(scal.Cast(scal.Scalar("float32")))(x.astype("float32"))
f = theano.function([x], o, mode=self.mode)
dx = numpy.random.rand(5, 4)
f(dx)
topo = f.maker.fgraph.toposort()
assert len(topo) == 1
assert isinstance(topo[0].op, T.Elemwise)
def test_constant_folding():
""" Test that constant folding get registered at fast_compile
......