Merge branch 'master' of https://github.com/Theano/Theano into mastery

Conflicts: theano/tensor/tests/test_basic.py

Merge branch 'master' of https://github.com/Theano/Theano into mastery
14c219b2 · Dustin Webb · c8e7eea6 · 2036661e · 14c219b2 · 14c219b2
--- a/doc/library/config.txt
+++ b/doc/library/config.txt
@@ -188,7 +188,7 @@ import theano and print the config variable, as in:
    String value: either 'ignore', 'warn', 'raise' or 'pdb'
-    Default: 'float64'
+    Default: 'ignore'
    When creating a TensorVariable with dtype float64, what should be done?
    This is useful to help find upcast to float64 in user code.

--- a/theano/compile/tests/test_profiling.py
+++ b/theano/compile/tests/test_profiling.py
@@ -28,7 +28,7 @@ def test_profiling():
        p = theano.ProfileStats(False)
-        if theano.config.mode in ["DebugMode", "DEBUG_MODE"]:
+        if theano.config.mode in ["DebugMode", "DEBUG_MODE", "FAST_COMPILE"]:
            m = "FAST_RUN"
        else:
            m = None

--- a/theano/gof/cc.py
+++ b/theano/gof/cc.py
@@ -1466,7 +1466,7 @@ class _CThunk(object):
        # note that the failure code is distributed in two lists
        if failure_code < 2 * n:
            return [self.init_tasks, self.tasks][
-                failure_code % 2][failure_code / 2]
+                failure_code % 2][failure_code // 2]
        else:
            return self.tasks[failure_code - n]

--- a/theano/gof/op.py
+++ b/theano/gof/op.py
@@ -663,7 +663,10 @@ class Op(utils.object2, PureOp, CLinkerOp):
            if len(self.__props__) == 0:
                return "%s" % (self.__class__.__name__,)
            else:
-                return "%s{%s}" % (self.__class__.__name__, ", ".join("%s=%r" % (p, getattr(self, p)) for p in self.__props__))
+                return "%s{%s}" % (
+                    self.__class__.__name__,
+                    ", ".join("%s=%r" % (p, getattr(self, p))
+                              for p in self.__props__))
        else:
            return super(Op, self).__str__()

--- a/theano/gof/tests/test_op.py
+++ b/theano/gof/tests/test_op.py
@@ -132,17 +132,21 @@ class TestOp:
    def test_op_struct(self):
        sop = StructOp()
        c = sop(theano.tensor.constant(0))
-        f = theano.function([], c)
+        mode = None
+        if theano.config.mode == 'FAST_COMPILE':
+            mode = 'FAST_RUN'
+        f = theano.function([], c, mode=mode)
        rval = f()
        assert rval == 0
        rval = f()
        assert rval == 1
        c2 = sop(theano.tensor.constant(1))
-        f2 = theano.function([], [c, c2])
+        f2 = theano.function([], [c, c2], mode=mode)
        rval = f2()
        assert rval == [0, 0]
 class TestMakeThunk(unittest.TestCase):
    def test_no_c_code(self):
        class IncOnePython(Op):

--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -2888,7 +2888,9 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
            returns a C code expression to copy source into view, and
            return 0 on success
        """
-        return """CudaNdarray_CopyFromCudaNdarray(%(view)s, %(source)s)""" % locals()
+        # On the CPU, it unbroadcasts based on the run-time shapes. We
+        # need the same behavior on the GPU.
+        return """CudaNdarray_CopyFromCudaNdarray(%(view)s, %(source)s, 1)""" % locals()
    def add_to_zview(self, name, x, fail):
@@ -2910,7 +2912,7 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
    def c_code_cache_version(self):
        parent_version = super(GpuIncSubtensor, self).c_code_cache_version()
        if parent_version:
-            return parent_version + (0,)
+            return parent_version + (1,)
        return ()
@@ -3343,6 +3345,13 @@ class GpuContiguous(GpuOp):
        input = as_cuda_ndarray_variable(input)
        return Apply(self, [input], [input.type()])
+    def perform(self, node, inp, out):
+        i = inp[0]
+        if not i.is_c_contiguous():
+            i = i.copy()
+        assert i.is_c_contiguous()
+        out[0][0] = i
    def c_code(self, node, name, inp, out, sub):
        input, = inp
        z, = out

--- a/theano/sandbox/cuda/blas.py
+++ b/theano/sandbox/cuda/blas.py
@@ -852,8 +852,11 @@ class GpuCorrMM(BaseGpuCorrMM):
 class GpuCorrMM_gradWeights(BaseGpuCorrMM):
    """Gradient wrt. filters for `GpuCorrMM`.
-    :note: You will not want to use this directly, but rely on Theano's
+    :note: You will not want to use this directly, but rely on
-    automatic differentiation or graph optimization to use it as needed."""
+           Theano's automatic differentiation or graph optimization to
+           use it as needed.
+    """
    def __init__(self, border_mode="valid",
            subsample=(1, 1),
@@ -906,8 +909,11 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
 class GpuCorrMM_gradInputs(BaseGpuCorrMM):
    """Gradient wrt. inputs for `GpuCorrMM`.
-    :note: You will not want to use this directly, but rely on Theano's
+    :note: You will not want to use this directly, but rely on
-    automatic differentiation or graph optimization to use it as needed."""
+           Theano's automatic differentiation or graph optimization to
+           use it as needed.
+    """
    def __init__(self, border_mode="valid",
            subsample=(1, 1),

--- a/theano/sandbox/cuda/cuda_ndarray.cu
+++ b/theano/sandbox/cuda/cuda_ndarray.cu
@@ -1002,7 +1002,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
            return NULL;
        indices = (CudaNdarray*) CudaNdarray_New();
-        if (verbose) printf("ndarray after new\n");
+        if (verbose) printf("\nndarray after new\n");
        if (! indices){
            Py_DECREF(indices_float32);
            return NULL;
@@ -1140,6 +1140,13 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
    }
    dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(out)[0],65535),1,1);
+    if(CudaNdarray_HOST_DIMS(out)[0] == 0){
+        // We take 0 elements, so no need for the rest of the code.
+        // This speeds up that case AND fixes a crash otherwise.
+        free(dims);
+        Py_DECREF(indices);
+        return (PyObject *)out;
+    }
    switch (self->nd) {
        case 1:
@@ -1149,7 +1156,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
                    printf("cudaGetLastError=%d, nd=%d"
                           " kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
                           " n_threads.x=%i, n_threads.y=%i)\n",
-                           self->nd, cudaGetLastError(),
+                           cudaGetLastError(), self->nd,
                           n_blocks.x, n_blocks.y, n_threads.x, n_threads.y);
                k3<<<n_blocks, n_threads>>>(
                        dims[0],
@@ -1205,7 +1212,7 @@ CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
                    printf("cudaGetLastError=%d, nd=%d"
                           " kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
                           " n_threads.x=%i, n_threads.y=%i)\n",
-                           self->nd, cudaGetLastError(),
+                           cudaGetLastError(), self->nd,
                           n_blocks.x, n_blocks.y, n_threads.x, n_threads.y);
                k3<<<n_blocks, n_threads>>>(
                        dims[0], //dimensions

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -1680,6 +1680,16 @@ def local_gpualloc(node):
        return [new_out]
+@register_opt()
+@local_optimizer([theano.tensor.opt.Assert])
+def local_assert(node):
+    if (isinstance(node.op, theano.tensor.opt.Assert) and
+        node.inputs[0].owner and
+        isinstance(node.inputs[0].owner.op,
+                   HostFromGpu)):
+        return [host_from_gpu(node.op(node.inputs[0].owner.inputs[0]))]
 @register_opt()
 @local_optimizer([GpuAlloc])
 def local_gpualloc_memset_0(node):

--- a/theano/sandbox/cuda/tests/test_basic_ops.py
+++ b/theano/sandbox/cuda/tests/test_basic_ops.py
@@ -967,6 +967,8 @@ class T_subtensor(theano.tensor.tests.test_subtensor.T_subtensor):
        # version when we should. Users should not use it.
        for shape, idx, fast in [((70000,), range(70000), True),
                                 ((70000, 5), range(70000), True),
+                                 ((70000, 5),  numpy.zeros((0,), 'int64'),
+                                  True),
                                 ((70000, 2, 3), range(70000), True),
                                 ((1025, 1025), [5, 10], True),
                                 ((3, 1025, 1026), [1, 2], True),

--- a/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
+++ b/theano/sandbox/cuda/tests/test_conv_cuda_ndarray.py
@@ -641,7 +641,8 @@ def test_valid(conv_gemm=False):
        shapes += get_shapes2(scales_img=(2, 2), img_stride=(2, 2))
        shapes += get_shapes2(scales_kern=(2, 2), kern_stride=(2, 2))
    else:
-        mode = cls = None
+        mode = theano_mode
+        cls = None
    exec_conv(version, shapes, verbose, random, 'valid',
              print_=print_, ones=ones, rtol=1.1e-5,
              theano_mode=mode, cls=cls)
@@ -717,7 +718,8 @@ def test_full(conv_gemm=False):
        # dummy version; not used by GpuCorrMM so one version is enough
        version = [-1]
    else:
-        mode = cls = None
+        mode = theano_mode
+        cls = None
    exec_conv(version, shapes, verbose, random, 'full',
              theano_mode=mode, cls=cls)
@@ -757,7 +759,8 @@ def test_subsample(conv_gemm=False):
        # dummy version; not used by GpuCorrMM so one version is enough
        version_valid = version_full = [-1]
    else:
-        mode = cls = None
+        mode = theano_mode
+        cls = None
    exec_conv(version_valid, shapes, verbose, random, 'valid',
              print_=print_, ones=ones,

--- a/theano/sandbox/cuda/tests/test_opt.py
+++ b/theano/sandbox/cuda/tests/test_opt.py
@@ -41,6 +41,17 @@ def test_no_shared_var_graph():
    assert numpy.any(isinstance(x.op,cuda.GpuFromHost) for x in l)
    assert numpy.any(isinstance(x.op,cuda.HostFromGpu) for x in l)
+def test_local_assert():
+    x = theano.tensor.fmatrix()
+    a = theano.tensor.opt.assert_op(x, theano.tensor.eq(x, 0).any())
+    f = theano.function([x], a, mode=mode_with_gpu)
+    topo = f.maker.fgraph.toposort()
+    a_op = [n for n in topo if isinstance(n.op, theano.tensor.opt.Assert)]
+    assert len(a_op) == 1
+    assert isinstance(a_op[0].inputs[0].type, CudaNdarrayType)
 def test_int_pow():
    a = CudaNdarrayType([False])()

--- a/theano/sandbox/gpuarray/opt.py
+++ b/theano/sandbox/gpuarray/opt.py
@@ -496,6 +496,16 @@ def local_gpua_softmaxwithbias(node):
    return GpuSoftmaxWithBias()
+@register_opt()
+@local_optimizer([theano.tensor.opt.Assert])
+def local_assert(node):
+    if (isinstance(node.op, theano.tensor.opt.Assert) and
+        node.inputs[0].owner and
+        isinstance(node.inputs[0].owner.op,
+                   HostFromGpu)):
+        return [host_from_gpu(node.op(node.inputs[0].owner.inputs[0]))]
 @register_opt()
 @op_lifter([gpu_from_host, ConvOp])
 def local_gpu_conv(node):

--- a/theano/sandbox/gpuarray/tests/test_opt.py
+++ b/theano/sandbox/gpuarray/tests/test_opt.py
@@ -16,6 +16,16 @@ from theano.tests.unittest_tools import SkipTest
 from theano.tensor.tests.test_basic import TestSpecifyShape
+def test_local_assert():
+    x = theano.tensor.fmatrix()
+    a = theano.tensor.opt.assert_op(x, theano.tensor.eq(x, 0).any())
+    f = theano.function([x], a, mode=mode_with_gpu)
+    topo = f.maker.fgraph.toposort()
+    a_op = [n for n in topo if isinstance(n.op, theano.tensor.opt.Assert)]
+    assert len(a_op) == 1
+    assert isinstance(a_op[0].inputs[0].type, GpuArrayType)
 def test_flatten():
    m = theano.tensor.fmatrix()
    f = theano.function([m], m.flatten(), mode=mode_with_gpu)

--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -164,10 +164,15 @@ def as_tensor_variable(x, name=None, ndim=None):
            return x
        else:
            if (x.type.ndim > ndim):
-                # TODO: strip off leading broadcastable dimensions
+                # strip off leading broadcastable dimensions
+                first_non_broadcastable = [idx for idx in range(x.ndim)
+                                           if x.broadcastable[idx] == False][0]
+                x = x.dimshuffle(range(x.ndim)[first_non_broadcastable:])
+                if x.ndim > ndim:
                    raise ValueError(
-                    'TensorType could not be cast to have %i dimensions' %
+                        'TensorType could not be cast to have %i dimensions' % ndim, x.type
-                    ndim, x.type)
+                    )
+                return x
            elif (x.type.ndim < ndim):
                return shape_padleft(x, n_ones=(ndim - x.type.ndim))
            else:

--- a/theano/tensor/nnet/conv.py
+++ b/theano/tensor/nnet/conv.py
@@ -811,8 +811,8 @@ class ConvOp(OpenMPOp):
                shuffled_kerns.name = 'shuffled_for_conv3D(%s)' % flipped_kerns.name
            tmp_node = theano.tensor.nnet.conv3D(
-                V = shuffled_inputs,
+                V=shuffled_inputs,
-                W= shuffled_kerns,
+                W=shuffled_kerns,
                b=theano.tensor.alloc(numpy.asarray(0, dtype=kerns.dtype),
                                      kerns.shape[0]),
                d=(self.dx, self.dy, 1))

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -799,7 +799,21 @@ class ShapeFeature(object):
            #
            # worst case, we loop over shape_of and replace things
            raise NotImplementedError(s_i)
-        elif s_i.type.dtype[:3] in ('int', 'uint'):
+        # s_i is x.shape[i], we change it to Shape_i.
+        if (s_i.owner and
+            isinstance(s_i.owner.op, Subtensor) and
+            s_i.owner.inputs[0].owner and
+            isinstance(s_i.owner.inputs[0].owner.op, T.Shape)):
+            assert s_i.ndim == 0
+            assert len(s_i.owner.inputs) == 2
+            try:
+                i = get_scalar_constant_value(s_i.owner.inputs[1])
+                s_i = Shape_i(i)(s_i.owner.inputs[0].owner.inputs[0])
+            except NotScalarConstantError:
+                pass
+        if s_i.type.dtype[:3] in ('int', 'uint'):
            if getattr(s_i.type, 'ndim', 0):
                raise TypeError('Shape element must be scalar', s_i)
            return s_i
@@ -1131,6 +1145,40 @@ class ShapeFeature(object):
                    self.set_shape_i(v, ii, new_r)
        self.shape_of_reverse_index[r] = set()
+    def same_shape(self, x, y):
+        """Return True if we are able to assert that x and y have the
+        same shape
+        """
+        sx = self.shape_of[x]
+        sy = self.shape_of[y]
+        if sx is None or sy is None:
+            return False
+        assert len(sx) == len(sy)
+        for dx, dy in zip(sx, sy):
+            if dx is dy:
+                continue
+            # Need to try to find that they are the same shape. We
+            # need to compare the full graph. It could be slow. So I
+            # just implement for now the case of Shape_i.
+            if not dx.owner or not dy.owner:
+                return False
+            if (not isinstance(dx.owner.op, Shape_i) or
+                not isinstance(dy.owner.op, Shape_i)):
+                return False
+            opx = dx.owner.op
+            opy = dy.owner.op
+            if not (opx.i == opy.i):
+                return False
+            # FB: I'm not sure if this handles constants correctly.
+            if dx.owner.inputs[0] == dy.owner.inputs[0]:
+                return True
+            # To be sure to cover all cases, call equal_computations.
+            # Can't use theano.gof.graph.is_same_graph(dx, dy)
+            # as it currently expects that dx and dy aren't in a FunctionGraph
+            from theano.scan_module.scan_utils import equal_computations
+            return equal_computations([dx], [dy])
 class ShapeOptimizer(Optimizer):
    """Optimizer that serves to add ShapeFeature as an fgraph feature.
@@ -1640,6 +1688,54 @@ def local_upcast_elemwise_constant_inputs(node):
 ##################
+@register_canonicalize
+@register_specialize
+@gof.local_optimizer([IncSubtensor])
+def local_useless_inc_subtensor(node):
+    """Remove IncSubtensor, when we overwrite the full inputs with the
+    new value.
+    """
+    if not isinstance(node.op, IncSubtensor):
+        return
+    if node.op.set_instead_of_inc is False:
+        # This is an IncSubtensor, so the init value must be zeros
+        try:
+            c = get_scalar_constant_value(node.inputs[0])
+            if c != 0:
+                return
+        except NotScalarConstantError:
+            return
+    if (node.inputs[0].ndim != node.inputs[1].ndim or
+        node.inputs[0].broadcastable != node.inputs[1].broadcastable):
+        # FB: I didn't check if this case can happen, but this opt
+        # doesn't support it.
+        return
+    # We have a SetSubtensor or an IncSubtensor on zeros
+    # Is this IncSubtensor useful?
+    # Check that we keep all the original data.
+    # Put the constant inputs in the slice.
+    idx_cst = theano.tensor.subtensor.get_idx_list(node.inputs[1:],
+                                                   node.op.idx_list)
+    if all(isinstance(e, slice) and e.start is None and
+           e.stop is None and (e.step is None or T.extract_constant(e.step) == -1)
+           for e in idx_cst):
+        # IncSubtensor broadcast node.inputs[1] on node.inputs[0]
+        # based on run time shapes, so we must check they are the same.
+        if not hasattr(node.fgraph, 'shape_feature'):
+            return
+        if not node.fgraph.shape_feature.same_shape(node.inputs[0],
+                                                    node.inputs[1]):
+            return
+        # There is no reverse, so we don't need a replacement.
+        if all(e.step is None
+               for e in node.op.idx_list):
+            # They are the same shape, so we can remove this IncSubtensor
+            return [node.inputs[1]]
+        return [Subtensor(node.op.idx_list)(*node.inputs[1:])]
 @register_canonicalize
 @register_specialize
 @gof.local_optimizer([Subtensor])
@@ -3366,11 +3462,17 @@ ALL_REDUCE = [T.elemwise.CAReduce, T.elemwise.All, T.elemwise.Any,
              T.elemwise.Sum, T.elemwise.Prod,
              T.elemwise.ProdWithoutZeros]
 @register_canonicalize
 @register_uncanonicalize  # Needed for MaxAndArgmax -> CAReduce
 @gof.local_optimizer(ALL_REDUCE)
 def local_reduce_join(node):
-    """Max(Join(a,b), axis=0) -> Maximum(a,b)  """
+    """Reduce{scalar.op}(Join(a, b), axis=0) -> Elemwise{scalar.op}(a, b)
+    :note: supported scalar.op are Maximum, Minimum in some cases and
+    Add and Mul in all cases.
+    """
    if (isinstance(node.op, T.CAReduce) and
        node.inputs[0].owner and
        isinstance(node.inputs[0].owner.op, T.Join)):
@@ -3385,6 +3487,9 @@ def local_reduce_join(node):
                return
        elif not isinstance(node.op.scalar_op, (scalar.Add, scalar.Mul)):
            return
+        elif len(join.inputs) <= 2:
+            # This is a useless join, that will get removed by another opt.
+            return
        new_inp = []
        for inp in join.inputs[1:]:

--- a/theano/tensor/slinalg.py
+++ b/theano/tensor/slinalg.py
@@ -155,7 +155,7 @@ class Solve(Op):
        self.overwrite_b = overwrite_b
    def __repr__(self):
-        return 'Solve{%s}' % str(self.props())
+        return 'Solve{%s}' % str(self._props())
    def make_node(self, A, b):
        assert imported_scipy, (

--- a/theano/tensor/tests/test_basic.py
+++ b/theano/tensor/tests/test_basic.py
@@ -1930,7 +1930,8 @@ class ApplyDefaultTestOp(theano.Op):
 class TestAsTensorVariable(unittest.TestCase):
    """
-    Unit test for ensuring that as_tensor_variable handles Apply objects correctly.
+    Unit test for ensuring that as_tensor_variable handles Apply objects
+    correctly and removes leading broadcastable dimensions when possible.
    """
    def setUp(self):
        self.x = tensor.scalar('x')
@@ -1951,6 +1952,18 @@ class TestAsTensorVariable(unittest.TestCase):
        bad_apply_var = ApplyDefaultTestOp([0, 1]).make_node(self.x)
        self.assertRaises(AttributeError, as_tensor_variable, bad_apply_var)
+    def test_strip_leading_broadcastable(self):
+        x = tensor.TensorType(config.floatX, (True, False))('x')
+        x = as_tensor_variable(x, ndim=1)
+        assert(x.ndim == 1)
+        x = tensor.matrix('x', dtype=config.floatX)
+        try:
+            x = as_tensor_variable(x, ndim=1)
+            assert(False)  # The call above should have failed
+        except ValueError:
+            pass
 class TestAlloc(unittest.TestCase):
    dtype = config.floatX

--- a/theano/tensor/tests/test_opt.py
+++ b/theano/tensor/tests/test_opt.py
@@ -1571,6 +1571,53 @@ def test_log_add():
    #TODO: (write and) test that the optimization works with Sum in addition to working with Add.
+def test_local_useless_inc_subtensor():
+    x = tensor.matrix('x')
+    y = tensor.matrix('y')
+    for sub in [slice(None), slice(None, None, -1)]:
+        o = tensor.set_subtensor(x[::, sub], y)
+        f = theano.function([x, y], o)
+        o_shape = tensor.set_subtensor(x[::, sub],
+                                       tensor.specify_shape(y, x.shape))
+        f_shape = theano.function([x, y], o_shape)
+        # Test with shape info
+        topo = f_shape.maker.fgraph.toposort()
+        assert not any(isinstance(n.op, tensor.IncSubtensor) for n in topo)
+        out = f_shape([[2, 3]], [[3, 4]])
+        assert (out == numpy.asarray([[3, 4]])[::, sub]).all()
+        # Test that without shape info, we don't apply the opt.
+        topo = f.maker.fgraph.toposort()
+        assert len(topo) == 1
+        assert isinstance(topo[0].op, tensor.IncSubtensor)
+        out = f([[2, 3]], [[3, 4]])
+        assert (out == numpy.asarray([[3, 4]])[::, sub]).all()
+        # Test that we don't remove shape error
+        try:
+            f([[2, 3]], [[3, 4], [4, 5]])
+            assert False
+        except (ValueError, AssertionError):
+            pass
+        # Test that we don't remove broadcastability
+        out = f([[2, 3], [3, 4]], [[5, 6]])
+        assert (out == numpy.asarray([[5, 6], [5, 6]])[::, sub]).all()
+    # Test that we do not optimize other strides even when sub and y
+    # have same shapes
+    sub = x[::, ::2]
+    o_shape = tensor.set_subtensor(sub,
+                                   tensor.specify_shape(y, sub.shape))
+    f_shape = theano.function([x, y], o_shape)
+    topo = f_shape.maker.fgraph.toposort()
+    theano.printing.debugprint(f_shape)
+    assert any(isinstance(n.op, tensor.IncSubtensor) for n in topo)
+    out = f_shape([[2, 3, 6, 7]], [[8, 9]])
+    assert (out == numpy.asarray([[8, 3, 9, 7]])).all()
 def test_local_useless_subtensor():
    x = tensor.matrix('x')
@@ -2887,10 +2934,13 @@ class T_Tile(unittest.TestCase):
    def test_local_useless_tile(self):
        v = T.vector()
        m = T.matrix()
+        mode = None
+        if theano.config.mode == "FAST_COMPILE":
+            mode = "FAST_RUN"
        for var, data in [(v, [1, 2, 3]), (m, [[1, 2], [3, 4]])]:
            # Currently, only a repeat patter == ndim is supported.
            for ndim in [var.ndim]:  # range(1, var.ndim):
-                f = theano.function([var], T.tile(var, (1,)*ndim))
+                f = theano.function([var], T.tile(var, (1,)*ndim), mode=mode)
                topo = f.maker.fgraph.toposort()
                assert len(topo) == 1
                assert isinstance(topo[0].op, compile.DeepCopyOp)

--- a/theano/tensor/tests/test_subtensor.py
+++ b/theano/tensor/tests/test_subtensor.py
@@ -863,7 +863,25 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
            inc_slice(2, 1),
            (numpy.asarray([[0, 1], [2, 3], [4, 5.]]), numpy.asarray(9.),))
-    def test_advanced_inc_and_set(self):
+    def test_inc_and_set_subtensor(self):
+        """
+        Test increment and set with broadcast
+        """
+        X = tensor.matrix(dtype=self.dtype)
+        y = set_subtensor(X[1::, 1::],  0)
+        f = self.function([X], [y],
+                          op=self.inc_sub,
+                          N=1)
+        x_ = numpy.ones((9, 9))
+        out = f(x_.astype('float32'))
+        res = x_.copy()
+        res[1::, 1::] = 0
+        assert numpy.allclose(out, res)
+    def test_advanced1_inc_and_set(self):
        """
        Test advanced increment and set.
        """