提交 8f0b0888 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #4039 from Sentient07/gpu_optimization

Making DimShuffles view by default
......@@ -99,7 +99,7 @@ multiplication is done between the inputs:
>>> y.owner.inputs[0]
x
>>> y.owner.inputs[1]
DimShuffle{x,x}.0
InplaceDimShuffle{x,x}.0
Note that the second input is not 2 as we would have expected. This is
because 2 was first :term:`broadcasted <broadcasting>` to a matrix of
......
......@@ -118,7 +118,7 @@ Elemwise{mul} [id A] ''
|x [id E]
|Elemwise{sub} [id I] ''
|TensorConstant{2} [id F]
|DimShuffle{} [id J] ''
|InplaceDimShuffle{} [id J] ''
|TensorConstant{1} [id K]
>>> theano.printing.debugprint(gy, depth=2) # doctest: +NORMALIZE_WHITESPACE
......
......@@ -615,13 +615,13 @@ dimensions, see :meth:`_tensor_py_operators.dimshuffle`.
>>> tensor = theano.tensor.tensor3()
>>> theano.tensor.shape_padaxis(tensor, axis=0)
DimShuffle{x,0,1,2}.0
InplaceDimShuffle{x,0,1,2}.0
>>> theano.tensor.shape_padaxis(tensor, axis=1)
DimShuffle{0,x,1,2}.0
InplaceDimShuffle{0,x,1,2}.0
>>> theano.tensor.shape_padaxis(tensor, axis=3)
DimShuffle{0,1,2,x}.0
InplaceDimShuffle{0,1,2,x}.0
>>> theano.tensor.shape_padaxis(tensor, axis=-1)
DimShuffle{0,1,2,x}.0
InplaceDimShuffle{0,1,2,x}.0
.. autofunction:: unbroadcast(x, *axes)
......
......@@ -69,10 +69,10 @@ The pre-compilation graph:
>>> theano.printing.debugprint(prediction) # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
Elemwise{gt,no_inplace} [id A] ''
|Elemwise{true_div,no_inplace} [id B] ''
| |DimShuffle{x} [id C] ''
| |InplaceDimShuffle{x} [id C] ''
| | |TensorConstant{1} [id D]
| |Elemwise{add,no_inplace} [id E] ''
| |DimShuffle{x} [id F] ''
| |InplaceDimShuffle{x} [id F] ''
| | |TensorConstant{1} [id D]
| |Elemwise{exp,no_inplace} [id G] ''
| |Elemwise{sub,no_inplace} [id H] ''
......@@ -80,9 +80,9 @@ Elemwise{gt,no_inplace} [id A] ''
| | |dot [id J] ''
| | |x [id K]
| | |w [id L]
| |DimShuffle{x} [id M] ''
| |InplaceDimShuffle{x} [id M] ''
| |b [id N]
|DimShuffle{x} [id O] ''
|InplaceDimShuffle{x} [id O] ''
|TensorConstant{0.5} [id P]
The post-compilation graph:
......
......@@ -121,14 +121,7 @@ class GpuFromHost(GpuOp):
check_input = False
def __eq__(self, other):
# GpuFromHost carries no configurable state, so class identity alone
# decides equality. (This MR deletes these three methods in favor of
# __props__ = (), which auto-generates equivalent __eq__/__hash__/__str__.)
return type(self) == type(other)
def __hash__(self):
# Must stay consistent with __eq__ above: hash on the class only.
return hash(type(self))
def __str__(self):
return 'GpuFromHost'
__props__ = ()
def make_node(self, x):
if not isinstance(x.type, tensor.TensorType):
......@@ -220,12 +213,6 @@ class GpuElemwise(GpuOp):
self.sync = d.get('sync', True)
self._rehash()
def __eq__(self, other):
# Two GpuElemwise ops are interchangeable only when they wrap the same
# scalar op, share the same inplace pattern, and agree on the sync flag.
# NOTE(review): this MR moves this method below _rehash (see the later
# hunk); the comparison fields match what _rehash folds into _hashval.
return (type(self) == type(other) and
self.scalar_op == other.scalar_op and
self.inplace_pattern == other.inplace_pattern and
self.sync == other.sync)
def _rehash(self):
items = list(self.inplace_pattern.items())
items.sort()
......@@ -242,6 +229,12 @@ class GpuElemwise(GpuOp):
assert h == getattr(self, '_hashval', h)
self._hashval = h
def __eq__(self, other):
# Equality compares exactly the fields that _rehash() folds into
# _hashval, keeping __eq__ and __hash__ consistent.
return (type(self) == type(other) and
self.scalar_op == other.scalar_op and
self.inplace_pattern == other.inplace_pattern and
self.sync == other.sync)
def __hash__(self):
# _hashval is precomputed by _rehash() (called from __init__ and
# __setstate__), so hashing is a cheap attribute read.
return self._hashval
......@@ -320,6 +313,8 @@ class GpuDimShuffle(GpuOp):
check_broadcast = False
__props__ = ("input_broadcastable", "new_order")
def __init__(self, input_broadcastable, new_order):
input_broadcastable = tuple(input_broadcastable)
self.input_broadcastable = input_broadcastable
......@@ -342,17 +337,6 @@ class GpuDimShuffle(GpuOp):
self.view_map = {0: [0]}
self._rehash()
def __getstate__(self):
# Exclude the cached hash from the pickle payload; it is derived state
# and is rebuilt on unpickling by __setstate__.
d = dict(self.__dict__)
del d['_hashval']
return d
def __setstate__(self, d):
self.__dict__.update(d)
# Recompute the cached hash that __getstate__ stripped out.
self._rehash()
def make_node(self, input):
ib = tuple(input.type.broadcastable)
if not ib == self.input_broadcastable:
......@@ -379,21 +363,6 @@ class GpuDimShuffle(GpuOp):
ob.append(ib[value])
return Apply(self, [input], [CudaNdarrayType(broadcastable=ob)()])
def __eq__(self, other):
# it's probably not necessary to compare input_broadcastable
return type(self) == type(other) \
and self.new_order == other.new_order \
and self.input_broadcastable == other.input_broadcastable
def _rehash(self):
# Cache the hash of the fields __eq__ compares (plus class identity
# via name/module) so __hash__ is a constant-time attribute read.
self._hashval = (hash(type(self).__name__) ^
hash(type(self).__module__) ^
hash(self.new_order) ^
hash(self.input_broadcastable))
def __hash__(self):
return self._hashval
def __str__(self):
# e.g. "GpuDimShuffle{0,x,1}" for new_order == (0, 'x', 1).
return "GpuDimShuffle{%s}" % ",".join(str(x) for x in self.new_order)
......@@ -568,6 +537,8 @@ class GpuCAReduce(GpuOp):
"""
__props__ = ("reduce_mask", "scalar_op", "pre_scalar_op", )
def __init__(self, reduce_mask, scalar_op, pre_scalar_op=None):
self.reduce_mask = tuple(reduce_mask)
self.scalar_op = scalar_op
......@@ -578,17 +549,11 @@ class GpuCAReduce(GpuOp):
if pre_scalar_op:
assert pre_scalar_op.nin == 1
def __eq__(self, other):
# Equality over the three fields that define the reduction: the mask of
# reduced axes, the reduction scalar op, and the optional pre-scalar op.
return (type(self) == type(other) and
self.reduce_mask == other.reduce_mask and
self.scalar_op == other.scalar_op and
self.pre_scalar_op == other.pre_scalar_op)
def __hash__(self):
# NOTE(review): hashes type(scalar_op)/type(pre_scalar_op) while __eq__
# compares the instances — weaker but still consistent (equal ops share
# the same types, hence the same hash).
return (hash(type(self)) ^
hash(self.reduce_mask) ^
hash(type(self.scalar_op)) ^
hash(type(self.pre_scalar_op)))
def __setstate__(self, d):
self.__dict__.update(d)
# For unpickling of old ops.
if not hasattr(self, "pre_scalar_op"):
self.pre_scalar_op = None
def __str__(self):
pre = ""
......@@ -598,12 +563,6 @@ class GpuCAReduce(GpuOp):
pre, str(self.scalar_op),
','.join(str(i) for i in self.reduce_mask))
def __setstate__(self, d):
self.__dict__.update(d)
# For unpickling of old ops.
if not hasattr(self, "pre_scalar_op"):
self.pre_scalar_op = None
def make_node(self, x):
x = as_cuda_ndarray_variable(x)
if (x.type.ndim != len(self.reduce_mask)):
......@@ -3655,7 +3614,6 @@ class GpuAllocEmpty(GpuOp):
Implement Alloc on the gpu, but without initializing memory.
"""
__props__ = ()
def make_node(self, *shape):
......@@ -3852,10 +3810,10 @@ class CopyOnNegativeStrides(GpuOp):
If it does, returns a c contiguous copy.
"""
__props__ = ()
view_map = {0: [0]}
check_input = False
__props__ = ()
def grad(self, inputs, dout):
......
......@@ -2207,18 +2207,12 @@ class GpuDownsampleFactorMax(GpuOp):
Implement downsample with max on the gpu.
"""
__props__ = ('ds', 'ignore_border')
def __init__(self, ds, ignore_border=False):
self.ds = tuple(ds)
self.ignore_border = ignore_border
def __eq__(self, other):
# Pooling ops match when both the downsample factors and the
# border-handling flag agree. (This MR replaces these two methods with
# __props__ = ('ds', 'ignore_border'), which generates the same logic.)
return (type(self) == type(other) and
self.ds == other.ds and
self.ignore_border == other.ignore_border)
def __hash__(self):
# XOR of the same fields __eq__ compares, keeping hash/eq consistent.
return hash(type(self)) ^ hash(self.ds) ^ hash(self.ignore_border)
def __str__(self):
return '%s{%s,%s}' % (self.__class__.__name__,
self.ds,
......
......@@ -379,7 +379,7 @@ def local_gpu_split(node):
if (input.owner and isinstance(input.owner.op, HostFromGpu) or
any(c != 'output' and isinstance(c.op, GpuFromHost) for c, idx
in outs_clients)):
new_op = GpuSplit(node.op.len_splits)
new_op = GpuSplit(**node.op._props_dict())
split_res = new_op(as_cuda_ndarray_variable(input),
*node.inputs[1:], return_list=True)
return [host_from_gpu(o) for o in split_res]
......@@ -398,16 +398,18 @@ def local_gpu_dimshuffle_0(node):
input, = node.inputs
if input.owner and isinstance(input.owner.op, HostFromGpu):
# move the add to a GpuAdd
new_op = GpuDimShuffle(node.op.input_broadcastable,
node.op.new_order)
p_dict = node.op._props_dict()
p_dict.pop('inplace', None)
new_op = GpuDimShuffle(**p_dict)
return [host_from_gpu(new_op(as_cuda_ndarray_variable(input)))]
if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0]
if host_input.owner and isinstance(host_input.owner.op,
tensor.DimShuffle):
dimshuffle_node = host_input.owner
new_op = GpuDimShuffle(dimshuffle_node.op.input_broadcastable,
dimshuffle_node.op.new_order)
p_dict = dimshuffle_node.op._props_dict()
p_dict.pop('inplace', None)
new_op = GpuDimShuffle(**p_dict)
return [new_op(
as_cuda_ndarray_variable(dimshuffle_node.inputs[0]))]
return False
......@@ -995,10 +997,8 @@ def local_gpu_reshape(node):
host_input = node.inputs[0]
if host_input.owner and \
isinstance(host_input.owner.op, tensor.Reshape):
rshp = host_input.owner.op
x, shp = host_input.owner.inputs
gpu_reshape = GpuReshape(rshp.ndim)(as_cuda_ndarray_variable(x),
shp)
gpu_reshape = GpuReshape(**host_input.owner.op._props_dict())(as_cuda_ndarray_variable(x), shp)
if gpu_reshape.broadcastable != node.outputs[0].broadcastable:
# this can happen as we always return False for all broadcast
# dim in GpuReshape but not for Reshape
......@@ -1011,7 +1011,7 @@ def local_gpu_reshape(node):
x, shp = node.inputs
if x.owner and isinstance(x.owner.op, HostFromGpu):
gpu_x, = x.owner.inputs
gpu_reshape = GpuReshape(node.op.ndim)(gpu_x, shp)
gpu_reshape = GpuReshape(**node.op._props_dict())(gpu_x, shp)
if gpu_reshape.broadcastable != node.outputs[0].broadcastable:
# this can happen as we always return False for all broadcast
# dim in GpuReshape but not for Reshape
......@@ -1082,7 +1082,7 @@ def local_gpu_subtensor(node):
gpu_x, = x.owner.inputs
coords = node.inputs[1:]
return [host_from_gpu(GpuSubtensor(
node.op.idx_list)(gpu_x, *coords))]
**node.op._props_dict())(gpu_x, *coords))]
return False
......@@ -1131,11 +1131,9 @@ def local_gpu_advanced_incsubtensor1(node):
compute_capability = device_properties(active_device_no)['major']
if (compute_capability < 2 or y.ndim != 2 or x.ndim != 2):
gpu_op = GpuAdvancedIncSubtensor1(
set_instead_of_inc=set_instead_of_inc)
gpu_op = GpuAdvancedIncSubtensor1(**node.op._props_dict())
else:
gpu_op = GpuAdvancedIncSubtensor1_dev20(
set_instead_of_inc=set_instead_of_inc)
gpu_op = GpuAdvancedIncSubtensor1_dev20(**node.op._props_dict())
return [gpu_op(as_cuda_ndarray_variable(x),
as_cuda_ndarray_variable(y), *coords)]
......@@ -1171,11 +1169,9 @@ def local_gpu_advanced_incsubtensor1(node):
active_device_no = theano.sandbox.cuda.active_device_number()
compute_capability = device_properties(active_device_no)['major']
if (compute_capability < 2 or y.ndim != 2 or x.ndim != 2):
gpu_op = GpuAdvancedIncSubtensor1(
set_instead_of_inc=set_instead_of_inc)
gpu_op = GpuAdvancedIncSubtensor1(**node.op._props_dict())
else:
gpu_op = GpuAdvancedIncSubtensor1_dev20(
set_instead_of_inc=set_instead_of_inc)
gpu_op = GpuAdvancedIncSubtensor1_dev20(**node.op._props_dict())
return [host_from_gpu(gpu_op(gpu_x, gpu_y, *coords))]
return False
......@@ -1196,13 +1192,9 @@ def local_gpu_incsubtensor(node):
# The IncSubtensor upcast to float32 y, so we do it
# explicitly to move it to the GPU.
y = y.astype('float32')
ret = GpuIncSubtensor(
incsubt.idx_list,
inplace=incsubt.inplace,
set_instead_of_inc=incsubt.set_instead_of_inc)(
as_cuda_ndarray_variable(x),
as_cuda_ndarray_variable(y),
*coords)
ret = GpuIncSubtensor(**incsubt._props_dict())(as_cuda_ndarray_variable(x),
as_cuda_ndarray_variable(y),
*coords)
ret.tag.nan_guard_mode_check = getattr(
host_output.tag, 'nan_guard_mode_check', True)
return [ret]
......@@ -1229,10 +1221,7 @@ def local_gpu_incsubtensor(node):
y = tensor.cast(y, 'float32')
gpu_y = as_cuda_ndarray_variable(y)
if go_gpu:
ret = GpuIncSubtensor(
node.op.idx_list, inplace=node.op.inplace,
set_instead_of_inc=node.op.set_instead_of_inc)(
gpu_x, gpu_y, *coords)
ret = GpuIncSubtensor(**node.op._props_dict())(gpu_x, gpu_y, *coords)
val = getattr(node.outputs[0].tag, 'nan_guard_mode_check', True)
ret.tag.nan_guard_mode_check = val
......@@ -2690,7 +2679,7 @@ def gpu_sparse_block_outer_opt(node):
inputs = _clear_host_from_gpu(node.inputs)
return [host_from_gpu(GpuSparseBlockOuter(node.op.inplace)(*inputs))]
return [host_from_gpu(GpuSparseBlockOuter()(*inputs))]
elif isinstance(node.op, GpuFromHost) and \
_owner_isinstance(node.inputs[0], SparseBlockOuter):
......@@ -2698,7 +2687,7 @@ def gpu_sparse_block_outer_opt(node):
meta_node = node.inputs[0].owner
inputs = _clear_host_from_gpu(meta_node.inputs)
return [GpuSparseBlockOuter(meta_node.op.inplace)(*inputs)]
return [GpuSparseBlockOuter()(*inputs)]
@local_optimizer([GpuSparseBlockGemv], inplace=True)
......
......@@ -3517,7 +3517,7 @@ def transpose(x, axes=None):
"""
if axes is None:
axes = list(range((x.ndim - 1), -1, -1))
ret = DimShuffle(x.broadcastable, axes, inplace=False)(x)
ret = DimShuffle(x.broadcastable, axes)(x)
if x.name and axes == list(range((x.ndim - 1), -1, -1)):
ret.name = x.name + '.T'
return ret
......
......@@ -73,8 +73,7 @@ class DimShuffle(Op):
list can either be an index or 'x'. Indices must be encoded
as python integers, not theano symbolic integers.
inplace : bool, optional
If True, the output will be a view of the input.
If False (default), the output will be a copy of the input.
If True (default), the output will be a view of the input.
Note
----
......@@ -134,13 +133,17 @@ class DimShuffle(Op):
_f16_ok = True
check_input = False
__props__ = ("input_broadcastable", "new_order", "inplace")
def __init__(self, input_broadcastable, new_order, inplace=False):
def __init__(self, input_broadcastable, new_order, inplace=True):
input_broadcastable = tuple(input_broadcastable)
self.input_broadcastable = input_broadcastable
new_order = tuple(new_order)
self.new_order = new_order
self.inplace = inplace
if inplace is True:
self.inplace = inplace
else:
raise ValueError("DimShuffle is inplace by default and hence the inplace for DimShuffle must be true")
for i, j in enumerate(new_order):
if j != 'x':
......@@ -186,17 +189,6 @@ class DimShuffle(Op):
if self.inplace:
self.view_map = {0: [0]}
self._rehash()
def __getstate__(self):
d = dict(self.__dict__)
del d['_hashval']
return d
def __setstate__(self, d):
self.__dict__.update(d)
self._rehash()
def make_node(self, _input):
input = as_tensor_variable(_input)
ib = tuple(input.type.broadcastable)
......@@ -227,23 +219,6 @@ class DimShuffle(Op):
return Apply(self, [input], [output])
def __eq__(self, other):
# it's probably not necessary to compare input_broadcastable
return type(self) == type(other) \
and self.inplace == other.inplace \
and self.new_order == other.new_order \
and self.input_broadcastable == other.input_broadcastable
def _rehash(self):
# Cache a hash over the same fields __eq__ compares (plus class
# name/module) so repeated hashing during graph optimization is cheap.
self._hashval = (hash(type(self).__name__) ^
hash(type(self).__module__) ^
hash(self.inplace) ^
hash(self.new_order) ^
hash(self.input_broadcastable))
def __hash__(self):
return self._hashval
def __str__(self):
if self.inplace:
return "InplaceDimShuffle{%s}" % ",".join(str(x)
......@@ -564,8 +539,7 @@ second dimension
# TODO: use LComplete instead
args.append(dim_shuffle(
input.type.broadcastable,
['x'] * difference + list(range(length)),
inplace=False)(input))
['x'] * difference + list(range(length)))(input))
inputs = args
# HERE: all the broadcast dims have the same length now
......@@ -798,7 +772,8 @@ second dimension
# dimensions
res = theano.tensor.constant(numpy.asarray(r.data),
dtype=r.type.dtype)
return DimShuffle((), ['x'] * nd, inplace=False)(res)
return DimShuffle((), ['x'] * nd)(res)
new_r = Elemwise(node.op, {})(
*[transform(ipt) for ipt in node.inputs])
return new_r
......
......@@ -600,8 +600,7 @@ def local_dimshuffle_lift(node):
new_inputs = []
for inp in inode.inputs:
new_inp = op.__class__(inp.type.broadcastable,
op.new_order,
op.inplace)(inp)
op.new_order)(inp)
new_inputs.append(apply_local_dimshuffle_lift(new_inp))
copy_stack_trace(node.outputs[0], new_inputs)
ret = inode.op(*new_inputs, **dict(return_list=True))
......@@ -609,14 +608,12 @@ def local_dimshuffle_lift(node):
if inode and isinstance(inode.op, DimShuffle):
new_order = [x == 'x' and 'x' or inode.op.new_order[x] for x in
new_order]
inplace = op.inplace and inode.op.inplace
input = inode.inputs[0]
if is_dimshuffle_useless(new_order, input):
return [input]
elif inode and isinstance(inode.op, DimShuffle):
ret = op.__class__(input.type.broadcastable, new_order,
inplace)(input)
ret = op.__class__(input.type.broadcastable, new_order)(input)
ret = apply_local_dimshuffle_lift(ret)
copy_stack_trace(node.outputs[0], ret)
return [ret]
......@@ -659,7 +656,7 @@ def local_useless_dimshuffle_in_reshape(node):
@register_canonicalize
@gof.local_optimizer([T.DimShuffle])
@gof.local_optimizer([DimShuffle])
def local_lift_transpose_through_dot(node):
"""
dot(x,y).T -> dot(y.T, x.T)
......@@ -688,40 +685,14 @@ def local_lift_transpose_through_dot(node):
copy_stack_trace(node.inputs[0], ret)
return ret
@gof.local_optimizer([DimShuffle])
def dimshuffle_as_view(node):
# Rewrite a copying DimShuffle into its inplace (view) variant.
# (This MR deletes this optimizer because DimShuffle becomes a view by
# default, making the rewrite unnecessary.)
op = node.op
# Skip nodes that are not DimShuffle or are already inplace.
if not isinstance(op, DimShuffle) or op.inplace:
return False
new_op = op.__class__(op.input_broadcastable, op.new_order, inplace=True)
v = new_op(*node.inputs)
# Preserve debugging provenance on the replacement output.
copy_stack_trace(node.outputs[0], v)
return [v]
# Step 60 is the inplace optimization stage.
compile.optdb.register('dimshuffle_as_view',
TopoOptimizer(
dimshuffle_as_view,
failure_callback=TopoOptimizer.warn_inplace),
60,
'fast_run', 'inplace')
register_canonicalize(local_dimshuffle_lift)
register_specialize(local_dimshuffle_lift)
@register_canonicalize
@gof.local_optimizer([T.DimShuffle])
def local_dimshuffle_no_inplace_at_canonicalize(node):
# At canonicalization, strip the inplace flag from DimShuffle so the
# canonical graph is side-effect free; the inplace form is reintroduced
# later (step 60) by the inplace optimization stage.
# NOTE(review): no explicit return for non-matching nodes — local
# optimizers treat the implicit None as "no replacement".
if isinstance(node.op, T.DimShuffle) and node.op.inplace:
return [T.DimShuffle(node.op.input_broadcastable,
node.op.new_order, inplace=False)(node.inputs[0])]
######################
# Casting operations #
######################
@register_canonicalize
@register_specialize
@gof.local_optimizer([T.TensorFromScalar])
......
......@@ -117,7 +117,7 @@ class test_dimshuffle_lift(unittest.TestCase):
x, y, z = inputs()
e = ds(ds(x, (1, 0)), (1, 0))
g = FunctionGraph([x], [e])
self.assertTrue(str(g) == "[DimShuffle{1,0}(DimShuffle{1,0}(x))]")
self.assertTrue(str(g) == "[InplaceDimShuffle{1,0}(InplaceDimShuffle{1,0}(x))]")
dimshuffle_lift.optimize(g)
self.assertTrue(str(g) == "[x]")
# no need to check_stack_trace as graph is supposed to be empty
......@@ -126,11 +126,10 @@ class test_dimshuffle_lift(unittest.TestCase):
x, y, z = inputs()
e = ds(ds(x, (1, 'x', 0)), (2, 0, 'x', 1))
g = FunctionGraph([x], [e])
self.assertTrue(
str(g) == "[DimShuffle{2,0,x,1}(DimShuffle{1,x,0}(x))]",
str(g))
self.assertTrue(str(g) == "[InplaceDimShuffle{2,0,x,1}(InplaceDimShuffle{1,x,0}(x))]",
str(g))
dimshuffle_lift.optimize(g)
self.assertTrue(str(g) == "[DimShuffle{0,1,x,x}(x)]", str(g))
self.assertTrue(str(g) == "[InplaceDimShuffle{0,1,x,x}(x)]", str(g))
# Check stacktrace was copied over correctly after opt was applied
self.assertTrue(check_stack_trace(g, ops_to_check='all'))
......@@ -138,10 +137,9 @@ class test_dimshuffle_lift(unittest.TestCase):
x, y, z = inputs()
e = ds(ds(ds(x, (0, 'x', 1)), (2, 0, 'x', 1)), (1, 0))
g = FunctionGraph([x], [e])
self.assertTrue(
str(g) == "[DimShuffle{1,0}(DimShuffle{2,0,x,1}"
"(DimShuffle{0,x,1}(x)))]",
str(g))
self.assertTrue(str(g) == "[InplaceDimShuffle{1,0}(InplaceDimShuffle{2,0,x,1}"
"(InplaceDimShuffle{0,x,1}(x)))]",
str(g))
dimshuffle_lift.optimize(g)
self.assertTrue(str(g) == "[x]", str(g))
# no need to check_stack_trace as graph is supposed to be empty
......@@ -179,24 +177,22 @@ class test_dimshuffle_lift(unittest.TestCase):
m = T.matrix(dtype="float64")
out = ((v + 42) * (m + 84)).T
g = FunctionGraph([v, m], [out])
init_str_g = ("[DimShuffle{1,0}(Elemwise{mul,no_inplace}"
"(DimShuffle{x,0}(Elemwise{add,no_inplace}"
init_str_g = ("[InplaceDimShuffle{1,0}(Elemwise{mul,no_inplace}"
"(InplaceDimShuffle{x,0}(Elemwise{add,no_inplace}"
"(<TensorType(float64, vector)>, "
"DimShuffle{x}(TensorConstant{42}))), "
"InplaceDimShuffle{x}(TensorConstant{42}))), "
"Elemwise{add,no_inplace}"
"(<TensorType(float64, matrix)>, "
"DimShuffle{x,x}(TensorConstant{84}))))]")
"InplaceDimShuffle{x,x}(TensorConstant{84}))))]")
self.assertTrue(str(g) == init_str_g)
new_out = local_dimshuffle_lift.transform(g.outputs[0].owner)[0]
new_g = FunctionGraph(g.inputs, [new_out])
opt_str_g = ("[Elemwise{mul,no_inplace}(Elemwise{add,no_inplace}"
"(DimShuffle{0,x}(<TensorType(float64, vector)>), "
"DimShuffle{x,x}(TensorConstant{42})), "
"Elemwise{add,no_inplace}(DimShuffle{1,0}"
"(InplaceDimShuffle{0,x}(<TensorType(float64, vector)>), "
"InplaceDimShuffle{x,x}(TensorConstant{42})), "
"Elemwise{add,no_inplace}(InplaceDimShuffle{1,0}"
"(<TensorType(float64, matrix)>), "
"DimShuffle{x,x}(TensorConstant{84})))]")
"InplaceDimShuffle{x,x}(TensorConstant{84})))]")
self.assertTrue(str(new_g) == opt_str_g)
# Check stacktrace was copied over correctly after opt was applied
self.assertTrue(check_stack_trace(new_g, ops_to_check='all'))
......@@ -205,7 +201,7 @@ class test_dimshuffle_lift(unittest.TestCase):
x, _, _ = inputs()
e = ds(x, (0, 1))
g = FunctionGraph([x], [e])
self.assertTrue(str(g) == "[DimShuffle{0,1}(x)]")
self.assertTrue(str(g) == "[InplaceDimShuffle{0,1}(x)]")
dimshuffle_lift.optimize(g)
self.assertTrue(str(g) == "[x]")
# Check stacktrace was copied over correctly after opt was applied
......@@ -219,9 +215,9 @@ class test_dimshuffle_lift(unittest.TestCase):
ds_z = ds(z, (2, 1, 0)) # usefull
ds_u = ds(u, ('x')) # usefull
g = FunctionGraph([x, y, z, u], [ds_x, ds_y, ds_z, ds_u])
self.assertTrue(str(g) == "[DimShuffle{0,x}(x), DimShuffle{2,1,0}(y), DimShuffle{2,1,0}(z), DimShuffle{x}(TensorConstant{1})]")
self.assertTrue(str(g) == "[InplaceDimShuffle{0,x}(x), InplaceDimShuffle{2,1,0}(y), InplaceDimShuffle{2,1,0}(z), InplaceDimShuffle{x}(TensorConstant{1})]")
dimshuffle_lift.optimize(g)
self.assertTrue(str(g) == "[x, y, DimShuffle{2,1,0}(z), DimShuffle{x}(TensorConstant{1})]")
self.assertTrue(str(g) == "[x, y, InplaceDimShuffle{2,1,0}(z), InplaceDimShuffle{x}(TensorConstant{1})]")
# Check stacktrace was copied over correctly after opt was applied
self.assertTrue(hasattr(g.outputs[0].tag, 'trace'))
......@@ -241,10 +237,11 @@ def test_local_useless_dimshuffle_in_reshape():
[reshape_dimshuffle_vector, reshape_dimshuffle_mat,
reshape_dimshuffle_row, reshape_dimshuffle_col])
assert_true(str(g) == "[Reshape{1}(DimShuffle{x,0}(vector), Shape(vector)), "
"Reshape{2}(DimShuffle{x,0,x,1}(mat), Shape(mat)), "
"Reshape{2}(DimShuffle{1,x}(row), Shape(row)), "
"Reshape{2}(DimShuffle{0}(col), Shape(col))]")
print(str(g))
assert_true(str(g) == "[Reshape{1}(InplaceDimShuffle{x,0}(vector), Shape(vector)), "
"Reshape{2}(InplaceDimShuffle{x,0,x,1}(mat), Shape(mat)), "
"Reshape{2}(InplaceDimShuffle{1,x}(row), Shape(row)), "
"Reshape{2}(InplaceDimShuffle{0}(col), Shape(col))]")
useless_dimshuffle_in_reshape = out2in(local_useless_dimshuffle_in_reshape)
useless_dimshuffle_in_reshape.optimize(g)
assert_true(str(g) == "[Reshape{1}(vector, Shape(vector)), "
......@@ -3766,15 +3763,15 @@ class Test_local_canonicalize_alloc(unittest.TestCase):
"TensorConstant{2})]"))
alloc_lift.optimize(g)
self.assertTrue(str(g) == "[DimShuffle{x,0,1}"
self.assertTrue(str(g) == "[InplaceDimShuffle{x,0,1}"
"(Alloc(<TensorType(float64, vector)>, "
"TensorConstant{3}, "
"TensorConstant{2})), "
"DimShuffle{x,x}"
"InplaceDimShuffle{x,x}"
"(<TensorType(float64, scalar)>), "
"DimShuffle{x,0,1}"
"InplaceDimShuffle{x,0,1}"
"(Alloc(<TensorType(float64, matrix)>, "
"TensorConstant{1}, "
"TensorConstant{2})), "
......@@ -6268,9 +6265,9 @@ class Test_local_reshape_to_dimshuffle(unittest.TestCase):
reshape_lift.optimize(g)
useless_reshape.optimize(g)
self.assertTrue(str(g) == "[DimShuffle{x,0}"
self.assertTrue(str(g) == "[InplaceDimShuffle{x,0}"
"(<TensorType(float64, vector)>), "
"DimShuffle{x,0,x,1,x,x}"
"InplaceDimShuffle{x,0,x,1,x,x}"
"(Reshape{2}(<TensorType(float64, matrix)>, "
"TensorConstant{[5 6]}))]")
......@@ -6301,7 +6298,7 @@ class Test_lift_transpose_through_dot(unittest.TestCase):
def test_matrix_matrix(self):
a, b = matrices('ab')
g = self.simple_optimize(FunctionGraph([a, b], [tensor.dot(a, b).T]))
sg = '[dot(DimShuffle{1,0}(b), DimShuffle{1,0}(a))]'
sg = '[dot(InplaceDimShuffle{1,0}(b), InplaceDimShuffle{1,0}(a))]'
assert str(g) == sg, (str(g), sg)
# Check stacktrace was copied over correctly after opt was applied
self.assertTrue(check_stack_trace(g, ops_to_check='all'))
......@@ -6313,7 +6310,7 @@ class Test_lift_transpose_through_dot(unittest.TestCase):
[a, b],
[tensor.dot(a.dimshuffle('x', 0), b).T]),
level='stabilize')
sg = '[dot(DimShuffle{1,0}(b), DimShuffle{0,x}(a))]'
sg = '[dot(InplaceDimShuffle{1,0}(b), InplaceDimShuffle{0,x}(a))]'
assert str(g) == sg, (str(g), sg)
# Check stacktrace was copied over correctly after opt was applied
self.assertTrue(check_stack_trace(g, ops_to_check='all'))
......@@ -6325,7 +6322,7 @@ class Test_lift_transpose_through_dot(unittest.TestCase):
[a, b],
[tensor.dot(b, a.dimshuffle(0, 'x')).T]),
level='stabilize')
sg = '[dot(DimShuffle{x,0}(a), DimShuffle{1,0}(b))]'
sg = '[dot(InplaceDimShuffle{x,0}(a), InplaceDimShuffle{1,0}(b))]'
assert str(g) == sg, (str(g), sg)
# Check stacktrace was copied over correctly after opt was applied
self.assertTrue(check_stack_trace(g, ops_to_check='all'))
......
......@@ -321,10 +321,10 @@ def test_scan_debugprint1():
| | | | | |Subtensor{int64} [id H] ''
| | | | | |Shape [id I] ''
| | | | | | |Rebroadcast{0} [id J] ''
| | | | | | |DimShuffle{x,0} [id K] ''
| | | | | | |InplaceDimShuffle{x,0} [id K] ''
| | | | | | |Elemwise{second,no_inplace} [id L] ''
| | | | | | |A [id M]
| | | | | | |DimShuffle{x} [id N] ''
| | | | | | |InplaceDimShuffle{x} [id N] ''
| | | | | | |TensorConstant{1.0} [id O]
| | | | | |Constant{0} [id P]
| | | | |Subtensor{int64} [id Q] ''
......@@ -490,7 +490,7 @@ def test_scan_debugprint3():
for{cpu,scan_fn} [id B] ''
>Elemwise{mul,no_inplace} [id Y] ''
> |DimShuffle{x} [id Z] ''
> |InplaceDimShuffle{x} [id Z] ''
> | |coefficients[t] [id BA] -> [id S]
> |Elemwise{pow,no_inplace} [id BB] ''
> |Subtensor{int64} [id BC] ''
......@@ -504,10 +504,10 @@ def test_scan_debugprint3():
> | | | | | | |Subtensor{int64} [id BJ] ''
> | | | | | | |Shape [id BK] ''
> | | | | | | | |Rebroadcast{0} [id BL] ''
> | | | | | | | |DimShuffle{x,0} [id BM] ''
> | | | | | | | |InplaceDimShuffle{x,0} [id BM] ''
> | | | | | | | |Elemwise{second,no_inplace} [id BN] ''
> | | | | | | | |A_copy [id BO] -> [id W]
> | | | | | | | |DimShuffle{x} [id BP] ''
> | | | | | | | |InplaceDimShuffle{x} [id BP] ''
> | | | | | | | |TensorConstant{1.0} [id BQ]
> | | | | | | |Constant{0} [id BR]
> | | | | | |Subtensor{int64} [id BS] ''
......@@ -520,7 +520,7 @@ def test_scan_debugprint3():
> | | | |A_copy [id BO] -> [id W]
> | | |Constant{1} [id BW]
> | |Constant{-1} [id BX]
> |DimShuffle{x} [id BY] ''
> |InplaceDimShuffle{x} [id BY] ''
> |<TensorType(int64, scalar)> [id BZ] -> [id U]
for{cpu,scan_fn} [id BE] ''
......@@ -636,10 +636,10 @@ def test_scan_debugprint5():
| | | | | | | |Subtensor{int64} [id K] ''
| | | | | | | |Shape [id L] ''
| | | | | | | | |Rebroadcast{0} [id M] ''
| | | | | | | | |DimShuffle{x,0} [id N] ''
| | | | | | | | |InplaceDimShuffle{x,0} [id N] ''
| | | | | | | | |Elemwise{second,no_inplace} [id O] ''
| | | | | | | | |A [id P]
| | | | | | | | |DimShuffle{x} [id Q] ''
| | | | | | | | |InplaceDimShuffle{x} [id Q] ''
| | | | | | | | |TensorConstant{1.0} [id R]
| | | | | | | |Constant{0} [id S]
| | | | | | |Subtensor{int64} [id T] ''
......@@ -675,20 +675,20 @@ def test_scan_debugprint5():
| | | | | |k [id G]
| | | | | |IncSubtensor{Set;:int64:} [id H] ''
| | | | | |A [id P]
| | | | |DimShuffle{x,x} [id BP] ''
| | | | |InplaceDimShuffle{x,x} [id BP] ''
| | | | |TensorConstant{0.0} [id BQ]
| | | |IncSubtensor{Inc;int64} [id BR] ''
| | | | |Elemwise{second,no_inplace} [id BS] ''
| | | | | |Subtensor{int64::} [id BT] ''
| | | | | | |for{cpu,scan_fn} [id BO] ''
| | | | | | |Constant{1} [id BU]
| | | | | |DimShuffle{x,x} [id BV] ''
| | | | | |InplaceDimShuffle{x,x} [id BV] ''
| | | | | |TensorConstant{0.0} [id BQ]
| | | | |Elemwise{second} [id BW] ''
| | | | | |Subtensor{int64} [id BX] ''
| | | | | | |Subtensor{int64::} [id BT] ''
| | | | | | |Constant{-1} [id BY]
| | | | | |DimShuffle{x} [id BZ] ''
| | | | | |InplaceDimShuffle{x} [id BZ] ''
| | | | | |Elemwise{second,no_inplace} [id CA] ''
| | | | | |Sum{acc_dtype=float64} [id CB] ''
| | | | | | |Subtensor{int64} [id BX] ''
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论