Commit a1a2a394 authored by James Bergstra

Moved most of ShapeOptimizer -> ShapeFeature.

An Env feature persists throughout the life of the env, so many optimizations can take advantage of the Shape analysis done by ShapeFeature.
Parent 371e781f
...@@ -44,6 +44,22 @@ def _fill_chain(new_out, orig_inputs): ...@@ -44,6 +44,22 @@ def _fill_chain(new_out, orig_inputs):
new_out = T.fill(i, new_out) new_out = T.fill(i, new_out)
return [new_out] return [new_out]
def encompasses_broadcastable(b1, b2):
    """
    Returns True if the broadcastable patterns b1 and b2 are such that b2 is
    broadcasted to b1's shape and not the opposite.

    :param b1: the broadcastable attribute of a tensor type
    :param b2: the broadcastable attribute of a tensor type
    """
    # b2 cannot be broadcast up to b1 if it has more dimensions.
    if len(b2) > len(b1):
        return False
    # Align on the trailing dimensions (numpy-style broadcasting aligns
    # from the right): every dimension broadcastable in b1 must also be
    # broadcastable in b2.
    trailing = b1[len(b1) - len(b2):]
    return all((not v1) or v2 for v1, v2 in zip(trailing, b2))
def merge_broadcastables(broadcastables):
    """Combine several broadcastable patterns elementwise.

    A merged dimension is broadcastable only when that dimension is
    broadcastable in every input pattern (elementwise AND).
    """
    merged = []
    for dim_flags in zip(*broadcastables):
        merged.append(all(dim_flags))
    return merged
def get_constant_value(v): def get_constant_value(v):
"""return the constant scalar(0-D) value underlying variable `v` """return the constant scalar(0-D) value underlying variable `v`
...@@ -184,25 +200,36 @@ register_canonicalize(local_dimshuffle_lift) ...@@ -184,25 +200,36 @@ register_canonicalize(local_dimshuffle_lift)
################# #####################################
# Shape lifters # # ShapeFeature, Shape optimizations
################# #####################################
class MakeVector(T.Op): class MakeVector(T.Op):
"""Concatenate a number of scalars together into a vector """Concatenate a number of scalars together into a vector
This is a simple version of stack() that introduces far less cruft into the graph. This is a simple version of stack() that introduces far less cruft into the graph.
""" """
def __init__(self, dtype='int64'):
    # dtype used for the output vector when there are no inputs to
    # infer it from (see make_node); defaults to int64 since this Op
    # is mainly used to build shape vectors.
    self.dtype = dtype
def __eq__(self, other):
    """Two MakeVector instances are interchangeable iff they share a dtype."""
    if type(other) is not type(self):
        return False
    return self.dtype == other.dtype
def __hash__(self):
    """Hash consistent with __eq__: mixes the op class and its dtype."""
    # XOR is commutative, so this equals the original ordering.
    return hash(self.dtype) ^ hash(type(self))
def make_node(self, *inputs): def make_node(self, *inputs):
inputs = map(T.as_tensor_variable, inputs) inputs = map(T.as_tensor_variable, inputs)
if not all(a.type == inputs[0].type for a in inputs): if not all(a.type == inputs[0].type for a in inputs):
raise TypeError('This MakeVector instance requires inputs of same type %s' % raise TypeError('This MakeVector instance requires inputs of same type %s' %
inputs[0].type) inputs[0].type)
if inputs:
dtype = inputs[0].type.dtype
else:
dtype = self.dtype
#bcastable = (len(inputs) == 1) #bcastable = (len(inputs) == 1)
bcastable = False bcastable = False
otype = T.TensorType( otype = T.TensorType(
broadcastable=(bcastable,), broadcastable=(bcastable,),
dtype=inputs[0].type.dtype) dtype=dtype)
return T.Apply(self, inputs, [otype()]) return T.Apply(self, inputs, [otype()])
def __str__(self): def __str__(self):
return self.__class__.__name__ return self.__class__.__name__
...@@ -241,29 +268,20 @@ class Shape_i(T.Op): ...@@ -241,29 +268,20 @@ class Shape_i(T.Op):
raise TypeError('x has too few dimensions for Shape_i', (x, self.i)) raise TypeError('x has too few dimensions for Shape_i', (x, self.i))
return T.Apply(self, [x], [T.lscalar()]) return T.Apply(self, [x], [T.lscalar()])
def perform(self, node, (x, ), (out, )): def perform(self, node, (x, ), (out, )):
out[0] = theano._asarray(x.shape, dtype = 'int64')[self.i] out[0] = theano._asarray(x.shape[self.i], dtype = 'int64')
def grad(self, (x,), (gz,)): def grad(self, (x,), (gz,)):
return [None] return [None]
lscalar_one = T.constant(1, dtype='int64') class ShapeFeature(object):
assert lscalar_one.type == T.lscalar
def shape_i(i):
def op_deco(r):
if r.type.broadcastable[i]:
return lscalar_one
else:
return Shape_i(i)(r)
return op_deco
class ShapeOptimizer(Optimizer):
"""Graph optimizer for removing all calls to shape() """Graph optimizer for removing all calls to shape()
This optimizer replaces all Shapes and Subtensors of Shapes with Shape_i and MakeVector This optimizer replaces all Shapes and Subtensors of Shapes with Shape_i and MakeVector
Ops. Ops.
This optimizer has two goals: This optimizer has several goals:
1. to 'lift' Shapes to as close to the inputs as possible. 1. to 'lift' Shapes to as close to the inputs as possible.
2. to infer the shape of every node in the graph in terms of the input shapes. 2. to infer the shape of every node in the graph in terms of the input shapes.
3. remove all fills (T.second, T.fill) from the graph
Lifting shapes as close to the inputs as possible is important for canonicalization because Lifting shapes as close to the inputs as possible is important for canonicalization because
it is very bad form to have to compute something just to know how big it will be. Firstly, it is very bad form to have to compute something just to know how big it will be. Firstly,
...@@ -302,24 +320,32 @@ class ShapeOptimizer(Optimizer): ...@@ -302,24 +320,32 @@ class ShapeOptimizer(Optimizer):
""" """
def __init__(self): def shape_i(self, i):
Optimizer.__init__(self) def op_deco(r):
if r.type.broadcastable[i]:
def add_requirements(self, env): return self.lscalar_one
env.extend(toolbox.ReplaceValidate()) else:
return Shape_i(i)(r)
return op_deco
def apply(self, env): def shape_tuple(self, r):
shape_of = {} # Variable -> tuple(scalars) or None (All tensor vars map to tuple) return tuple([self.shape_i(i)(r) for i in xrange(r.ndim)])
def new_shape_from_r(r): def default_infer_shape(self, node, i_shapes):
return tuple([shape_i(i)(r) for i in xrange(r.ndim)]) rval = []
for r in node.outputs:
try:
rval.append(self.shape_tuple(r))
except AttributeError:
rval.append(None)
return rval
def unpack(s_i): def unpack(self, s_i):
# unpack the s_i that the Op returned # unpack the s_i that the Op returned
assert s_i is not None assert s_i is not None
if s_i == 1: if s_i == 1:
# don't make the optimizer merge a zillion ones together # don't make the optimizer merge a zillion ones together
return lscalar_one return self.lscalar_one
if type(s_i) is int: if type(s_i) is int:
# this shape is a constant # this shape is a constant
assert s_i >= 0 assert s_i >= 0
...@@ -337,46 +363,56 @@ class ShapeOptimizer(Optimizer): ...@@ -337,46 +363,56 @@ class ShapeOptimizer(Optimizer):
else: else:
raise TypeError('Unsupported shape element', s_i) raise TypeError('Unsupported shape element', s_i)
def set_shape(r, s): def set_shape(self, r, s):
assert r not in shape_of assert r not in self.shape_of
if s is None: if s is None:
shape_of[r] = s self.shape_of[r] = s
else: else:
shape_of[r] = tuple([unpack(s_i) for s_i in s]) self.shape_of[r] = tuple([self.unpack(s_i) for s_i in s])
def default_infer_shape(node, i_shapes): def make_vector_shape(self, r):
rval = [] return make_vector(*self.shape_of[r])
for r in node.outputs: #
try: #
rval.append(new_shape_from_r(r)) # Feature inteface
except AttributeError: #
rval.append(None) #
return rval def on_attach(self, env):
assert not hasattr(env, 'shape_feature')
env.shape_feature = self
self.shape_of = {} # Variable -> tuple(scalars) or None (All tensor vars map to tuple)
self.lscalar_one = T.constant(1, dtype='int64')
assert self.lscalar_one.type == T.lscalar
for node in env.toposort():
self.on_import(env, node)
def on_import(self, env, node):
if node.outputs[0] in self.shape_of:
# this is a revert, not really an import
for r in node.outputs + node.inputs:
assert r in self.shape_of
return
# Do a feed-forward shape-inference pass through the entire graph
# This builds the shape_of dictionary.
nodelist = list(env.toposort())
for node in nodelist:
for i, r in enumerate(node.inputs): for i, r in enumerate(node.inputs):
# make sure we have shapes for the inputs # make sure we have shapes for the inputs
if r not in shape_of: if r not in self.shape_of:
try: try:
set_shape(r, new_shape_from_r(r)) self.set_shape(r, self.shape_tuple(r))
except AttributeError: except AttributeError:
set_shape(r, None ) # not a TensorType variable self.set_shape(r, None ) # not a TensorType variable
try: try:
shape_infer = node.op.infer_shape shape_infer = node.op.infer_shape
except AttributeError: except AttributeError:
shape_infer = default_infer_shape shape_infer = self.default_infer_shape
try: try:
o_shapes = shape_infer(node, [shape_of[r] for r in node.inputs]) o_shapes = shape_infer(node, [self.shape_of[r] for r in node.inputs])
except Exception, e: except Exception, e:
_logger.error('Failed to infer_shape from Op %s (i_shapes=%s): %s %s'% (node.op, _logger.error('Failed to infer_shape from Op %s (i_shapes=%s): %s %s'% (node.op,
[shape_of[r] for r in node.inputs], [self.shape_of[r] for r in node.inputs],
type(e), str(e))) type(e), str(e)))
o_shapes = default_infer_shape(node, [shape_of[r] for r in node.inputs]) o_shapes = default_infer_shape(node, [self.shape_of[r] for r in node.inputs])
# this is packed information # this is packed information
# an element of o_shapes is either None or a tuple # an element of o_shapes is either None or a tuple
...@@ -385,117 +421,100 @@ class ShapeOptimizer(Optimizer): ...@@ -385,117 +421,100 @@ class ShapeOptimizer(Optimizer):
assert len(o_shapes) == len(node.outputs) assert len(o_shapes) == len(node.outputs)
for r, s in zip(node.outputs, o_shapes): for r, s in zip(node.outputs, o_shapes):
set_shape(r, s) self.set_shape(r, s)
# replace all shape -> make_vector
shapes_in_graph = True
while shapes_in_graph:
shapes_in_graph = False
# we do this multiple times because
# some of the shape_of expressions might be
# expressed in terms of shape
nodelist = list(env.toposort())
for node in nodelist:
if node.op == T._shape:
shapes_in_graph = True
env.replace_validate(node.outputs[0],
make_vector(*shape_of[node.inputs[0]]),
reason='ShapeOptimizer [phase 1]')
def on_change_input(self, env, mode, i, r, new_r):
# TODO:
# This tells us that r and new_r must have the same shape
# if we didn't know that the shapes are related, now we do.
pass
# replace all subtensor(make_vector) like: class ShapeOptimizer(Optimizer):
# [a,b,c][0] -> a """Optimizer that serves to add ShapeFeature as an env feature.
# [a,b,c][0:2] -> [a,b] """
# we can do this for constant indexes def __init__(self):
nodelist = list(env.toposort()) Optimizer.__init__(self)
for node in nodelist:
if isinstance(node.op, T.Subtensor):
x = node.inputs[0]
if x.owner and x.owner.op == make_vector:
idxlist = node.op.idx_list
if len(idxlist) != 1: def add_requirements(self, env):
continue env.extend(ShapeFeature())
idx = idxlist[0] def apply(self, env):
if isinstance(idx, int): pass
env.replace_validate(node.outputs[0],
x.owner.inputs[idx],
reason='ShapeOptimizer [phase 2 a]')
else:
env.replace_validate(node.outputs[0],
make_vector(*x.owner.inputs.__getslice__(idx)),
reason='ShapeOptimizer [phase 2 b]')
# -1 should make it run right before the first merge # -1 should make it run right before the first merge
theano.compile.mode.optdb.register('ShapeOpt', ShapeOptimizer(), -1, 'fast_run', 'fast_compile') theano.compile.mode.optdb.register('ShapeOpt', ShapeOptimizer(), -1, 'fast_run', 'fast_compile')
################ @register_specialize
# Fill lifters # @register_canonicalize
################ @gof.local_optimizer([T.fill])
def local_fill_to_alloc(node):
def encompasses_broadcastable(b1, b2): """fill(s,v) -> alloc(v, shape(s))
"""
Returns True if the broadcastable patterns b1 and b2 are such that b2 is
broadcasted to b1's shape and not the opposite.
:param b1: the broadcastable attribute of a tensor type
:param b2: the broadcastable attribute of a tensor type
"""
if len(b1) < len(b2):
return False
b1 = b1[-len(b2):]
return not any(v1 and not v2 for v1, v2 in zip(b1, b2))
def merge_broadcastables(broadcastables):
return [all(bcast) for bcast in zip(*broadcastables)]
@gof.local_optimizer([T.fill, None]) This is an important optimization because with the shape_to_shape_i optimization, the
def local_fill_lift(node): dependency on 's' is often removed.
"""
fill(f(a), b) -> fill(a, b)
If a.type == f(a).type.
fill(a, b) -> b
If a.type == b.type.
""" """
if not opt.check_chain(node, T.fill): if node.op == T.fill:
return False r, v = node.inputs
if v.type == node.outputs[0].type:
model, filling = node.inputs # this is a useless fill, erase it.
rval = [v]
mb, fb = model.type.broadcastable, filling.type.broadcastable elif v.type.broadcastable == node.outputs[0].type.broadcastable:
if model.type.dtype == filling.type.dtype and encompasses_broadcastable(fb, mb): # this is a cast
return False# [filling] rval = [T.cast(v, node.outputs[0].type.dtype)]
else:
parent = model.owner # we are broadcasting v somehow
if parent is None or not isinstance(parent, T.Elemwise): shape_of = node.env.shape_feature.shape_of
return False # TODO: cut out un-necessary dimshuffles of v
for input in parent.inputs: rval = [T.Alloc(node.outputs[0].dtype)(v, *shape_of[node.outputs[0]])]
if input.type == model.type: assert rval[0].type == node.outputs[0].type
return [T.fill(input, filling)] return rval
return False
register_canonicalize(local_fill_lift, 'fill_lift')
register_specialize(local_fill_lift, 'fill_lift')
@register_specialize @register_specialize
@register_canonicalize @register_canonicalize
@gof.local_optimizer([T.fill]) @gof.local_optimizer([T._shape])
def local_fill_useless(node): def local_shape_to_shape_i(node):
"""fill(y,x) -> x if node.op == T._shape:
shape_feature = node.env.shape_feature
return [shape_feature.make_vector_shape(node.inputs[0])]
This is legal when the output of fill has the same type as x, @register_specialize
because it means that y isn't contributing anything. @register_canonicalize
""" @gof.local_optimizer([T.Subtensor])
if node.op == T.fill: def local_subtensor_make_vector(node):
shape, val = node.inputs # replace all subtensor(make_vector) like:
output, = node.outputs # [a,b,c][0] -> a
if output.type == val.type: # [a,b,c][0:2] -> [a,b]
# if shape is not being used to broadcast # we can do this for constant indexes
# then we can ignore it. if isinstance(node.op, T.Subtensor):
return [val] shape_feature = node.env.shape_feature
x = node.inputs[0]
if x.owner and x.owner.op == make_vector:
try:
idx, = node.op.idx_list
except:
#'how can you have multiple indexes into a shape?'
raise
if isinstance(idx, int):
return [x.owner.inputs[idx]]
elif isinstance(idx, T.TensorVariable):
# if it is a constant we can do something with it
try:
v = get_constant_value(idx)
return [x.owner.inputs[v]]
except:
pass
else:
# it is a slice of ints and/or Variables
#TODO: check subtensor to see if it can contain constant variables,
# and if it can, then try to unpack them.
try:
return [make_vector(*x.owner.inputs.__getitem__(idx))]
except TypeError:
pass
except:
_logger.error('failed to index with "%s"' % str(idx))
raise
################## ##################
# Subtensor opts # # Subtensor opts #
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论