Updated local_alloc_elemwise to remove all allocs when possible and to add…

Updated local_alloc_elemwise to remove all allocs when possible and to add assert only when we cannot determine whether the shapes will be correct. Deprecated experimental.local_alloc_elemwise and enabled local_alloc_elemwise by default.

Updated local_alloc_elemwise to remove all allocs when possible and to add…
f9fc5dfd · Dustin Webb · 795ded70 · f9fc5dfd
--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -1529,6 +1529,7 @@ def local_remove_useless_assert(node):
            return [assert_(node.inputs[0], *cond)]
+@register_specialize
 @gof.local_optimizer([T.Elemwise])
 def local_alloc_elemwise(node):
    """
@@ -1536,7 +1537,7 @@ def local_alloc_elemwise(node):
      -> elemwise(x, y.TensorType(no broadcast flag))
    elemwise(dimshuffle(alloc(x, shp)),... ,y.TensorType(BROADCAST CONDITION))
-      -> elemwise(x, y.TensorType(no broadcast flag))
+      -> elemwise(x.dimshuffle(...), y.TensorType(no broadcast flag))
    BROADCAST CONDITION: the condition is that the one input that are
    not to be optimized to have the same broadcast pattern as the
@@ -1548,99 +1549,124 @@ def local_alloc_elemwise(node):
    """
    if not isinstance(node.op, T.Elemwise):
        return False
    if len(node.outputs) > 1:
-        #This is a supposition this code make that I'm not sure is always true.
+        # Ensure all outputs have the same broadcast pattern
+        # This is a supposition that I'm not sure is always true.
        assert all([list(o.type.broadcastable) == list(
                    node.outputs[0].type.broadcastable) for o in
                    node.outputs[1:]])
+    # The broadcast pattern of the output must match the broadcast pattern of
+    # at least one of the inputs.
    if not any([list(i.type.broadcastable) == list(
-                node.outputs[0].type.broadcastable) for i in node.inputs]):
+        node.outputs[0].type.broadcastable) for i in node.inputs]):
        return False
-    if not any([i.owner and (isinstance(i.owner.op, T.Alloc) or \
-                             (isinstance(i.owner.op, T.DimShuffle) and
+    def dimshuffled_alloc(i):
-                              i.owner.inputs[0].owner and \
+        return (isinstance(i.owner.op, T.DimShuffle) and
-                              isinstance(i.owner.inputs[0].owner.op, T.Alloc)))
+                     i.owner.inputs[0].owner and \
+                         isinstance(i.owner.inputs[0].owner.op, T.Alloc))
+    # At least one input must have an owner that is either a T.Alloc or a
+    # T.DimShuffle with an owner that is a T.Alloc -- otherwise there is
+    # nothing to optimize.
+    if not any([i.owner
+                and (isinstance(i.owner.op, T.Alloc) or dimshuffled_alloc(i))
                for i in node.inputs]):
        return False
-    no_broad_idx = -1
+    ## Search for input that we can use as a baseline for the dimensions.
+    assert_op_idx = -1
    for idx, i in enumerate(node.inputs):
-        if not i.owner:
+        if i.type.broadcastable == node.outputs[0].type.broadcastable:
-            if list(i.type.broadcastable) == [False, ] * i.type.ndim:
+            # Prefer an input that is not a T.Alloc nor a T.DimShuffle of a
-                no_broad_idx = idx
+            # T.Alloc so that all allocs can be optimized.
+            if not (i.owner
+                    and (isinstance(i.owner.op, T.Alloc)
+                         or dimshuffled_alloc(i))):
+                assert_op_idx = idx
                break
-            else:
-                continue
-        if not any(i.type.broadcastable) and not isinstance(i.owner.op,
-                                                            T.Alloc):
-            no_broad_idx = idx
-            break
-        elif list(i.type.broadcastable) == list(
-            node.outputs[0].type.broadcastable) \
-            and not isinstance(i.owner.op, T.Alloc) \
-            and not (isinstance(i.owner.op, T.DimShuffle) and
-                     i.owner.inputs[0].owner and \
-                         isinstance(i.owner.inputs[0].owner.op, T.Alloc)):
-            no_broad_idx = idx
-            break
-    assert no_broad_idx >= 0
+    # It may be the case that only T.Allocs and T.DimShuffle of T.Allocs exist.
-    assert_op = node.inputs[no_broad_idx]
+    if assert_op_idx < 0:
+        # We want to optimize as many allocs as possible. When there is more
+        # than one then do all but one.
+        if len(node.inputs) > 1:
+            assert_op_idx = 0  # The first one is as good as any to use.
+        else:
+            # When there is only one input then we can optimize if the
+            # broadcast patterns of the input and output match.
+            i =  node.inputs[0]
+            if i.type.broadcastable == node.outputs[0].type.broadcastable:
+                new_i = []
+                if isinstance(i.owner.op, T.Alloc):
+                    new_i.append(i.owner.inputs[0])
+                elif dimshuffled_alloc(i):
+                    new_i.append(i.owner.inputs[0].owner.inputs[0])
+                assert(len(new_i) > 0)
+                return node.op(*new_i,
+                               return_list=True)
+            # Otherwise nothing can be done.
+            return False
+    assert_op = node.inputs[assert_op_idx]
    cmp_op = assert_op
-    new = []
+    new_i = []
    for i in node.inputs:
+        # Remove alloc
        if (i.owner and isinstance(i.owner.op, T.Alloc)
            and i.owner.inputs[0].type != i.owner.outputs[0].type):
            # when i.owner.inputs[0].type == i.owner.outputs[0].type we
            # will remove that alloc later
            assert i.type.ndim == cmp_op.ndim
-            if theano.config.experimental.local_alloc_elemwise_assert:
+            if (theano.config.experimental.local_alloc_elemwise_assert
+                and node.fgraph.shape_feature.same_shape(i, cmp_op)):
                assert_op = assert_(assert_op,
                                    *[T.eq(i.shape[idx], cmp_op.shape[idx])\
                                          for idx in xrange(i.type.ndim) \
                                          if not i.type.broadcastable[idx]])
-                new.append(i.owner.inputs[0])
+            new_i.append(i.owner.inputs[0])
-        elif i.owner and isinstance(i.owner.op, T.DimShuffle) \
-                and i.owner.inputs[0].owner \
+        # Remove Alloc in DimShuffle
-                and isinstance(i.owner.inputs[0].owner.op, T.Alloc):
+        elif i.owner and dimshuffled_alloc(i):
            assert i.type.ndim == cmp_op.type.ndim
-            if theano.config.experimental.local_alloc_elemwise_assert:
+            if (theano.config.experimental.local_alloc_elemwise_assert
+                and node.fgraph.shape_feature.same_shape(i, cmp_op)):
                assert_op = assert_(assert_op,
                                    *[T.eq(i.shape[idx], cmp_op.shape[idx])
                                      for idx in xrange(i.type.ndim)
                                      if not i.type.broadcastable[idx]])
-            new.append(i.owner.inputs[0].owner.inputs[0])
+            new_i.append(i.owner.inputs[0].owner.inputs[0])
        else:
-            new.append(i)
+            new_i.append(i)
-    new[no_broad_idx] = assert_op
+    new_i[assert_op_idx] = assert_op
    if theano.config.experimental.local_alloc_elemwise_assert:
        assert assert_op.owner.op is assert_
-    return [node.op(*new)]
+    return node.op(*new_i, return_list=True)
 #TODO, global optimizer that lift the assert to the beginning of the graph.
-#TODO, when all inputs can be optimized do all except one
+#TODO, optimize all inputs when possible -- currently, when all inputs have
+# an alloc, all but one are optimized.
 theano.configparser.AddConfigVar('experimental.local_alloc_elemwise',
        "If True enable the experimental optimization local_alloc_elemwise",
-        theano.configparser.BoolParam(False),
+                                 theano.configparser.BoolParam(
-        in_c_key=False)
+                                     False,
-#This version if faster but not as save.
+                                     is_valid=lambda x: return not x
+                                 ),
+                                 in_c_key=False)
+#This version is faster but not as safe.
 theano.configparser.AddConfigVar('experimental.local_alloc_elemwise_assert',
        "If False enable the experimental optimization local_alloc_elemwise"
                                 " but WITHOUT assert into the graph!",
        theano.configparser.BoolParam(True),
        in_c_key=False)
-if theano.config.experimental.local_alloc_elemwise:
-    #enabled by default when the lifter of assert is done.
-    register_specialize(local_alloc_elemwise)
-else:
-    #don't register them in fast_run by default to have them disabled
-    #by default disable them by default as we are not sure it is
-    #always a good idea to replace an alloc with multiple op.
-    compile.optdb['specialize'].register("local_alloc_elemwise",
-                                         local_alloc_elemwise)
 ############################
 # Constant Canonicalization