提交 918c51cb authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #2195 from daemonmaker/issue1903

Updated local_alloc_elemwise to remove all allocs when possible and to a...
...@@ -462,6 +462,20 @@ import theano and print the config variable, as in: ...@@ -462,6 +462,20 @@ import theano and print the config variable, as in:
Link arguments to link against a (Fortran) level-3 blas implementation. Link arguments to link against a (Fortran) level-3 blas implementation.
.. attribute:: config.experimental.local_alloc_elemwise_assert
Bool value: either True or False
Default: True
When the local_alloc_optimization is applied, add an assert to highlight
shape errors.
Without such asserts this optimization could hide errors in the user code.
We add the assert only if we can't infer that the shapes are equivalent.
As such this optimization does not always introduce an assert in the graph.
Removing the assert could speed up execution.
.. attribute:: config.cuda.root .. attribute:: config.cuda.root
Default: $CUDA_ROOT or failing that, "/usr/local/cuda" Default: $CUDA_ROOT or failing that, "/usr/local/cuda"
......
...@@ -1408,6 +1408,7 @@ def local_useless_elemwise(node): ...@@ -1408,6 +1408,7 @@ def local_useless_elemwise(node):
return [node.inputs[0]] return [node.inputs[0]]
if node.op.scalar_op == theano.scalar.add and len(node.inputs) == 1: if node.op.scalar_op == theano.scalar.add and len(node.inputs) == 1:
return [node.inputs[0]] return [node.inputs[0]]
if (node.op.scalar_op == theano.scalar.identity if (node.op.scalar_op == theano.scalar.identity
and len(node.inputs) == 1): and len(node.inputs) == 1):
return [node.inputs[0]] return [node.inputs[0]]
...@@ -1529,14 +1530,15 @@ def local_remove_useless_assert(node): ...@@ -1529,14 +1530,15 @@ def local_remove_useless_assert(node):
return [assert_(node.inputs[0], *cond)] return [assert_(node.inputs[0], *cond)]
@register_specialize
@gof.local_optimizer([T.Elemwise]) @gof.local_optimizer([T.Elemwise])
def local_alloc_elemwise(node): def local_alloc_elemwise(node):
""" """
elemwise(alloc(x, shp), ..., y.TensorType(BROADCAST CONDITION)) elemwise(alloc(x, shp), ..., y.TensorType(BROADCAST CONDITION))
-> elemwise(x, y.TensorType(no broadcast flag)) -> elemwise(x, y.TensorType(BROADCAST CONDITION))
elemwise(dimshuffle(alloc(x, shp)),... ,y.TensorType(BROADCAST CONDITION)) elemwise(dimshuffle(alloc(x, shp)),... ,y.TensorType(BROADCAST CONDITION))
-> elemwise(x, y.TensorType(no broadcast flag)) -> elemwise(x.dimshuffle(...), y.TensorType(BROADCAST CONDITION))
BROADCAST CONDITION: the condition is that the one input that are BROADCAST CONDITION: the condition is that the one input that are
not to be optimized to have the same broadcast pattern as the not to be optimized to have the same broadcast pattern as the
...@@ -1548,99 +1550,122 @@ def local_alloc_elemwise(node): ...@@ -1548,99 +1550,122 @@ def local_alloc_elemwise(node):
""" """
if not isinstance(node.op, T.Elemwise): if not isinstance(node.op, T.Elemwise):
return False return False
if len(node.outputs) > 1: if len(node.outputs) > 1:
#This is a supposition this code makes that I'm not sure is always true. # Ensure all outputs have the same broadcast pattern
assert all([list(o.type.broadcastable) == list( # This is a supposition that I'm not sure is always true.
node.outputs[0].type.broadcastable) for o in assert all([o.type.broadcastable ==
node.outputs[0].type.broadcastable for o in
node.outputs[1:]]) node.outputs[1:]])
if not any([list(i.type.broadcastable) == list( # The broadcast pattern of the output must match the broadcast pattern of
node.outputs[0].type.broadcastable) for i in node.inputs]): # at least one of the inputs.
if not any([i.type.broadcastable ==
node.outputs[0].type.broadcastable for i in node.inputs]):
return False return False
if not any([i.owner and (isinstance(i.owner.op, T.Alloc) or \
(isinstance(i.owner.op, T.DimShuffle) and def dimshuffled_alloc(i):
i.owner.inputs[0].owner and \ return (isinstance(i.owner.op, T.DimShuffle) and
isinstance(i.owner.inputs[0].owner.op, T.Alloc))) i.owner.inputs[0].owner and \
isinstance(i.owner.inputs[0].owner.op, T.Alloc))
# At least one input must have an owner that is either a T.Alloc or a
# T.DimShuffle with an owner that is a T.Alloc -- otherwise there is
# nothing to optimize.
if not any([i.owner
and (isinstance(i.owner.op, T.Alloc) or dimshuffled_alloc(i))
for i in node.inputs]): for i in node.inputs]):
return False return False
no_broad_idx = -1
## Search for input that we can use as a baseline for the dimensions.
assert_op_idx = -1
for idx, i in enumerate(node.inputs): for idx, i in enumerate(node.inputs):
if not i.owner: if i.type.broadcastable == node.outputs[0].type.broadcastable:
if list(i.type.broadcastable) == [False, ] * i.type.ndim: # Prefer an input that is not a T.Alloc nor a T.DimShuffle of a
no_broad_idx = idx # T.Alloc so that all allocs can be optimized.
if not (i.owner
and (isinstance(i.owner.op, T.Alloc)
or dimshuffled_alloc(i))):
assert_op_idx = idx
break break
else:
continue
if not any(i.type.broadcastable) and not isinstance(i.owner.op,
T.Alloc):
no_broad_idx = idx
break
elif list(i.type.broadcastable) == list(
node.outputs[0].type.broadcastable) \
and not isinstance(i.owner.op, T.Alloc) \
and not (isinstance(i.owner.op, T.DimShuffle) and
i.owner.inputs[0].owner and \
isinstance(i.owner.inputs[0].owner.op, T.Alloc)):
no_broad_idx = idx
break
assert no_broad_idx >= 0 # It may be the case that only T.Allocs and T.DimShuffle of T.Allocs exist.
assert_op = node.inputs[no_broad_idx] if assert_op_idx < 0:
# We want to optimize as many allocs as possible. When there is more
# than one then do all but one.
# number of inputs with alloc or dimshuffle alloc
l2 = [i for i in node.inputs
if (i.owner and (isinstance(i.owner.op, T.Alloc)
or dimshuffled_alloc(i)))]
# If only 1 alloc or dimshuffle alloc, it is the one we will use for the shape
# So no alloc would be removed.
if len(l2) > 1:
# l contains inputs with alloc or dimshuffle alloc only.
# Its length will always be at least one, as we checked that before
l = [idx for idx, i in enumerate(node.inputs)
if i.type.broadcastable == node.outputs[0].type.broadcastable]
assert_op_idx = l[0] # The first one is as good as any to use.
else:
# Nothing would be optimized!
return False
assert_op = node.inputs[assert_op_idx]
cmp_op = assert_op cmp_op = assert_op
new = [] new_i = []
for i in node.inputs: for i in node.inputs:
# Remove alloc
if (i.owner and isinstance(i.owner.op, T.Alloc) if (i.owner and isinstance(i.owner.op, T.Alloc)
and i.owner.inputs[0].type != i.owner.outputs[0].type): and i.owner.inputs[0].type != i.owner.outputs[0].type):
# when i.owner.inputs[0].type == i.owner.outputs[0].type we # when i.owner.inputs[0].type == i.owner.outputs[0].type we
# will remove that alloc later # will remove that alloc later
assert i.type.ndim == cmp_op.ndim assert i.type.ndim == cmp_op.ndim
if theano.config.experimental.local_alloc_elemwise_assert: if (theano.config.experimental.local_alloc_elemwise_assert
and not node.fgraph.shape_feature.same_shape(i, cmp_op)):
assert_op = assert_(assert_op, assert_op = assert_(assert_op,
*[T.eq(i.shape[idx], cmp_op.shape[idx])\ *[T.eq(i.shape[idx], cmp_op.shape[idx])\
for idx in xrange(i.type.ndim) \ for idx in xrange(i.type.ndim) \
if not i.type.broadcastable[idx]]) if not i.type.broadcastable[idx]])
new.append(i.owner.inputs[0]) new_i.append(i.owner.inputs[0])
elif i.owner and isinstance(i.owner.op, T.DimShuffle) \
and i.owner.inputs[0].owner \ # Remove Alloc in DimShuffle
and isinstance(i.owner.inputs[0].owner.op, T.Alloc): elif i.owner and dimshuffled_alloc(i):
assert i.type.ndim == cmp_op.type.ndim assert i.type.ndim == cmp_op.type.ndim
if theano.config.experimental.local_alloc_elemwise_assert: if (theano.config.experimental.local_alloc_elemwise_assert
and not node.fgraph.shape_feature.same_shape(i, cmp_op)):
assert_op = assert_(assert_op, assert_op = assert_(assert_op,
*[T.eq(i.shape[idx], cmp_op.shape[idx]) *[T.eq(i.shape[idx], cmp_op.shape[idx])
for idx in xrange(i.type.ndim) for idx in xrange(i.type.ndim)
if not i.type.broadcastable[idx]]) if not i.type.broadcastable[idx]])
new.append(i.owner.inputs[0].owner.inputs[0]) new_i.append(i.owner.inputs[0].owner.inputs[0])
else: else:
new.append(i) new_i.append(i)
new[no_broad_idx] = assert_op new_i[assert_op_idx] = assert_op
if theano.config.experimental.local_alloc_elemwise_assert:
assert assert_op.owner.op is assert_ return node.op(*new_i, return_list=True)
return [node.op(*new)]
#TODO, global optimizer that lift the assert to the beginning of the graph. #TODO, global optimizer that lift the assert to the beginning of the graph.
#TODO, when all inputs can be optimized do all except one #TODO, optimize all inputs when possible -- currently when all inputs have
# an alloc all but one is optimized.
theano.configparser.AddConfigVar('experimental.local_alloc_elemwise', theano.configparser.AddConfigVar('experimental.local_alloc_elemwise',
"If True enable the experimental optimization local_alloc_elemwise", "DEPRECATED: If True, enable the experimental"
theano.configparser.BoolParam(False), " optimization local_alloc_elemwise."
in_c_key=False) " Generates error if not True. Use"
#This version is faster but not as safe.
" to disable.",
theano.configparser.BoolParam(
True,
is_valid=lambda x: x
),
in_c_key=False)
#This version is faster but not as safe. #This version is faster but not as safe.
theano.configparser.AddConfigVar('experimental.local_alloc_elemwise_assert', theano.configparser.AddConfigVar('experimental.local_alloc_elemwise_assert',
"If False enable the experimental optimization local_alloc_elemwise" "If False enable the experimental optimization local_alloc_elemwise"
" but WITHOUT assert into the graph!", " but WITHOUT assert into the graph!",
theano.configparser.BoolParam(True), theano.configparser.BoolParam(True),
in_c_key=False) in_c_key=False)
if theano.config.experimental.local_alloc_elemwise:
#enabled by default when the lifter of assert is done.
register_specialize(local_alloc_elemwise)
else:
#don't register them in fast_run by default to have them disabled
#by default disable them by default as we are not sure it is
#always a good idea to replace an alloc with multiple op.
compile.optdb['specialize'].register("local_alloc_elemwise",
local_alloc_elemwise)
############################ ############################
# Constant Canonicalization # Constant Canonicalization
......
...@@ -2512,6 +2512,156 @@ def test_local_subtensor_of_dot(): ...@@ -2512,6 +2512,156 @@ def test_local_subtensor_of_dot():
f = theano.function([m1, m2, idx], theano.dot(m1, m2)[1:4,:,idx:,idx], mode=mode) f = theano.function([m1, m2, idx], theano.dot(m1, m2)[1:4,:,idx:,idx], mode=mode)
assert test_equality(f(d1, d2, 1), numpy.dot(d1, d2)[1:4,:,1:,1]) assert test_equality(f(d1, d2, 1), numpy.dot(d1, d2)[1:4,:,1:,1])
class Test_local_alloc_elemwise(unittest.TestCase):
    """Tests for the local_alloc_elemwise optimization.

    The optimization removes T.Alloc (and T.DimShuffle of T.Alloc) inputs
    of an Elemwise node, and inserts an Assert on the shapes only when it
    cannot prove they are equivalent.  Each test compiles the same graph
    in 'FAST_COMPILE' (optimization off) and 'FAST_RUN' (optimization on)
    and counts the Alloc / Assert nodes left in the final graph.
    """

    dtype = config.floatX

    def setUp(self):
        # Symbolic inputs of increasing rank.
        self.vec = T.vector('vec', dtype=theano.config.floatX)
        self.mat = T.matrix('mat', dtype=theano.config.floatX)
        self.tens = T.tensor3('tens', dtype=theano.config.floatX)

        # An alloc with constant shape (shape equality cannot be inferred,
        # so an Assert is expected) and one whose shape is taken from
        # another input (no Assert needed).
        self.alloc_wo_dep = T.alloc(self.vec, 2, 2)
        self.alloc_w_dep = T.alloc(self.vec, *self.mat.shape)

    def _verify_alloc_count(self, f, count):
        # Number of T.Alloc nodes remaining in the compiled graph.
        nodes = f.maker.fgraph.toposort()
        n_alloc = sum(isinstance(node.op, T.Alloc)
                      for node in nodes if node.op is not None)
        assert n_alloc == count

    def _verify_assert_count(self, f, count):
        # Number of Assert nodes inserted by the optimization.
        nodes = f.maker.fgraph.toposort()
        n_assert = sum(isinstance(node.op, T.opt.Assert)
                       for node in nodes if node.op is not None)
        assert n_assert == count

    def test_remove_alloc_wo_dimshuffle(self):
        # Unoptimized: the alloc stays, no assert.
        f = function([self.vec, self.mat], self.alloc_wo_dep + self.mat,
                     mode='FAST_COMPILE')
        self._verify_alloc_count(f, 1)
        self._verify_assert_count(f, 0)

        # Optimized: alloc removed; shapes cannot be proven equal, so an
        # assert guards them at run time.
        f = function([self.vec, self.mat], self.alloc_wo_dep + self.mat,
                     mode='FAST_RUN')
        self._verify_alloc_count(f, 0)
        self._verify_assert_count(f, 1)

        # Unoptimized, shape taken from self.mat: alloc stays, no assert.
        f = function([self.vec, self.mat], self.alloc_w_dep + self.mat,
                     mode='FAST_COMPILE')
        self._verify_alloc_count(f, 1)
        self._verify_assert_count(f, 0)

        # Optimized: alloc removed and the shapes are provably equal, so
        # no assert is inserted.
        f = function([self.vec, self.mat], self.alloc_w_dep + self.mat,
                     mode='FAST_RUN')
        self._verify_alloc_count(f, 0)
        self._verify_assert_count(f, 0)

    def test_remove_alloc_w_dimshuffle(self):
        # Alloc with constant shape, then dimshuffled to rank 3.
        fixed_shape = (T.alloc(self.vec, 2, 2).dimshuffle(0, 1, 'x')
                       + self.tens)
        # Same, but the alloc shape comes from self.tens itself.
        dep_shape = (T.alloc(self.vec,
                             self.tens.shape[0],
                             self.tens.shape[1]).dimshuffle(0, 1, 'x')
                     + self.tens)

        # Unoptimized: alloc stays, no assert.
        f = function([self.vec, self.tens], fixed_shape,
                     mode='FAST_COMPILE')
        self._verify_alloc_count(f, 1)
        self._verify_assert_count(f, 0)

        # Optimized: alloc removed, assert inserted (constant shape not
        # provably equal to self.tens's shape).
        f = function([self.vec, self.tens], fixed_shape, mode='FAST_RUN')
        self._verify_alloc_count(f, 0)
        self._verify_assert_count(f, 1)

        # Unoptimized: alloc stays, no assert.
        f = function([self.vec, self.tens], dep_shape, mode='FAST_COMPILE')
        self._verify_alloc_count(f, 1)
        self._verify_assert_count(f, 0)

        # Optimized: alloc removed, shapes provably equal, no assert.
        f = function([self.vec, self.tens], dep_shape, mode='FAST_RUN')
        self._verify_alloc_count(f, 0)
        self._verify_assert_count(f, 0)

    def test_multi_input_single_alloc(self):
        # Two alloc inputs with compatible constant shapes: one alloc must
        # survive to carry the output shape, the other is removed.
        alloc_vec = T.alloc(self.vec, 5, 5)
        alloc_mat = T.alloc(self.mat, 5, 5, 5)

        f = function([self.vec, self.mat], alloc_vec + alloc_mat,
                     mode='FAST_COMPILE')
        self._verify_alloc_count(f, 2)
        self._verify_assert_count(f, 0)

        f = function([self.vec, self.mat], alloc_vec + alloc_mat,
                     mode='FAST_RUN')
        self._verify_alloc_count(f, 1)
        self._verify_assert_count(f, 0)

        # With a symbolic shape the equality cannot be inferred, so the
        # optimized graph also gets an assert.
        s = T.iscalar('s')
        alloc_vec = T.alloc(self.vec, s, s)
        alloc_mat = T.alloc(self.mat, 5, 5, 5)

        f = function([self.vec, self.mat, s], alloc_vec + alloc_mat,
                     mode='FAST_COMPILE')
        self._verify_alloc_count(f, 2)
        self._verify_assert_count(f, 0)

        f = function([self.vec, self.mat, s], alloc_vec + alloc_mat,
                     mode='FAST_RUN')
        self._verify_alloc_count(f, 1)
        self._verify_assert_count(f, 1)
def test_local_subtensor_of_alloc(): def test_local_subtensor_of_alloc():
# DebugMode should detect if something goes wrong. # DebugMode should detect if something goes wrong.
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论