Merge pull request #4958 from nouiz/opt_speedup

Lower the number of iterations for local_add_mul_fusion

Merge pull request #4958 from nouiz/opt_speedup
67e5e2eb · Frédéric Bastien · GitHub · 3284771f · 06d83438 · 67e5e2eb
--- a/theano/gpuarray/dnn.py
+++ b/theano/gpuarray/dnn.py
@@ -1005,6 +1005,9 @@ class GpuDnnPoolDesc(Op):
    pad : tuple
        (padX, padY) or (padX, padY, padZ)

+    Note
+    ----
+    Not used anymore. Only needed to reload old pickled files.
    """

    __props__ = ('ws', 'stride', 'mode', 'pad')

--- a/theano/sandbox/cuda/dnn.py
+++ b/theano/sandbox/cuda/dnn.py
@@ -1365,6 +1365,10 @@ class GpuDnnPoolDesc(GpuOp):
        pad_w is the number of zero-valued pixels added to each of the left and
        right borders.

+    Note
+    ----
+    Not used anymore. Only needed to reload old pickled files.
+
    """

    __props__ = ('ws', 'stride', 'mode', 'pad')

--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -576,7 +576,8 @@ get_scalar_constant_value_elemwises = (


 def get_scalar_constant_value(orig_v, elemwise=True,
-                              only_process_constants=False):
+                              only_process_constants=False,
+                              max_recur=10):
    """Return the constant scalar(0-D) value underlying variable `v`.

    If `v` is the output of dimshuffles, fills, allocs, rebroadcasts,
@@ -596,6 +597,8 @@ def get_scalar_constant_value(orig_v, elemwise=True,
        If True, we only attempt to obtain the value of `orig_v` if it's
        directly constant and don't try to dig through dimshuffles, fills,
        allocs, and other to figure out its value.
+    max_recur : int
+        The maximum recursion depth.

    Notes
    -----
@@ -623,7 +626,10 @@ def get_scalar_constant_value(orig_v, elemwise=True,
                data = v.data
            return numpy_scalar(data).copy()

-        if not only_process_constants and getattr(v, 'owner', None):
+        if (not only_process_constants and
+                getattr(v, 'owner', None) and
+                max_recur > 0):
+            max_recur -= 1
            if isinstance(v.owner.op, (Alloc, DimShuffle, Rebroadcast,
                                       compile.ops.OutputGuard,
                                       compile.DeepCopyOp)):
@@ -645,7 +651,8 @@ def get_scalar_constant_value(orig_v, elemwise=True,
            # We put all the scalar Ops used by get_canonical_form_slice()
            # to allow it to determine the broadcast pattern correctly.
            elif isinstance(v.owner.op, (ScalarFromTensor, TensorFromScalar)):
-                return get_scalar_constant_value(v.owner.inputs[0])
+                v = v.owner.inputs[0]
+                continue
            elif isinstance(v.owner.op, scal.ScalarOp):
                if isinstance(v.owner.op, scal.Second):
                    # We don't need both input to be constant for second
@@ -653,7 +660,7 @@ def get_scalar_constant_value(orig_v, elemwise=True,
                    v = val
                    continue
                if isinstance(v.owner.op, get_scalar_constant_value_elemwises):
-                    const = [get_scalar_constant_value(i)
+                    const = [get_scalar_constant_value(i, max_recur=max_recur)
                             for i in v.owner.inputs]
                    ret = [[None]]
                    v.owner.op.perform(v.owner, const, ret)
@@ -670,7 +677,7 @@ def get_scalar_constant_value(orig_v, elemwise=True,
                elif elemwise and isinstance(
                        v.owner.op.scalar_op,
                        get_scalar_constant_value_elemwises):
-                    const = [get_scalar_constant_value(i)
+                    const = [get_scalar_constant_value(i, max_recur=max_recur)
                             for i in v.owner.inputs]
                    ret = [[None]]
                    v.owner.op.perform(v.owner, const, ret)
@@ -705,27 +712,33 @@ def get_scalar_constant_value(orig_v, elemwise=True,
                                  v.owner.inputs[0].owner.inputs[1:]):
                        idx = v.owner.op.idx_list[0]
                        if isinstance(idx, gof.Type):
-                            idx = get_scalar_constant_value(v.owner.inputs[1])
+                            idx = get_scalar_constant_value(v.owner.inputs[1],
+                                                            max_recur=max_recur)
                        # Note the '+ 1' is because the first argument to Join
                        # is the axis.
                        ret = v.owner.inputs[0].owner.inputs[idx + 1]
-                        ret = get_scalar_constant_value(ret)
+                        ret = get_scalar_constant_value(ret, max_recur=max_recur)
                        # join can cast implicitly its input in some case.
                        return theano._asarray(ret, dtype=v.type.dtype)
                    if python_all(var.ndim == 1 for var in
                                  v.owner.inputs[0].owner.inputs[1:]):
                        idx = v.owner.op.idx_list[0]
                        if isinstance(idx, gof.Type):
-                            idx = get_scalar_constant_value(v.owner.inputs[1])
+                            idx = get_scalar_constant_value(v.owner.inputs[1],
+                                                            max_recur=max_recur)
                        try:
                            # TODO: assert joined axis is 0.
                            length = 0
+                            loop = False
                            for joined in v.owner.inputs[0].owner.inputs[1:]:
                                ll = get_vector_length(joined)
                                if idx < length + ll:
-                                    return get_scalar_constant_value(
-                                        joined[idx - length])
+                                    v = joined[idx - length]
+                                    loop = True
+                                    break
                                length += ll
+                            if loop:
+                                continue
                        except TypeError:
                            pass
                        except ValueError:
@@ -742,12 +755,13 @@ def get_scalar_constant_value(orig_v, elemwise=True,

                    idx = v.owner.op.idx_list[0]
                    if isinstance(idx, gof.Type):
-                        idx = get_scalar_constant_value(v.owner.inputs[1])
+                        idx = get_scalar_constant_value(v.owner.inputs[1],
+                                                        max_recur=max_recur)
                    # Python 2.4 does not support indexing with numpy.integer
                    # So we cast it.
                    idx = int(idx)
                    ret = v.owner.inputs[0].owner.inputs[idx]
-                    ret = get_scalar_constant_value(ret)
+                    ret = get_scalar_constant_value(ret, max_recur=max_recur)
                    # MakeVector can cast implicitly its input in some case.
                    return theano._asarray(ret, dtype=v.type.dtype)

@@ -762,7 +776,8 @@ def get_scalar_constant_value(orig_v, elemwise=True,
                    idx_list = op.idx_list
                    idx = idx_list[0]
                    if isinstance(idx, gof.Type):
-                        idx = get_scalar_constant_value(owner.inputs[1])
+                        idx = get_scalar_constant_value(owner.inputs[1],
+                                                        max_recur=max_recur)
                    grandparent = leftmost_parent.owner.inputs[0]
                    gp_broadcastable = grandparent.type.broadcastable
                    ndim = grandparent.type.ndim

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -7130,7 +7130,7 @@ def local_add_mul_fusion(node):
    """Fuse consecutive add or mul in one such node with more inputs.

    It is better to fuse add/mul that way then in a Composite node as
-    this make the inner graph of the Compiste smaller. This allow to
+    this makes the inner graph of the Composite smaller. This allows us to
    put more computation in a Composite before hitting the max
    recusion limit when pickling Composite.

@@ -7140,16 +7140,30 @@ def local_add_mul_fusion(node):
        return False

    s_op = node.op.scalar_op.__class__
+    new_inp = []
+    fused = False
    for inp in node.inputs:
        if (inp.owner and
                isinstance(inp.owner.op, Elemwise) and
                isinstance(inp.owner.op.scalar_op, s_op)):
-            l = list(node.inputs)
-            l.remove(inp)
-            output_node = node.op(*(l + inp.owner.inputs))
-
-            copy_stack_trace(node.outputs[0], output_node)
-            return [output_node]
+            new_inp.extend(inp.owner.inputs)
+            fused = True
+        else:
+            new_inp.append(inp)
+
+    # We cannot compare the number of inputs as Mul and Add could have
+    # 0 or 1 inputs in some corner cases.
+    if fused:
+        output = node.op(*new_inp)
+        copy_stack_trace(node.outputs[0], output)
+
+        # Do the recursion here to help lower the number of
+        # FusionOptimizer iterations.
+        if output.owner:
+            output2 = local_add_mul_fusion(output.owner)
+            if output2:
+                return output2
+        return [output]

 if config.tensor.local_elemwise_fusion:
    _logger.debug("enabling optimization fusion elemwise in fast_run")

--- a/theano/tensor/subtensor.py
+++ b/theano/tensor/subtensor.py
@@ -398,7 +398,7 @@ class Subtensor(Op):
            raise AdvancedIndexingError(Subtensor.e_indextype, entry)

    def get_constant_idx(self, inputs, allow_partial=False,
-                         only_process_constants=False):
+                         only_process_constants=False, elemwise=True):
        """
        Return the idx_list with constant inputs replaced by their
        python scalar equivalent.
@@ -442,7 +442,8 @@ class Subtensor(Op):
                try:
                    return get_scalar_constant_value(
                        val,
-                        only_process_constants=only_process_constants)
+                        only_process_constants=only_process_constants,
+                        elemwise=elemwise)
                except theano.tensor.NotScalarConstantError:
                    if allow_partial:
                        return val

--- a/theano/tensor/tests/test_opt.py
+++ b/theano/tensor/tests/test_opt.py
@@ -2048,9 +2048,9 @@ class test_local_subtensor_lift(unittest.TestCase):
                Subtensor, tensor.DimShuffle]))

        prog = f.maker.fgraph.toposort()
-        assert isinstance(prog[0].op, tensor.DimShuffle)
-        assert isinstance(prog[1].op, tensor.Subtensor)  # first subtensor
-        assert isinstance(prog[2].op, tensor.Subtensor)  # first subtensor
+        assert isinstance(prog[0].op, tensor.Subtensor)
+        assert isinstance(prog[1].op, tensor.DimShuffle)
+        assert isinstance(prog[2].op, tensor.Subtensor)
        assert isinstance(prog[3].op.scalar_op, theano.scalar.
                          Composite)  # Composite{add,add}
        assert len(prog) == 4
@@ -2069,9 +2069,9 @@ class test_local_subtensor_lift(unittest.TestCase):
                Subtensor, tensor.DimShuffle]))

        prog = f.maker.fgraph.toposort()
-        assert isinstance(prog[0].op, tensor.DimShuffle)
-        assert isinstance(prog[1].op, tensor.Subtensor)  # first subtensor
-        assert isinstance(prog[2].op, tensor.Subtensor)  # first subtensor
+        assert isinstance(prog[0].op, tensor.Subtensor)
+        assert isinstance(prog[1].op, tensor.DimShuffle)
+        assert isinstance(prog[2].op, tensor.Subtensor)
        assert isinstance(prog[3].op.scalar_op, theano.scalar.
                          Composite)  # Composite{add,add}
        assert len(prog) == 4