Merge pull request #5562 from nouiz/opt_prof

Speed merge optimizer and allow partial optimizer profile

Merge pull request #5562 from nouiz/opt_prof
d704a600 · abergeron · GitHub · 4e482807 · 80d3b56a · d704a600
--- a/theano/compile/function_module.py
+++ b/theano/compile/function_module.py
@@ -1466,6 +1466,10 @@ class FunctionMaker(object):
                theano.config.traceback.limit = theano.config.traceback.compile_limit
                start_optimizer = time.time()
+                # In case there is an error during optimization.
+                optimizer_profile = None
+                opt_time = None
                # now optimize the graph
                if theano.config.cache_optimizations:
                    optimizer_profile = self.optimize_graph_with_cache(
@@ -1475,8 +1479,23 @@ class FunctionMaker(object):
                end_optimizer = time.time()
                opt_time = end_optimizer - start_optimizer
+                _logger.debug('Optimizing took %f seconds', opt_time)
+                # Add deep copy to respect the memory interface
+                insert_deepcopy(fgraph, inputs, outputs + additional_outputs)
+            finally:
+                theano.config.compute_test_value = compute_test_value_orig
+                theano.config.traceback.limit = limit_orig
+                # If the optimizer got interrupted
+                if opt_time is None:
+                    end_optimizer = time.time()
+                    opt_time = end_optimizer - start_optimizer
                theano.compile.profiling.total_graph_opt_time += opt_time
                if profile:
+                    if (optimizer_profile is None and
+                            hasattr(optimizer, 'pre_profile')):
+                        optimizer_profile = optimizer.pre_profile
                    profile.optimizer_time += opt_time
                    if theano.config.profile_optimizer:
                        profile.optimizer_profile = (optimizer,
@@ -1486,13 +1505,6 @@ class FunctionMaker(object):
                    warnings.warn((
                        "config.profile_optimizer requires config.profile to "
                        " be set to True as well"), stacklevel=3)
-                _logger.debug('Optimizing took %f seconds', opt_time)
-                # Add deep copy to respect the memory interface
-                insert_deepcopy(fgraph, inputs, outputs + additional_outputs)
-            finally:
-                theano.config.compute_test_value = compute_test_value_orig
-                theano.config.traceback.limit = limit_orig
        # initialize the linker
        if not hasattr(linker, 'accept'):
@@ -1784,7 +1796,7 @@ def orig_function(inputs, outputs, mode=None, accept_inplace=False,
    if isinstance(mode, (list, tuple)):  # "mode comparison" semantics
        raise Exception("We do not support the passing of multiple modes")
-    else:
+    try:
        Maker = getattr(mode, 'function_maker', FunctionMaker)
        fn = Maker(inputs,
                   outputs,
@@ -1794,11 +1806,12 @@ def orig_function(inputs, outputs, mode=None, accept_inplace=False,
                   on_unused_input=on_unused_input,
                   output_keys=output_keys).create(
            defaults)
+    finally:
-    t2 = time.time()
+        t2 = time.time()
-    if profile:
+        if profile:
-        profile.compile_time += t2 - t1
+            profile.compile_time += t2 - t1
-        profile.nb_nodes = len(fn.maker.fgraph.apply_nodes)
+            # TODO: append
+            profile.nb_nodes = len(fn.maker.fgraph.apply_nodes)
    fn.name = name
    fn.maker.fgraph.name = name

--- a/theano/gof/opt.py
+++ b/theano/gof/opt.py
@@ -228,45 +228,53 @@ class SeqOptimizer(Optimizer, list):
        nb_node_before = len(fgraph.apply_nodes)
        sub_profs = []
        nb_nodes = []
-        for optimizer in self:
-            try:
-                nb_nodes_before = len(fgraph.apply_nodes)
-                t0 = time.time()
-                sub_prof = optimizer.optimize(fgraph)
-                l.append(float(time.time() - t0))
-                sub_profs.append(sub_prof)
-                nb_nodes.append((nb_nodes_before,
-                                 len(fgraph.apply_nodes)))
-                if fgraph.profile:
-                    sub_validate_time.append(fgraph.profile.validate_time)
-            except AssertionError:
-                # do not catch Assertion failures
-                raise
-            except Exception as e:
-                if self.failure_callback:
-                    self.failure_callback(e, self, optimizer)
-                    continue
-                else:
-                    raise
-        if fgraph.profile:
+        self.pre_profile = (
-            validate_time = fgraph.profile.validate_time - validate_before
+            self, l, -1, -1, nb_node_before,
-            callbacks_time = {}
+            -1, sub_profs, sub_validate_time,
-            for k, v in iteritems(fgraph.execute_callbacks_times):
+            nb_nodes, {})
-                if k in callbacks_before:
+        try:
-                    t = v - callbacks_before[k]
+            for optimizer in self:
-                    if t > 0:
+                try:
-                        callbacks_time[k] = t
+                    nb_nodes_before = len(fgraph.apply_nodes)
-                else:
+                    t0 = time.time()
-                    callbacks_time[k] = v
+                    sub_prof = optimizer.optimize(fgraph)
-        else:
+                    l.append(float(time.time() - t0))
-            validate_time = None
+                    sub_profs.append(sub_prof)
-            callbacks_time = {}
+                    nb_nodes.append((nb_nodes_before,
+                                     len(fgraph.apply_nodes)))
+                    if fgraph.profile:
+                        sub_validate_time.append(fgraph.profile.validate_time)
+                except AssertionError:
+                    # do not catch Assertion failures
+                    raise
+                except Exception as e:
+                    if self.failure_callback:
+                        self.failure_callback(e, self, optimizer)
+                        continue
+                    else:
+                        raise
+        finally:
-        callback_time = fgraph.execute_callbacks_time - callback_before
+            if fgraph.profile:
-        return (self, l, validate_time, callback_time, nb_node_before,
+                validate_time = fgraph.profile.validate_time - validate_before
+                callbacks_time = {}
+                for k, v in iteritems(fgraph.execute_callbacks_times):
+                    if k in callbacks_before:
+                        t = v - callbacks_before[k]
+                        if t > 0:
+                            callbacks_time[k] = t
+                    else:
+                        callbacks_time[k] = v
+            else:
+                validate_time = None
+                callbacks_time = {}
+            callback_time = fgraph.execute_callbacks_time - callback_before
+            self.pre_profile = (
+                self, l, validate_time, callback_time, nb_node_before,
                len(fgraph.apply_nodes), sub_profs, sub_validate_time,
                nb_nodes, callbacks_time)
+        return self.pre_profile
    def __str__(self):
        return "SeqOpt(%s)" % list.__str__(self)
@@ -877,7 +885,13 @@ class MergeOptimizer(Optimizer):
                        pairs = [(pairs[0][1], pairs[0][0])]
                try:
-                    fgraph.replace_all_validate(pairs, 'MergeOptimizer')
+                    # If all Constants, no need to call validate.
+                    # Only need to check one of the var of each pairs.
+                    # If it is a Constant, the other must also be a Constant as we merge them.
+                    if all([isinstance(old, graph.Constant) for old, new in pairs]):
+                        fgraph.replace_all(pairs, 'MergeOptimizer')
+                    else:
+                        fgraph.replace_all_validate(pairs, 'MergeOptimizer')
                except InconsistencyError:
                    success = False
                    nb_fail += 1

--- a/theano/gof/tests/test_graph.py
+++ b/theano/gof/tests/test_graph.py
@@ -359,9 +359,7 @@ class TestAutoName:
        assert r1 is r2
        r3 = tensor.constant(1.6)
-        # The cache still create a new object that we don't return.
+        assert r3.auto_name == "auto_" + str(autoname_id + 1)
-        # This is why we must increase by 2 and not 1.
-        assert r3.auto_name == "auto_" + str(autoname_id + 2)
    def test_tensorvariable(self):
        # Get counter value

--- a/theano/scan_module/scan_utils.py
+++ b/theano/scan_module/scan_utils.py
@@ -855,50 +855,77 @@ class Validator(object):
        If out is not valid and has no equivalent, None is returned.
        """
-        if out in self.valid:
-            return out, True
+        def get_value(out):
-        elif out in self.valid_equivalent:
+            if out in self.valid:
-            return self.valid_equivalent[out], False
+                return out, True
-        elif out in self.invalid:
+            elif out in self.valid_equivalent:
-            return None
+                return self.valid_equivalent[out], False
+            elif out in self.invalid:
-        if out.owner is None:
+                return None
-            if isinstance(out, tensor.TensorConstant):
+            else:
-                # This might be a constant from the outer graph or a constant
+                raise RuntimeError("This should not happen")
-                # from the inner graph. In all cases, we can clone it to be
-                # certain we have a valid constant
+        q = [out]
-                cloned_out = out.clone()
+        while q:
-                self.valid.add(cloned_out)
+            out = q.pop()
+            if out in self.valid:
+                continue
+            elif out in self.invalid:
+                continue
+            if out.owner is None:
+                if isinstance(out, tensor.TensorConstant):
+                    if hasattr(out, 'fgraph'):
+                        # If out have an fgraph, we aren't sure if it
+                        # is from the inner graph or outer graph, so
+                        # clone it.
+                        cloned_out = out.clone()
+                        self.valid.add(cloned_out)
+                        self.invalid.add(out)
+                        self.valid_equivalent[out] = cloned_out
+                    else:
+                        self.valid.add(out)
+                    continue
+                else:
+                    # This is an input node and it has not been
+                    # explicitly marked as invalid so we can use it
+                    self.valid.add(out)
+                    continue
+            # Process the input if needed
+            continue_while = False
+            for inp in out.owner.inputs:
+                if inp not in self.valid and inp not in self.invalid:
+                    q.append(out)
+                    q.extend(out.owner.inputs)
+                    continue_while = True
+                    break
+            if continue_while:
+                continue
+            inputs = [get_value(i) for i in out.owner.inputs]
+            # If some inputs are invalid without equivalent, so is out
+            if None in inputs:
+                self.invalid.add(out)
+                continue
+            # If some inputs are invalid with equivalent,
+            # an equivalent out should be built and returned
+            all_inputs = [inp for (inp, is_valid) in inputs]
+            equiv_inputs = [inp for (inp, is_valid) in inputs if not is_valid]
+            if equiv_inputs:
+                cloned_node = out.owner.clone_with_new_inputs(all_inputs)
+                cloned_out = cloned_node.outputs[out.index]
                self.invalid.add(out)
+                self.valid.add(cloned_out)
                self.valid_equivalent[out] = cloned_out
-                return cloned_out, False
+                continue
-            else:
-                # This is an input node and it has not been explicitly marked
+            # All inputs are valid, so is out
-                # as invalid so we can use it
+            self.valid.add(out)
-                return out, True
-        # Recurse over inputs
+        return get_value(out)
-        inputs = [self.check(i) for i in out.owner.inputs]
-        # If some inputs are invalid without equivalent, so is out
-        if None in inputs:
-            self.invalid.add(out)
-            return None
-        # If some inputs are invalid with equivalent,
-        # an equivalent out should be built and returned
-        all_inputs = [inp for (inp, is_valid) in inputs]
-        equiv_inputs = [inp for (inp, is_valid) in inputs if not is_valid]
-        if equiv_inputs:
-            cloned_node = out.owner.clone_with_new_inputs(all_inputs)
-            cloned_out = cloned_node.outputs[out.index]
-            self.invalid.add(out)
-            self.valid.add(cloned_out)
-            self.valid_equivalent[out] = cloned_out
-            return cloned_out, False
-        # All inputs are valid, so is out
-        return out, True
 def scan_can_remove_outs(op, out_idxs):

--- a/theano/scan_module/tests/test_scan_opt.py
+++ b/theano/scan_module/tests/test_scan_opt.py
@@ -113,10 +113,7 @@ class TestGaussNewton(unittest.TestCase):
    def test_nobatch(self):
        # This used to give an error due to optimization "scan_merge_inouts".
        # The batch size is set to 1 and the data is represented by a matrix.
-        # As of 2013-10-24, it still triggers an optimization error due to
+        self._run(100, 10, batch_size=1, mode=mode)
-        # "remove_constants_and_unused_inputs_scan".
-        mode_exc = mode.excluding("remove_constants_and_unused_inputs_scan")
-        self._run(100, 10, batch_size=1, mode=mode_exc)
 class GaussNewtonMatrix(object):

--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -19,7 +19,7 @@ from theano.gof.type import Generic
 from theano.tensor import elemwise
 from theano.tensor.var import (AsTensorError, TensorVariable,
-                               TensorConstant,
+                               TensorConstant, TensorConstantSignature,
                               _tensor_py_operators)
 from theano.tensor.type import TensorType, values_eq_approx_always_true
 from theano.tensor.type_other import NoneConst
@@ -220,7 +220,7 @@ _as_tensor_variable = as_tensor_variable
 as_tensor = as_tensor_variable
-def constant_or_value(x, rtype, name=None, ndim=None, dtype=None):
+def constant(x, name=None, ndim=None, dtype=None):
    """Return a symbolic `Constant` with value `x`.
    Raises
@@ -230,6 +230,16 @@ def constant_or_value(x, rtype, name=None, ndim=None, dtype=None):
    ValueError
        `x` could not be expanded to have ndim dimensions.
+    Note
+    ----
+    We create a small cache of frequently used constant.
+    This speed up the Merge optimization for big graph.
+    We want to cache all scalar to don't merge as frequently constants.
+    But we don't want to cache too much stuff.
+    So we cache integer with dtype [u]int and float where the value is
+    between -10 and 10.
+    We cache all broadcast pattern for scalar.
    """
    x_ = scal.convert(x, dtype=dtype)
@@ -245,45 +255,29 @@ def constant_or_value(x, rtype, name=None, ndim=None, dtype=None):
        assert len(bcastable) == ndim
    try:
-        if rtype is TensorConstant:
+        ttype = TensorType(dtype=x_.dtype, broadcastable=bcastable)
-            rval = rtype(
+        if not constant.enable:
-                TensorType(dtype=x_.dtype, broadcastable=bcastable),
+            return TensorConstant(ttype, x_, name=name)
-                x_.copy(),
-                name=name)
+        sig = TensorConstantSignature((ttype, x_))
-            return rval
+        if sig in constant_cache:
-        else:
+            return constant_cache[sig]
-            # leave the shape out of the type
-            return rtype(TensorType(dtype=x_.dtype, broadcastable=bcastable),
+        ret = TensorConstant(ttype, x_, name=name)
-                         x_, name=name)
+        if (x_.size == 1 and
+            (-10) <= x_ <= 10 and
+            (x_.dtype in int_dtypes or x_.dtype in uint_dtypes or
+             (x_.dtype in float_dtypes and
+              # Limit the size of the cache.
+              len(constant_cache) < 10000))):
+            constant_cache[sig] = ret
+            # This is needed to raise a good error to the user.
+            ret.cached = True
+        return ret
    except Exception:
        raise TypeError("Could not convert %s to TensorType" % x, type(x))
-def constant(x, name=None, ndim=None, dtype=None):
-    ret = constant_or_value(x, rtype=TensorConstant, name=name, ndim=ndim,
-                            dtype=dtype)
-    # We create a small cache of frequently used constant.
-    # This speed up the Merge optimization for big graph.
-    # We want to cache all scalar to don't merge as frequently constants.
-    # But we don't want to cache too much stuff
-    # So we cache integer with dtype [u]int and float where the value is
-    # between -10 and 10
-    # We want to cache all broadcast pattern for scalar.
-    if not constant.enable:
-        return ret
-    sig = ret.signature()
-    if (sig not in constant_cache and ret.data.size == 1 and
-        (-10) <= ret.data <= 10 and
-        (ret.dtype in int_dtypes or ret.dtype in uint_dtypes or
-         (ret.dtype in float_dtypes and
-          # Limit the size of the cache.
-          len(constant_cache) < 10000))):
-        constant_cache[sig] = ret
-        # This is needed to raise a good error to the user.
-        ret.cached = True
-    return constant_cache.get(sig, ret)
 constant.enable = True
 constant_cache = {}

--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -3251,12 +3251,15 @@ def local_IncSubtensor_serialize(node):
        if movable_inputs:
            new_inputs = ([i for i in node.inputs if not movable(i)] +
                          [mi.owner.inputs[0] for mi in movable_inputs])
-            new_add = T.add(*new_inputs)
+            if len(new_inputs) == 0:
+                new_add = new_inputs[0]
+            else:
+                new_add = T.add(*new_inputs)
-            # Copy over stacktrace from original output, as an error
+                # Copy over stacktrace from original output, as an error
-            # (e.g. an index error) in this add operation should
+                # (e.g. an index error) in this add operation should
-            # correspond to an error in the original add operation.
+                # correspond to an error in the original add operation.
-            copy_stack_trace(node.outputs[0], new_add)
+                copy_stack_trace(node.outputs[0], new_add)
            # stack up the new incsubtensors
            tip = new_add