Merge pull request #5037 from nouiz/inplace_profile

Add profiling to the in-place elemwise optimizer and fix a crash when merging optimizer profiles.

Merge pull request #5037 from nouiz/inplace_profile
53ba24bb · abergeron · GitHub · a116149c · d20b4b45 · 53ba24bb
--- a/theano/gof/opt.py
+++ b/theano/gof/opt.py
@@ -2623,7 +2623,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
            final_optimizers=final_optimizers,
            cleanup_optimizers=cleanup_optimizers)

-        def merge_list(l1, l2):
+        def add_append_list(l1, l2):
            l = copy.copy(l1)
            for idx, nb in enumerate(l2):
                if idx < len(l):
@@ -2632,7 +2632,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
                    l.append(nb)
            return l

-        loop_timing = merge_list(prof1[1], prof2[1])
+        loop_timing = add_append_list(prof1[1], prof2[1])

        loop_process_count = list(prof1[2])
        global_sub_profs = []
@@ -2668,23 +2668,30 @@ class EquilibriumOptimizer(NavigatorOptimizer):
            final_sub_profs.append(merge(final_optimizers, 'final_optimizers', 10))
            cleanup_sub_profs.append(merge(cleanup_optimizers, 'cleanup_optimizers', 11))

-        loop_process_count.extend(prof2[2][len(loop_process_count):])
+        # Add the iterations done by only one of the profiles.
+        loop_process_count.extend(prof1[2][len(loop_process_count):])
+        global_sub_profs.extend(prof1[9][len(global_sub_profs):])
+        final_sub_profs.extend(prof1[10][len(final_sub_profs):])
+        cleanup_sub_profs.extend(prof1[11][len(cleanup_sub_profs):])
+
+        global_sub_profs.extend(prof2[9][len(loop_process_count):])
+        final_sub_profs.extend(prof2[10][len(loop_process_count):])
+        cleanup_sub_profs.extend(prof2[11][len(loop_process_count):])

        max_nb_nodes = max(prof1[3], prof2[3])

-        global_opt_timing = merge_list(prof1[4], prof2[4])
+        global_opt_timing = add_append_list(prof1[4], prof2[4])

-        nb_nodes = merge_list(prof1[5], prof2[5])
+        nb_nodes = add_append_list(prof1[5], prof2[5])

        time_opts = merge_dict(prof1[6], prof2[6])
-        io_toposort_timing = merge_list(prof1[7], prof2[7])
-
+        io_toposort_timing = add_append_list(prof1[7], prof2[7])
        assert (len(loop_timing) == len(global_opt_timing) ==
+                len(global_sub_profs) ==
                len(io_toposort_timing) == len(nb_nodes))
        assert len(loop_timing) == max(len(prof1[1]), len(prof2[1]))

        node_created = merge_dict(prof1[8], prof2[8])
-
        return (new_opt,
                loop_timing,
                loop_process_count,

--- a/theano/gpuarray/opt.py
+++ b/theano/gpuarray/opt.py
@@ -743,7 +743,7 @@ optdb.register('gpua_elemwise_fusion',
               tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 49,
               'fast_run', 'fusion', 'local_elemwise_fusion', 'gpuarray')

-inplace_gpu_elemwise_opt = tensor.opt.inplace_elemwise_optimizer_op(
+inplace_gpu_elemwise_opt = tensor.opt.InplaceElemwiseOptimizer(
    GpuElemwise)
 optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
               'inplace_elemwise_optimizer', 'fast_run', 'inplace', 'gpuarray')

--- a/theano/printing.py
+++ b/theano/printing.py
@@ -482,7 +482,7 @@ class IgnorePrinter:
 class DefaultPrinter:

    def __init__(self):
-        pass
+        self.leaf_printer = LeafPrinter()

    def process(self, output, pstate):
        if output in pstate.memo:
@@ -490,7 +490,7 @@ class DefaultPrinter:
        pprinter = pstate.pprinter
        node = output.owner
        if node is None:
-            return LeafPrinter().process(output, pstate)
+            return self.leaf_printer.process(output, pstate)
        r = "%s(%s)" % (str(node.op), ", ".join(
            [pprinter.process(input, pstate.clone(precedence=-1000))
             for input in node.inputs]))
@@ -513,12 +513,13 @@ class LeafPrinter:
 class PPrinter:
    def __init__(self):
        self.printers = []
+        self.printers_dict = {}

    def assign(self, condition, printer):
-        if isinstance(condition, gof.Op):
-            op = condition
-            condition = (lambda pstate, r: r.owner is not None and
-                         r.owner.op == op)
+        # condition can be a class or an instance of an Op.
+        if isinstance(condition, (gof.Op, type)):
+            self.printers_dict[condition] = printer
+            return
        self.printers.insert(0, (condition, printer))

    def process(self, r, pstate=None):
@@ -526,6 +527,11 @@ class PPrinter:
            pstate = PrinterState(pprinter=self)
        elif isinstance(pstate, dict):
            pstate = PrinterState(pprinter=self, **pstate)
+        if getattr(r, 'owner', None) is not None:
+            if r.owner.op in self.printers_dict:
+                return self.printers_dict[r.owner.op].process(r, pstate)
+            if type(r.owner.op) in self.printers_dict:
+                return self.printers_dict[type(r.owner.op)].process(r, pstate)
        for condition, printer in self.printers:
            if condition(pstate, r):
                return printer.process(r, pstate)
@@ -533,6 +539,7 @@ class PPrinter:
    def clone(self):
        cp = copy(self)
        cp.printers = list(self.printers)
+        cp.printers_dict = dict(self.printers_dict)
        return cp

    def clone_assign(self, condition, printer):

--- a/theano/sandbox/cuda/opt.py
+++ b/theano/sandbox/cuda/opt.py
@@ -2181,7 +2181,7 @@ else:
                   71.00, 'fusion', 'local_elemwise_fusion')

 # GpuElemwise inplace
-gpu_inplace_elemwise_optimizer = tensor.opt.inplace_elemwise_optimizer_op(
+gpu_inplace_elemwise_optimizer = tensor.opt.InplaceElemwiseOptimizer(
    GpuElemwise)
 # DO NOT PLACE add a 'gpu' tag here! This would enable it in fast_compile.
 # It still will be run in fast_run with device=gpu with the current tag.

--- a/theano/tensor/basic.py
+++ b/theano/tensor/basic.py
@@ -4113,8 +4113,7 @@ class Join(Op):

 join = Join()

-pprint.assign(lambda pstate, r: r.owner and isinstance(r.owner.op, Join),
-              printing.FunctionPrinter('join'))
+pprint.assign(Join, printing.FunctionPrinter('join'))


 def roll(x, shift, axis=None):

--- a/theano/tensor/elemwise.py
+++ b/theano/tensor/elemwise.py
@@ -446,8 +446,7 @@ class DimShufflePrinter:
        else:
            raise TypeError("Can only print DimShuffle.")

-pprint.assign(lambda pstate, r: r.owner and isinstance(r.owner.op, DimShuffle),
-              DimShufflePrinter())
+pprint.assign(DimShuffle, DimShufflePrinter())


 ################

--- a/theano/tensor/inplace.py
+++ b/theano/tensor/inplace.py
@@ -26,12 +26,7 @@ def _scal_inplace(symbol):
    rval.__epydoc_asRoutine = symbol
    rval.__module__ = 'theano.tensor.inplace'

-    def chk(pstate, r):
-        if not r.owner:
-            return False
-        return r.owner.op == rval
-
-    pprint.assign(chk, printing.FunctionPrinter(symbolname.replace('_inplace', '=')))
+    pprint.assign(rval, printing.FunctionPrinter(symbolname.replace('_inplace', '=')))
    return rval



--- a/theano/tensor/opt.py
+++ b/theano/tensor/opt.py
@@ -5,6 +5,7 @@ Tensor optimizations addressing the ops in basic.py.
 # TODO: intelligent merge for mul/add
 # TODO: 0*x -> 0

+from collections import defaultdict
 import logging
 import itertools
 import operator
@@ -146,14 +147,34 @@ def broadcast_like(value, template, fgraph, dtype=None):
    return rval


-def inplace_elemwise_optimizer_op(OP):
+class InplaceElemwiseOptimizer(Optimizer):
    """
    We parametrise it to make it work for Elemwise and GpuElemwise op.
    """
-    @gof.inplace_optimizer
-    def inplace_elemwise_optimizer(fgraph):
+    def __init__(self, OP):
+        self.op = OP
+
+    def add_requirements(self, fgraph):
+        fgraph.attach_feature(theano.gof.destroyhandler.DestroyHandler())
+
+    @staticmethod
+    def print_profile(stream, prof, level=0):
+        blanc = ('    ' * level)
+        print(blanc, "InplaceElemwiseOptimizer ", prof['opt'].op, file=stream)
+        for k in ['node_before',
+                  'nb_call_replace',
+                  'nb_call_validate',
+                  'nb_inconsistent']:
+            print(blanc, k, prof[k], file=stream)
+        ndim = prof['ndim']
+        if ndim:
+            print(blanc, "ndim", "nb", file=stream)
+            for n in sorted(ndim.keys()):
+                print(blanc, n, ndim[n], file=stream)
+
+    def apply(self, fgraph):
        """
-        Usage: inplace_elemwise_optimizer.optimize(fgraph)
+        Usage: InplaceElemwiseOptimizer(op).optimize(fgraph)

        Attempts to replace all Broadcast ops by versions of them
        that operate inplace. It operates greedily: for each Broadcast
@@ -163,8 +184,10 @@ def inplace_elemwise_optimizer_op(OP):

        Examples
        --------
-        x + y + z -> x += y += z
-        (x + y) * (x * y) -> (x += y) *= (x * y) or (x + y) *= (x *= y)
+
+            `x + y + z -> x += y += z`
+
+            `(x + y) * (x * y) -> (x += y) *= (x * y) or (x + y) *= (x *= y)`

        """
        # We should not validate too often as this takes too much time to
@@ -187,6 +210,13 @@ def inplace_elemwise_optimizer_op(OP):
        # the solution is also applicable there.

        # We execute `validate` after this number of change.
+        prof = {'opt': self,
+                'node_before': len(fgraph.apply_nodes),
+                'nb_call_replace': 0,
+                'nb_call_validate': 0,
+                'nb_inconsistent': 0,
+                'ndim': defaultdict(lambda: 0)}
+
        check_each_change = config.tensor.insert_inplace_optimizer_validate_nb
        if check_each_change == -1:
            if len(fgraph.apply_nodes) > 500:
@@ -210,7 +240,7 @@ def inplace_elemwise_optimizer_op(OP):
        for node in list(graph.io_toposort(fgraph.inputs, fgraph.outputs)):
            op = node.op
            # gpuarray GpuElemwise inherit from Elemwise
-            if not type(op) == OP:
+            if not type(op) == self.op:
                continue
            # If big graph and the outputs are scalar, do not make it
            # inplace.
@@ -327,19 +357,23 @@ def inplace_elemwise_optimizer_op(OP):
                                scalar.transfer_type(
                                    *[inplace_pattern.get(i, None)
                                      for i in xrange(len(node.outputs))]))
-                        new_outputs = OP(new_scal, inplace_pattern)(
+                        new_outputs = self.op(new_scal, inplace_pattern)(
                            *node.inputs, **dict(return_list=True))
                        new_node = new_outputs[0].owner

                        for r, new_r in zip(node.outputs, new_outputs):
+                            prof['nb_call_replace'] += 1
                            fgraph.replace(r, new_r,
                                           reason="inplace_elemwise_optimizer")
                        nb_change_no_validate += 1
+                        prof['ndim'][candidate_out_var.ndim] += 1
                        if nb_change_no_validate >= check_each_change:
+                            prof['nb_call_validate'] += 1
                            fgraph.validate()
                            chk = fgraph.checkpoint()
                            nb_change_no_validate = 0
                    except (ValueError, InconsistencyError) as e:
+                        prof['nb_inconsistent'] += 1
                        if check_each_change != 1 and not raised_warning:
                            print(("Some inplace optimization was not "
                                   "performed due to unexpected error:"),
@@ -362,9 +396,14 @@ def inplace_elemwise_optimizer_op(OP):
                           "performed due to unexpected error"),
                          file=sys.stderr)
                fgraph.revert(chk)
-    return inplace_elemwise_optimizer
+        return prof
+
+    def print_summary(self, stream=sys.stdout, level=0, depth=-1):
+        print("%s%s (%s)" % (
+            (' ' * level), self.__class__.__name__, self.op), file=stream)
+        return inplace_elemwise_optimizer

-inplace_elemwise_optimizer = inplace_elemwise_optimizer_op(T.Elemwise)
+inplace_elemwise_optimizer = InplaceElemwiseOptimizer(T.Elemwise)
 compile.optdb.register('inplace_elemwise_opt', inplace_elemwise_optimizer, 75,
                       'inplace_opt',  # for historic reason
                       'inplace_elemwise_optimizer',
@@ -830,8 +869,7 @@ class MakeVectorPrinter:
        else:
            raise TypeError("Can only print make_vector.")

-T.pprint.assign(lambda pstate, r: r.owner and
-                isinstance(r.owner.op, MakeVector), MakeVectorPrinter())
+T.pprint.assign(MakeVector, MakeVectorPrinter())


 class ShapeFeature(object):

--- a/theano/tensor/subtensor.py
+++ b/theano/tensor/subtensor.py
@@ -1002,8 +1002,7 @@ class SubtensorPrinter:
        else:
            raise TypeError("Can only print Subtensor.")

-pprint.assign(lambda pstate, r: r.owner and isinstance(r.owner.op, Subtensor),
-              SubtensorPrinter())
+pprint.assign(Subtensor, SubtensorPrinter())


 def set_subtensor(x, y, inplace=False,