Merge pull request #5455 from Saizheng/master

5055:Port ProfileMode GPU to new back-end and profiling.py

Merge pull request #5455 from Saizheng/master
67a0ceb6 · Frédéric Bastien · GitHub · 83cc7ef3 · 422ebb8a · 67a0ceb6
--- a/theano/compile/profiling.py
+++ b/theano/compile/profiling.py
@@ -18,6 +18,7 @@ import sys
 import time
 from collections import defaultdict
 from six import iteritems
+import warnings
 import numpy as np
@@ -102,7 +103,6 @@ def _atexit_print_fn():
                        assert len(merge) == len(cum.optimizer_profile[1])
                        cum.optimizer_profile = (cum.optimizer_profile[0], merge)
                    except Exception as e:
-                        print("Got an exception while merging profile")
                        print(e)
                        cum.optimizer_profile = None
                else:
@@ -147,6 +147,14 @@ def print_global_stats():
    print('=' * 50, file=destination_file)
+_profiler_printers = []
+def register_profiler_printer(fct):
+    _profiler_printers.append(fct)
+    return fct
 class ProfileStats(object):
    """
@@ -172,7 +180,7 @@ class ProfileStats(object):
        self.apply_time = {}
        self.apply_callcount = {}
        # self.apply_cimpl = None
-        # self.messge = None
+        # self.message = None
    #
    # Note on implementation:
    # Class variables are used here so that each one can be
@@ -271,10 +279,10 @@ class ProfileStats(object):
                hasattr(theano, 'gpuarray') and
                theano.gpuarray.pygpu_activated and
                not config.profiling.ignore_first_call):
-            logger.warn(
+            warnings.warn(
                "Theano flag profiling.ignore_first_call is False."
                " This cause bad profiling result in the new gpu"
-                " back-end, we as sometimes we compile at the first call.")
+                " back-end, as sometimes we compile at the first call.")
        self.apply_callcount = {}
        self.output_size = {}
@@ -1318,6 +1326,7 @@ class ProfileStats(object):
            print("-----------------", file=file)
            self.optimizer_profile[0].print_profile(file,
                                                    self.optimizer_profile[1])
+        self.print_extra(file)
        self.print_tips(file)
    def print_tips(self, file):
@@ -1464,6 +1473,12 @@ class ProfileStats(object):
        if not printed_tip:
            print("  Sorry, no tip for today.", file=file)
+    def print_extra(self, file):
+        params = [self.message, self.compile_time, self.fct_call_time,
+                  self.apply_time, self.apply_cimpl, self.output_size]
+        for f in _profiler_printers:
+            f(*params, file=file)
 class ScanProfileStats(ProfileStats):
    callcount = 0.0

--- a/theano/gpuarray/basic_ops.py
+++ b/theano/gpuarray/basic_ops.py
@@ -4,6 +4,7 @@ import copy
 import re
 import numpy as np
+import theano
 from theano import Op, Apply, Type, Variable
 from theano import tensor, config
 from theano.gradient import grad_undefined
@@ -17,6 +18,7 @@ from collections import deque
 from six import string_types, iterbytes
 from six.moves import xrange
+from six import iteritems
 try:
    import pygpu
@@ -1538,6 +1540,62 @@ class GpuSplit(HideC, Split):
        return main_code % locals()
+@theano.compile.profiling.register_profiler_printer
+def profile_printer(message, compile_time, fct_call_time,
+                    apply_time, apply_cimpl, outputs_size, file):
+    if any([x.op.__class__.__name__.lower().startswith("gpu")
+            for x in apply_time.keys()]):
+        local_time = sum(apply_time.values())
+        print('', file=file)
+        print('Some info useful for gpu:', file=file)
+        fgraphs = set()
+        for node in apply_time.keys():
+            fgraphs.add(node.fgraph)
+        cpu = 0
+        gpu = 0
+        trans = 0
+        for node, t in iteritems(apply_time):
+            if isinstance(node.op, (HostFromGpu, GpuFromHost)):
+                trans += t
+            elif node.op.__class__.__name__.lower().startswith("gpu"):
+                gpu += t
+            else:
+                cpu += t
+        print('', file=file)
+        print("    Spent %.3fs(%.2f%%) in cpu Op, %.3fs(%.2f%%) in gpu Op and %.3fs(%.2f%%) transfert Op" % (
+            cpu, cpu / local_time * 100, gpu, gpu / local_time * 100,
+            trans, trans / local_time * 100), file=file)
+        print('', file=file)
+        print("    Theano function input that are float64", file=file)
+        print("    <fct name> <input name> <input type> <str input>", file=file)
+        for fg in fgraphs:
+            for i in fg.inputs:
+                if hasattr(i.type, 'dtype') and i.type.dtype == 'float64':
+                    print('        ', fg.name, i.name, i.type, i, file=file)
+        print('', file=file)
+        print("    List of apply that don't have float64 as input but have float64 in outputs", file=file)
+        print("    (Useful to know if we forgot some cast when using floatX=float32 or gpu code)", file=file)
+        print('    <Apply> <Apply position> <fct name> <inputs type> <outputs type>', file=file)
+        for fg in fgraphs:
+            for idx, node in enumerate(fg.toposort()):
+                if (any(hasattr(i, 'dtype') and i.dtype == 'float64'
+                        for i in node.outputs) and
+                    not any(hasattr(i, 'dtype') and i.dtype == 'float64'
+                            for i in node.inputs)):
+                    print('        ', str(node), idx, fg.name, end=' ',
+                          file=file)
+                    print(str([getattr(i, 'dtype', None)
+                               for i in node.inputs]), end=' ', file=file)
+                    print(str([getattr(i, 'dtype', None)
+                               for i in node.outputs]), file=file)
+        print('', file=file)
 class GpuEye(GpuKernelBase, Op):
    """
    Eye for GPU.

--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -4,7 +4,6 @@ import logging
 import sys
 import warnings
 import numpy
-from six import iteritems
 from six.moves import StringIO, xrange
 import theano
@@ -4092,59 +4091,6 @@ def tensor4(name=None, dtype=None):
 ftensor4 = CudaNdarrayType(dtype='float32', broadcastable=(False,) * 4)
-# TODO: move that to the new back-end and new profiling.py print_tips
-# @theano.compile.profilemode.register_profiler_printer
-def profile_printer(fct_name, compile_time, fct_call_time, fct_call,
-                    apply_time, apply_cimpl, message, outputs_size,
-                    other_time):
-    if any([x[1].op.__class__.__name__.lower().startswith("gpu")
-            for x in apply_time.keys()]):
-        local_time = sum(apply_time.values())
-        print()
-        print('Some info useful for gpu:')
-        cpu = 0
-        gpu = 0
-        trans = 0
-        for (_, node), t in iteritems(apply_time):
-            if isinstance(node.op.__class__.__name__,
-                          (HostFromGpu, GpuFromHost)):
-                trans += t
-            elif node.op.__class__.__name__.lower().startswith("gpu"):
-                gpu += t
-            else:
-                cpu += t
-        print()
-        print("    Spent %.3fs(%.3f%%) in cpu Op, %.3fs(%.3f%%) in gpu Op and %.3fs(%.3f%%) transfert Op" % (
-            cpu, cpu / local_time * 100, gpu, gpu / local_time * 100,
-            trans, trans / local_time * 100))
-        print()
-        print("    Theano function input that are float64")
-        print("    <fct name> <input name> <input type> <str input>")
-        for fct in fct_call:
-            for i in fct.input_storage:
-                if hasattr(i.type, 'dtype') and i.type.dtype == 'float64':
-                    print('        ', fct.name, i.name, i.type, i)
-        print()
-        print("    List of apply that don't have float64 as input but have float64 in outputs")
-        print("    (Useful to know if we forgot some cast when using floatX=float32 or gpu code)")
-        print('    <Apply> <Apply position> <fct name> <inputs type> <outputs type>')
-        for fct in fct_call:
-            for idx, node in enumerate(fct.maker.fgraph.toposort()):
-                if (any(hasattr(i, 'dtype') and i.dtype == 'float64'
-                        for i in node.outputs) and
-                    not any(hasattr(i, 'dtype') and i.dtype == 'float64'
-                            for i in node.inputs)):
-                    print('        ', str(node), idx, fct.name, end=' ')
-                    print(str([getattr(i, 'dtype', None)
-                               for i in node.inputs]), end=' ')
-                    print(str([getattr(i, 'dtype', None)
-                               for i in node.outputs]))
 class GpuEye(GpuOp):
    def __init__(self, dtype=None):

--- a/theano/scan_module/scan_op.py
+++ b/theano/scan_module/scan_op.py
@@ -2867,42 +2867,41 @@ class Scan(PureOp):
 gof.ops_with_inner_function[Scan] = 'fn'
-# TODO: move that to the new back-end and new profiling.py print_tips
+@theano.compile.profiling.register_profiler_printer
-# @theano.compile.profilemode.register_profiler_printer
+def profile_printer(message, compile_time, fct_call_time,
-def profile_printer(fct_name, compile_time, fct_call_time, fct_call,
+                    apply_time, apply_cimpl, outputs_size, file):
-                    apply_time, apply_cimpl, message, outputs_size,
-                    other_time):
    # Scan overhead profile
-    if any([isinstance(node.op, Scan) and v > 0 for (_, node), v in
+    if any([isinstance(node.op, Scan) and v > 0 for node, v in
            apply_time.items()]):
-        print()
+        print('', file=file)
-        print('Scan overhead:')
+        print('Scan overhead:', file=file)
        print('<Scan op time(s)> <sub scan fct time(s)> <sub scan op '
              'time(s)> <sub scan fct time(% scan op time)> <sub scan '
-              'op time(% scan op time)> <node>')
+              'op time(% scan op time)> <node>', file=file)
        total_super_scan_time = 0
        total_scan_fct_time = 0
        total_scan_op_time = 0
-        for (_, node), v in iteritems(apply_time):
+        for node, v in iteritems(apply_time):
-            if isinstance(node.op, Scan):
+            if isinstance(node.op, Scan) and node.op.fn.profile:
                if v > 0:
-                    scan_fct_time = node.op.mode_instance.fn_time
+                    scan_fct_time = node.op.fn.profile.call_time
-                    scan_op_time = node.op.mode_instance.local_time
+                    scan_op_time = sum(node.op.fn.profile.apply_time.values())
                    total_super_scan_time += v
                    total_scan_fct_time += scan_fct_time
                    total_scan_op_time += scan_op_time
-                    print('    %5.1fs  %5.1fs  %5.1fs  %5.1f%%  %5.1f%%' % (
+                    print('      %5.1fs  %5.1fs  %5.1fs  %5.1f%%  %5.1f%%' % (
                        v,
                        scan_fct_time,
                        scan_op_time,
                        scan_fct_time / v * 100,
-                        scan_op_time / v * 100), node)
+                        scan_op_time / v * 100), node, file=file)
                else:
                    print((' The node took 0s, so we can not '
-                           'compute the overhead'), node)
+                           'compute the overhead'), node, file=file)
-        print('    total %5.1fs  %5.1fs  %5.1fs  %5.1f%%  %5.1f%%' % (
+        print('total %5.1fs  %5.1fs  %5.1fs  %5.1f%%  %5.1f%%' % (
            total_super_scan_time,
            total_scan_fct_time,
            total_scan_op_time,
            total_scan_fct_time / total_super_scan_time * 100,
-            total_scan_op_time / total_super_scan_time * 100))
+            total_scan_op_time / total_super_scan_time * 100), file=file)