5055:Port ProfileMode GPU to new back-end and profiling.py

81211317 · Saizheng Zhang · acd4a90f · 81211317 · 81211317 · 81211317
--- a/theano/compile/profiling.py
+++ b/theano/compile/profiling.py
@@ -147,6 +147,12 @@ def print_global_stats():
    print('=' * 50, file=destination_file)
+# Printer callbacks added through `register_profiler_printer`, kept in
+# registration order.
+_profiler_printers = []
+def register_profiler_printer(fct):
+    """
+    Append `fct` to the module-level `_profiler_printers` list.
+
+    `fct` is returned unchanged, so this function can be applied as a
+    decorator without hiding the decorated callable.
+
+    """
+    # The list is named `_profiler_printers` (leading underscore); the
+    # un-prefixed name does not exist and raised NameError on registration.
+    _profiler_printers.append(fct)
+    return fct
 class ProfileStats(object):
    """

--- a/theano/gpuarray/basic_ops.py
+++ b/theano/gpuarray/basic_ops.py
@@ -1538,6 +1538,58 @@ class GpuSplit(HideC, Split):
        return main_code % locals()
+@theano.compile.profiling.register_profiler_printer
+def profile_printer(fct_name, compile_time, fct_call_time, fct_call,
+                    apply_time, apply_cimpl, message, outputs_size,
+                    other_time):
+    """
+    Print GPU-oriented profiling hints.
+
+    Nothing is printed unless at least one key of `apply_time` refers to an
+    apply node whose op class name starts with "gpu" (case-insensitive).
+    Otherwise three reports are printed: the time split between cpu ops,
+    gpu ops and host<->gpu transfer ops, the function inputs whose type is
+    float64, and the apply nodes that produce a float64 output without
+    having any float64 input.
+
+    Parameters
+    ----------
+    fct_call
+        Iterable of compiled functions; each one is read for its
+        `input_storage`, `name` and `maker.fgraph` attributes.
+    apply_time
+        Mapping whose keys are 2-tuples with an apply node as second item
+        and whose values are the time spent in that node.
+
+    The other parameters belong to the printer signature but are not used
+    by this printer.
+
+    """
+    if any(x[1].op.__class__.__name__.lower().startswith("gpu")
+           for x in apply_time):
+        local_time = sum(apply_time.values())
+
+        def percent(t):
+            # Guard against ZeroDivisionError when every recorded time is 0.
+            return t / local_time * 100 if local_time else 0.0
+
+        print()
+        print('Some info useful for gpu:')
+        cpu = 0
+        gpu = 0
+        trans = 0
+        for (_, node), t in iteritems(apply_time):
+            # Test the op instance itself. The class *name* is a str, which
+            # is never an instance of the transfer op classes, so the
+            # transfer bucket was always left at 0.
+            if isinstance(node.op, (HostFromGpu, GpuFromHost)):
+                trans += t
+            elif node.op.__class__.__name__.lower().startswith("gpu"):
+                gpu += t
+            else:
+                cpu += t
+        print()
+        print("    Spent %.3fs(%.3f%%) in cpu Op, %.3fs(%.3f%%) in gpu Op and %.3fs(%.3f%%) transfert Op" % (
+            cpu, percent(cpu), gpu, percent(gpu),
+            trans, percent(trans)))
+        print()
+        print("    Theano function input that are float64")
+        print("    <fct name> <input name> <input type> <str input>")
+        for fct in fct_call:
+            for i in fct.input_storage:
+                if hasattr(i.type, 'dtype') and i.type.dtype == 'float64':
+                    print('        ', fct.name, i.name, i.type, i)
+        print()
+        print("    List of apply that don't have float64 as input but have float64 in outputs")
+        print("    (Useful to know if we forgot some cast when using floatX=float32 or gpu code)")
+        print('    <Apply> <Apply position> <fct name> <inputs type> <outputs type>')
+        for fct in fct_call:
+            for idx, node in enumerate(fct.maker.fgraph.toposort()):
+                # Report nodes that introduce float64: some output is
+                # float64 while no input is.
+                if (any(hasattr(i, 'dtype') and i.dtype == 'float64'
+                        for i in node.outputs) and
+                    not any(hasattr(i, 'dtype') and i.dtype == 'float64'
+                            for i in node.inputs)):
+                    print('        ', str(node), idx, fct.name, end=' ')
+                    print(str([getattr(i, 'dtype', None)
+                               for i in node.inputs]), end=' ')
+                    print(str([getattr(i, 'dtype', None)
+                               for i in node.outputs]))
 class GpuEye(GpuKernelBase, Op):
    """
    Eye for GPU.

--- a/theano/sandbox/cuda/basic_ops.py
+++ b/theano/sandbox/cuda/basic_ops.py
@@ -4092,59 +4092,6 @@ def tensor4(name=None, dtype=None):
 ftensor4 = CudaNdarrayType(dtype='float32', broadcastable=(False,) * 4)
-# TODO: move that to the new back-end and new profiling.py print_tips
-# @theano.compile.profilemode.register_profiler_printer
-def profile_printer(fct_name, compile_time, fct_call_time, fct_call,
-                    apply_time, apply_cimpl, message, outputs_size,
-                    other_time):
-    if any([x[1].op.__class__.__name__.lower().startswith("gpu")
-            for x in apply_time.keys()]):
-        local_time = sum(apply_time.values())
-        print()
-        print('Some info useful for gpu:')
-        cpu = 0
-        gpu = 0
-        trans = 0
-        for (_, node), t in iteritems(apply_time):
-            if isinstance(node.op.__class__.__name__,
-                          (HostFromGpu, GpuFromHost)):
-                trans += t
-            elif node.op.__class__.__name__.lower().startswith("gpu"):
-                gpu += t
-            else:
-                cpu += t
-        print()
-        print("    Spent %.3fs(%.3f%%) in cpu Op, %.3fs(%.3f%%) in gpu Op and %.3fs(%.3f%%) transfert Op" % (
-            cpu, cpu / local_time * 100, gpu, gpu / local_time * 100,
-            trans, trans / local_time * 100))
-        print()
-        print("    Theano function input that are float64")
-        print("    <fct name> <input name> <input type> <str input>")
-        for fct in fct_call:
-            for i in fct.input_storage:
-                if hasattr(i.type, 'dtype') and i.type.dtype == 'float64':
-                    print('        ', fct.name, i.name, i.type, i)
-        print()
-        print("    List of apply that don't have float64 as input but have float64 in outputs")
-        print("    (Useful to know if we forgot some cast when using floatX=float32 or gpu code)")
-        print('    <Apply> <Apply position> <fct name> <inputs type> <outputs type>')
-        for fct in fct_call:
-            for idx, node in enumerate(fct.maker.fgraph.toposort()):
-                if (any(hasattr(i, 'dtype') and i.dtype == 'float64'
-                        for i in node.outputs) and
-                    not any(hasattr(i, 'dtype') and i.dtype == 'float64'
-                            for i in node.inputs)):
-                    print('        ', str(node), idx, fct.name, end=' ')
-                    print(str([getattr(i, 'dtype', None)
-                               for i in node.inputs]), end=' ')
-                    print(str([getattr(i, 'dtype', None)
-                               for i in node.outputs]))
 class GpuEye(GpuOp):
    def __init__(self, dtype=None):