提交 67a0ceb6 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #5455 from Saizheng/master

5055:Port ProfileMode GPU to new back-end and profiling.py
...@@ -18,6 +18,7 @@ import sys ...@@ -18,6 +18,7 @@ import sys
import time import time
from collections import defaultdict from collections import defaultdict
from six import iteritems from six import iteritems
import warnings
import numpy as np import numpy as np
...@@ -102,7 +103,6 @@ def _atexit_print_fn(): ...@@ -102,7 +103,6 @@ def _atexit_print_fn():
assert len(merge) == len(cum.optimizer_profile[1]) assert len(merge) == len(cum.optimizer_profile[1])
cum.optimizer_profile = (cum.optimizer_profile[0], merge) cum.optimizer_profile = (cum.optimizer_profile[0], merge)
except Exception as e: except Exception as e:
print("Got an exception while merging profile")
print(e) print(e)
cum.optimizer_profile = None cum.optimizer_profile = None
else: else:
...@@ -147,6 +147,14 @@ def print_global_stats(): ...@@ -147,6 +147,14 @@ def print_global_stats():
print('=' * 50, file=destination_file) print('=' * 50, file=destination_file)
_profiler_printers = []
def register_profiler_printer(fct):
_profiler_printers.append(fct)
return fct
class ProfileStats(object): class ProfileStats(object):
""" """
...@@ -172,7 +180,7 @@ class ProfileStats(object): ...@@ -172,7 +180,7 @@ class ProfileStats(object):
self.apply_time = {} self.apply_time = {}
self.apply_callcount = {} self.apply_callcount = {}
# self.apply_cimpl = None # self.apply_cimpl = None
# self.messge = None # self.message = None
# #
# Note on implementation: # Note on implementation:
# Class variables are used here so that each one can be # Class variables are used here so that each one can be
...@@ -271,10 +279,10 @@ class ProfileStats(object): ...@@ -271,10 +279,10 @@ class ProfileStats(object):
hasattr(theano, 'gpuarray') and hasattr(theano, 'gpuarray') and
theano.gpuarray.pygpu_activated and theano.gpuarray.pygpu_activated and
not config.profiling.ignore_first_call): not config.profiling.ignore_first_call):
logger.warn( warnings.warn(
"Theano flag profiling.ignore_first_call is False." "Theano flag profiling.ignore_first_call is False."
" This cause bad profiling result in the new gpu" " This cause bad profiling result in the new gpu"
" back-end, we as sometimes we compile at the first call.") " back-end, as sometimes we compile at the first call.")
self.apply_callcount = {} self.apply_callcount = {}
self.output_size = {} self.output_size = {}
...@@ -1318,6 +1326,7 @@ class ProfileStats(object): ...@@ -1318,6 +1326,7 @@ class ProfileStats(object):
print("-----------------", file=file) print("-----------------", file=file)
self.optimizer_profile[0].print_profile(file, self.optimizer_profile[0].print_profile(file,
self.optimizer_profile[1]) self.optimizer_profile[1])
self.print_extra(file)
self.print_tips(file) self.print_tips(file)
def print_tips(self, file): def print_tips(self, file):
...@@ -1464,6 +1473,12 @@ class ProfileStats(object): ...@@ -1464,6 +1473,12 @@ class ProfileStats(object):
if not printed_tip: if not printed_tip:
print(" Sorry, no tip for today.", file=file) print(" Sorry, no tip for today.", file=file)
def print_extra(self, file):
params = [self.message, self.compile_time, self.fct_call_time,
self.apply_time, self.apply_cimpl, self.output_size]
for f in _profiler_printers:
f(*params, file=file)
class ScanProfileStats(ProfileStats): class ScanProfileStats(ProfileStats):
callcount = 0.0 callcount = 0.0
......
...@@ -4,6 +4,7 @@ import copy ...@@ -4,6 +4,7 @@ import copy
import re import re
import numpy as np import numpy as np
import theano
from theano import Op, Apply, Type, Variable from theano import Op, Apply, Type, Variable
from theano import tensor, config from theano import tensor, config
from theano.gradient import grad_undefined from theano.gradient import grad_undefined
...@@ -17,6 +18,7 @@ from collections import deque ...@@ -17,6 +18,7 @@ from collections import deque
from six import string_types, iterbytes from six import string_types, iterbytes
from six.moves import xrange from six.moves import xrange
from six import iteritems
try: try:
import pygpu import pygpu
...@@ -1538,6 +1540,62 @@ class GpuSplit(HideC, Split): ...@@ -1538,6 +1540,62 @@ class GpuSplit(HideC, Split):
return main_code % locals() return main_code % locals()
@theano.compile.profiling.register_profiler_printer
def profile_printer(message, compile_time, fct_call_time,
apply_time, apply_cimpl, outputs_size, file):
if any([x.op.__class__.__name__.lower().startswith("gpu")
for x in apply_time.keys()]):
local_time = sum(apply_time.values())
print('', file=file)
print('Some info useful for gpu:', file=file)
fgraphs = set()
for node in apply_time.keys():
fgraphs.add(node.fgraph)
cpu = 0
gpu = 0
trans = 0
for node, t in iteritems(apply_time):
if isinstance(node.op, (HostFromGpu, GpuFromHost)):
trans += t
elif node.op.__class__.__name__.lower().startswith("gpu"):
gpu += t
else:
cpu += t
print('', file=file)
print(" Spent %.3fs(%.2f%%) in cpu Op, %.3fs(%.2f%%) in gpu Op and %.3fs(%.2f%%) transfert Op" % (
cpu, cpu / local_time * 100, gpu, gpu / local_time * 100,
trans, trans / local_time * 100), file=file)
print('', file=file)
print(" Theano function input that are float64", file=file)
print(" <fct name> <input name> <input type> <str input>", file=file)
for fg in fgraphs:
for i in fg.inputs:
if hasattr(i.type, 'dtype') and i.type.dtype == 'float64':
print(' ', fg.name, i.name, i.type, i, file=file)
print('', file=file)
print(" List of apply that don't have float64 as input but have float64 in outputs", file=file)
print(" (Useful to know if we forgot some cast when using floatX=float32 or gpu code)", file=file)
print(' <Apply> <Apply position> <fct name> <inputs type> <outputs type>', file=file)
for fg in fgraphs:
for idx, node in enumerate(fg.toposort()):
if (any(hasattr(i, 'dtype') and i.dtype == 'float64'
for i in node.outputs) and
not any(hasattr(i, 'dtype') and i.dtype == 'float64'
for i in node.inputs)):
print(' ', str(node), idx, fg.name, end=' ',
file=file)
print(str([getattr(i, 'dtype', None)
for i in node.inputs]), end=' ', file=file)
print(str([getattr(i, 'dtype', None)
for i in node.outputs]), file=file)
print('', file=file)
class GpuEye(GpuKernelBase, Op): class GpuEye(GpuKernelBase, Op):
""" """
Eye for GPU. Eye for GPU.
......
...@@ -4,7 +4,6 @@ import logging ...@@ -4,7 +4,6 @@ import logging
import sys import sys
import warnings import warnings
import numpy import numpy
from six import iteritems
from six.moves import StringIO, xrange from six.moves import StringIO, xrange
import theano import theano
...@@ -4092,59 +4091,6 @@ def tensor4(name=None, dtype=None): ...@@ -4092,59 +4091,6 @@ def tensor4(name=None, dtype=None):
ftensor4 = CudaNdarrayType(dtype='float32', broadcastable=(False,) * 4) ftensor4 = CudaNdarrayType(dtype='float32', broadcastable=(False,) * 4)
# TODO: move that to the new back-end and new profiling.py print_tips
# @theano.compile.profilemode.register_profiler_printer
def profile_printer(fct_name, compile_time, fct_call_time, fct_call,
apply_time, apply_cimpl, message, outputs_size,
other_time):
if any([x[1].op.__class__.__name__.lower().startswith("gpu")
for x in apply_time.keys()]):
local_time = sum(apply_time.values())
print()
print('Some info useful for gpu:')
cpu = 0
gpu = 0
trans = 0
for (_, node), t in iteritems(apply_time):
if isinstance(node.op.__class__.__name__,
(HostFromGpu, GpuFromHost)):
trans += t
elif node.op.__class__.__name__.lower().startswith("gpu"):
gpu += t
else:
cpu += t
print()
print(" Spent %.3fs(%.3f%%) in cpu Op, %.3fs(%.3f%%) in gpu Op and %.3fs(%.3f%%) transfert Op" % (
cpu, cpu / local_time * 100, gpu, gpu / local_time * 100,
trans, trans / local_time * 100))
print()
print(" Theano function input that are float64")
print(" <fct name> <input name> <input type> <str input>")
for fct in fct_call:
for i in fct.input_storage:
if hasattr(i.type, 'dtype') and i.type.dtype == 'float64':
print(' ', fct.name, i.name, i.type, i)
print()
print(" List of apply that don't have float64 as input but have float64 in outputs")
print(" (Useful to know if we forgot some cast when using floatX=float32 or gpu code)")
print(' <Apply> <Apply position> <fct name> <inputs type> <outputs type>')
for fct in fct_call:
for idx, node in enumerate(fct.maker.fgraph.toposort()):
if (any(hasattr(i, 'dtype') and i.dtype == 'float64'
for i in node.outputs) and
not any(hasattr(i, 'dtype') and i.dtype == 'float64'
for i in node.inputs)):
print(' ', str(node), idx, fct.name, end=' ')
print(str([getattr(i, 'dtype', None)
for i in node.inputs]), end=' ')
print(str([getattr(i, 'dtype', None)
for i in node.outputs]))
class GpuEye(GpuOp): class GpuEye(GpuOp):
def __init__(self, dtype=None): def __init__(self, dtype=None):
......
...@@ -2867,42 +2867,41 @@ class Scan(PureOp): ...@@ -2867,42 +2867,41 @@ class Scan(PureOp):
gof.ops_with_inner_function[Scan] = 'fn' gof.ops_with_inner_function[Scan] = 'fn'
# TODO: move that to the new back-end and new profiling.py print_tips @theano.compile.profiling.register_profiler_printer
# @theano.compile.profilemode.register_profiler_printer def profile_printer(message, compile_time, fct_call_time,
def profile_printer(fct_name, compile_time, fct_call_time, fct_call, apply_time, apply_cimpl, outputs_size, file):
apply_time, apply_cimpl, message, outputs_size,
other_time):
# Scan overhead profile # Scan overhead profile
if any([isinstance(node.op, Scan) and v > 0 for (_, node), v in if any([isinstance(node.op, Scan) and v > 0 for node, v in
apply_time.items()]): apply_time.items()]):
print() print('', file=file)
print('Scan overhead:') print('Scan overhead:', file=file)
print('<Scan op time(s)> <sub scan fct time(s)> <sub scan op ' print('<Scan op time(s)> <sub scan fct time(s)> <sub scan op '
'time(s)> <sub scan fct time(% scan op time)> <sub scan ' 'time(s)> <sub scan fct time(% scan op time)> <sub scan '
'op time(% scan op time)> <node>') 'op time(% scan op time)> <node>', file=file)
total_super_scan_time = 0 total_super_scan_time = 0
total_scan_fct_time = 0 total_scan_fct_time = 0
total_scan_op_time = 0 total_scan_op_time = 0
for (_, node), v in iteritems(apply_time): for node, v in iteritems(apply_time):
if isinstance(node.op, Scan): if isinstance(node.op, Scan) and node.op.fn.profile:
if v > 0: if v > 0:
scan_fct_time = node.op.mode_instance.fn_time scan_fct_time = node.op.fn.profile.call_time
scan_op_time = node.op.mode_instance.local_time scan_op_time = sum(node.op.fn.profile.apply_time.values())
total_super_scan_time += v total_super_scan_time += v
total_scan_fct_time += scan_fct_time total_scan_fct_time += scan_fct_time
total_scan_op_time += scan_op_time total_scan_op_time += scan_op_time
print(' %5.1fs %5.1fs %5.1fs %5.1f%% %5.1f%%' % ( print(' %5.1fs %5.1fs %5.1fs %5.1f%% %5.1f%%' % (
v, v,
scan_fct_time, scan_fct_time,
scan_op_time, scan_op_time,
scan_fct_time / v * 100, scan_fct_time / v * 100,
scan_op_time / v * 100), node) scan_op_time / v * 100), node, file=file)
else: else:
print((' The node took 0s, so we can not ' print((' The node took 0s, so we can not '
'compute the overhead'), node) 'compute the overhead'), node, file=file)
print(' total %5.1fs %5.1fs %5.1fs %5.1f%% %5.1f%%' % ( print('total %5.1fs %5.1fs %5.1fs %5.1f%% %5.1f%%' % (
total_super_scan_time, total_super_scan_time,
total_scan_fct_time, total_scan_fct_time,
total_scan_op_time, total_scan_op_time,
total_scan_fct_time / total_super_scan_time * 100, total_scan_fct_time / total_super_scan_time * 100,
total_scan_op_time / total_super_scan_time * 100)) total_scan_op_time / total_super_scan_time * 100), file=file)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论