提交 67a0ceb6 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #5455 from Saizheng/master

5055:Port ProfileMode GPU to new back-end and profiling.py
......@@ -18,6 +18,7 @@ import sys
import time
from collections import defaultdict
from six import iteritems
import warnings
import numpy as np
......@@ -102,7 +103,6 @@ def _atexit_print_fn():
assert len(merge) == len(cum.optimizer_profile[1])
cum.optimizer_profile = (cum.optimizer_profile[0], merge)
except Exception as e:
print("Got an exception while merging profile")
print(e)
cum.optimizer_profile = None
else:
......@@ -147,6 +147,14 @@ def print_global_stats():
print('=' * 50, file=destination_file)
_profiler_printers = []
def register_profiler_printer(fct):
_profiler_printers.append(fct)
return fct
class ProfileStats(object):
"""
......@@ -172,7 +180,7 @@ class ProfileStats(object):
self.apply_time = {}
self.apply_callcount = {}
# self.apply_cimpl = None
# self.messge = None
# self.message = None
#
# Note on implementation:
# Class variables are used here so that each one can be
......@@ -271,10 +279,10 @@ class ProfileStats(object):
hasattr(theano, 'gpuarray') and
theano.gpuarray.pygpu_activated and
not config.profiling.ignore_first_call):
logger.warn(
warnings.warn(
"Theano flag profiling.ignore_first_call is False."
" This cause bad profiling result in the new gpu"
" back-end, we as sometimes we compile at the first call.")
" back-end, as sometimes we compile at the first call.")
self.apply_callcount = {}
self.output_size = {}
......@@ -1318,6 +1326,7 @@ class ProfileStats(object):
print("-----------------", file=file)
self.optimizer_profile[0].print_profile(file,
self.optimizer_profile[1])
self.print_extra(file)
self.print_tips(file)
def print_tips(self, file):
......@@ -1464,6 +1473,12 @@ class ProfileStats(object):
if not printed_tip:
print(" Sorry, no tip for today.", file=file)
def print_extra(self, file):
params = [self.message, self.compile_time, self.fct_call_time,
self.apply_time, self.apply_cimpl, self.output_size]
for f in _profiler_printers:
f(*params, file=file)
class ScanProfileStats(ProfileStats):
callcount = 0.0
......
......@@ -4,6 +4,7 @@ import copy
import re
import numpy as np
import theano
from theano import Op, Apply, Type, Variable
from theano import tensor, config
from theano.gradient import grad_undefined
......@@ -17,6 +18,7 @@ from collections import deque
from six import string_types, iterbytes
from six.moves import xrange
from six import iteritems
try:
import pygpu
......@@ -1538,6 +1540,62 @@ class GpuSplit(HideC, Split):
return main_code % locals()
@theano.compile.profiling.register_profiler_printer
def profile_printer(message, compile_time, fct_call_time,
apply_time, apply_cimpl, outputs_size, file):
if any([x.op.__class__.__name__.lower().startswith("gpu")
for x in apply_time.keys()]):
local_time = sum(apply_time.values())
print('', file=file)
print('Some info useful for gpu:', file=file)
fgraphs = set()
for node in apply_time.keys():
fgraphs.add(node.fgraph)
cpu = 0
gpu = 0
trans = 0
for node, t in iteritems(apply_time):
if isinstance(node.op, (HostFromGpu, GpuFromHost)):
trans += t
elif node.op.__class__.__name__.lower().startswith("gpu"):
gpu += t
else:
cpu += t
print('', file=file)
print(" Spent %.3fs(%.2f%%) in cpu Op, %.3fs(%.2f%%) in gpu Op and %.3fs(%.2f%%) transfert Op" % (
cpu, cpu / local_time * 100, gpu, gpu / local_time * 100,
trans, trans / local_time * 100), file=file)
print('', file=file)
print(" Theano function input that are float64", file=file)
print(" <fct name> <input name> <input type> <str input>", file=file)
for fg in fgraphs:
for i in fg.inputs:
if hasattr(i.type, 'dtype') and i.type.dtype == 'float64':
print(' ', fg.name, i.name, i.type, i, file=file)
print('', file=file)
print(" List of apply that don't have float64 as input but have float64 in outputs", file=file)
print(" (Useful to know if we forgot some cast when using floatX=float32 or gpu code)", file=file)
print(' <Apply> <Apply position> <fct name> <inputs type> <outputs type>', file=file)
for fg in fgraphs:
for idx, node in enumerate(fg.toposort()):
if (any(hasattr(i, 'dtype') and i.dtype == 'float64'
for i in node.outputs) and
not any(hasattr(i, 'dtype') and i.dtype == 'float64'
for i in node.inputs)):
print(' ', str(node), idx, fg.name, end=' ',
file=file)
print(str([getattr(i, 'dtype', None)
for i in node.inputs]), end=' ', file=file)
print(str([getattr(i, 'dtype', None)
for i in node.outputs]), file=file)
print('', file=file)
class GpuEye(GpuKernelBase, Op):
"""
Eye for GPU.
......
......@@ -4,7 +4,6 @@ import logging
import sys
import warnings
import numpy
from six import iteritems
from six.moves import StringIO, xrange
import theano
......@@ -4092,59 +4091,6 @@ def tensor4(name=None, dtype=None):
ftensor4 = CudaNdarrayType(dtype='float32', broadcastable=(False,) * 4)
# TODO: move that to the new back-end and new profiling.py print_tips
# @theano.compile.profilemode.register_profiler_printer
def profile_printer(fct_name, compile_time, fct_call_time, fct_call,
apply_time, apply_cimpl, message, outputs_size,
other_time):
if any([x[1].op.__class__.__name__.lower().startswith("gpu")
for x in apply_time.keys()]):
local_time = sum(apply_time.values())
print()
print('Some info useful for gpu:')
cpu = 0
gpu = 0
trans = 0
for (_, node), t in iteritems(apply_time):
if isinstance(node.op.__class__.__name__,
(HostFromGpu, GpuFromHost)):
trans += t
elif node.op.__class__.__name__.lower().startswith("gpu"):
gpu += t
else:
cpu += t
print()
print(" Spent %.3fs(%.3f%%) in cpu Op, %.3fs(%.3f%%) in gpu Op and %.3fs(%.3f%%) transfert Op" % (
cpu, cpu / local_time * 100, gpu, gpu / local_time * 100,
trans, trans / local_time * 100))
print()
print(" Theano function input that are float64")
print(" <fct name> <input name> <input type> <str input>")
for fct in fct_call:
for i in fct.input_storage:
if hasattr(i.type, 'dtype') and i.type.dtype == 'float64':
print(' ', fct.name, i.name, i.type, i)
print()
print(" List of apply that don't have float64 as input but have float64 in outputs")
print(" (Useful to know if we forgot some cast when using floatX=float32 or gpu code)")
print(' <Apply> <Apply position> <fct name> <inputs type> <outputs type>')
for fct in fct_call:
for idx, node in enumerate(fct.maker.fgraph.toposort()):
if (any(hasattr(i, 'dtype') and i.dtype == 'float64'
for i in node.outputs) and
not any(hasattr(i, 'dtype') and i.dtype == 'float64'
for i in node.inputs)):
print(' ', str(node), idx, fct.name, end=' ')
print(str([getattr(i, 'dtype', None)
for i in node.inputs]), end=' ')
print(str([getattr(i, 'dtype', None)
for i in node.outputs]))
class GpuEye(GpuOp):
def __init__(self, dtype=None):
......
......@@ -2867,27 +2867,26 @@ class Scan(PureOp):
gof.ops_with_inner_function[Scan] = 'fn'
# TODO: move that to the new back-end and new profiling.py print_tips
# @theano.compile.profilemode.register_profiler_printer
def profile_printer(fct_name, compile_time, fct_call_time, fct_call,
apply_time, apply_cimpl, message, outputs_size,
other_time):
@theano.compile.profiling.register_profiler_printer
def profile_printer(message, compile_time, fct_call_time,
apply_time, apply_cimpl, outputs_size, file):
# Scan overhead profile
if any([isinstance(node.op, Scan) and v > 0 for (_, node), v in
if any([isinstance(node.op, Scan) and v > 0 for node, v in
apply_time.items()]):
print()
print('Scan overhead:')
print('', file=file)
print('Scan overhead:', file=file)
print('<Scan op time(s)> <sub scan fct time(s)> <sub scan op '
'time(s)> <sub scan fct time(% scan op time)> <sub scan '
'op time(% scan op time)> <node>')
'op time(% scan op time)> <node>', file=file)
total_super_scan_time = 0
total_scan_fct_time = 0
total_scan_op_time = 0
for (_, node), v in iteritems(apply_time):
if isinstance(node.op, Scan):
for node, v in iteritems(apply_time):
if isinstance(node.op, Scan) and node.op.fn.profile:
if v > 0:
scan_fct_time = node.op.mode_instance.fn_time
scan_op_time = node.op.mode_instance.local_time
scan_fct_time = node.op.fn.profile.call_time
scan_op_time = sum(node.op.fn.profile.apply_time.values())
total_super_scan_time += v
total_scan_fct_time += scan_fct_time
total_scan_op_time += scan_op_time
......@@ -2896,13 +2895,13 @@ def profile_printer(fct_name, compile_time, fct_call_time, fct_call,
scan_fct_time,
scan_op_time,
scan_fct_time / v * 100,
scan_op_time / v * 100), node)
scan_op_time / v * 100), node, file=file)
else:
print((' The node took 0s, so we can not '
'compute the overhead'), node)
print(' total %5.1fs %5.1fs %5.1fs %5.1f%% %5.1f%%' % (
'compute the overhead'), node, file=file)
print('total %5.1fs %5.1fs %5.1fs %5.1f%% %5.1f%%' % (
total_super_scan_time,
total_scan_fct_time,
total_scan_op_time,
total_scan_fct_time / total_super_scan_time * 100,
total_scan_op_time / total_super_scan_time * 100))
total_scan_op_time / total_super_scan_time * 100), file=file)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论