提交 c396d611 authored 作者: sentient07's avatar sentient07

Added print_profile to GraphtoGPU

上级 de536bd5
......@@ -3,14 +3,17 @@ import copy
import numpy
import logging
import pdb
import time
from six.moves import xrange
import theano
from theano.compat import OrderedDict
from theano import tensor, scalar, gof, config
from theano.compile import optdb
from theano.compile.ops import shape_i
from theano.gof import (local_optimizer, EquilibriumDB, TopoOptimizer,
SequenceDB, Optimizer, DB, toolbox)
SequenceDB, Optimizer, DB, toolbox, graph)
from gof.opt import ChangeTracker
from theano.gof.optdb import LocalGroupDB
from theano.ifelse import IfElse
......@@ -251,7 +254,7 @@ gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(),
0, 'fast_run', 'fast_compile', 'merge')
class GraphToGPU(Optimizer):
class GraphToGPU(Optimizer, NavigatorOptimizer):
"""
Transfer the graph as a whole to GPU instead of transfering node by node.
"""
......@@ -264,8 +267,18 @@ class GraphToGPU(Optimizer):
fgraph.attach_feature(toolbox.ReplaceValidate())
def apply(self, fgraph):
change_tracker = ChangeTracker()
mapping = {}
global_process_count = {}
start_nb_nodes = len(fgraph.apply_nodes)
max_nb_nodes = len(fgraph.apply_nodes)
loop_timing = []
loop_process_count = []
local_opt_timing = []
io_toposort_timing = []
nb_nodes = []
node_created = {}
process_count = {}
# Building a new graph
# Iterating through inputs of graph
for i in fgraph.inputs:
......@@ -277,8 +290,23 @@ class GraphToGPU(Optimizer):
if isinstance(i, theano.Constant):
mapping[i] = i
for lopt in (self.local_optimizers_all +
self.local_optimizers_map.get(type(node.op), []) +
self.local_optimizers_map.get(node.op, [])):
process_count.setdefault(copt, 0)
global_process_count.setdefault(opt, 0)
time_opts.setdefault(opt, 0)
node_created.setdefault(opt, 0)
topo_t0 = time.time()
q = deque(graph.io_toposort(fgraph.inputs, start_from))
io_toposort_timing.append(time.time() - topo_t0)
nb_nodes.append(len(q))
max_nb_nodes = max(max_nb_nodes, len(q))
for node in fgraph.toposort():
t0 = time.time()
if isinstance(node.op, HostFromGpu):
mapping[node.outputs[0]] = node.inputs[0]
continue
......@@ -310,7 +338,14 @@ class GraphToGPU(Optimizer):
for lopt in (self.local_optimizers_all +
self.local_optimizers_map.get(type(node.op), []) +
self.local_optimizers_map.get(node.op, [])):
nb = change_tracker.nb_imported
process_count[lopt] += 1
global_process_count[lopt] += 1
t_opt = time.time()
lopt_change = self.process_node(fgraph, node, lopt)
time_opts[lopt] += time.time() - t_opt
node_created[lopt] += change_tracker.nb_imported - nb
if move_to_GPU:
try:
new_ops = lopt.transform(
......@@ -322,6 +357,7 @@ class GraphToGPU(Optimizer):
out_clients)
if new_ops:
break
local_opt_timing.append(float(time.time() - t0))
if not new_ops:
newnode = node.clone_with_new_inputs([mapping.get(i)
for i in node.inputs])
......@@ -344,6 +380,9 @@ class GraphToGPU(Optimizer):
for new_o, old_o in zip(outputs, node.outputs):
mapping[old_o] = new_o
loop_process_count.append(process_count)
loop_timing.append(float(time.time() - t0))
new_nodes = []
for o in fgraph.outputs:
new_o = mapping[o]
......@@ -354,6 +393,143 @@ class GraphToGPU(Optimizer):
new_nodes.append(new_o)
fgraph.replace_all_validate(zip(fgraph.outputs, new_nodes))
@staticmethod
def print_profile(stream, prof, level=0):
(opt, loop_timing, loop_process_count,
(start_nb_nodes, end_nb_nodes, max_nb_nodes),
local_opt_timing, nb_nodes, time_opts, io_toposort_timing,
node_created) = prof
blanc = (' ' * level)
print(blanc, "GraphToGPUOptimizer", end=' ', file=stream)
print(blanc, getattr(opt, "name",
getattr(opt, "__name__", "")), file=stream)
print(blanc, " time %.3fs for %d passes" % (
sum(loop_timing), len(loop_timing)), file=stream)
print(blanc, " nb nodes (start, end, max) %d %d %d" % (
start_nb_nodes, end_nb_nodes, max_nb_nodes), file=stream)
print(blanc, " time io_toposort %.3fs" % sum(
io_toposort_timing), file=stream)
s = sum([time_opts[o] for o in opt.local_optimizers_all])
print(blanc, " time in local optimizers %.3fs" % s, file=stream)
for i in range(len(loop_timing)):
lopt = ""
if loop_process_count[i]:
d = list(reversed(sorted(iteritems(loop_process_count[i]),
key=lambda a: a[1])))
lopt = " ".join([str((str(k), v)) for k, v
in d[:5]])
if len(d) > 5:
lopt += " ..."
print(blanc, (' %2d - %.3fs %d (%.3fs in global opts, '
'%.3fs io_toposort) - %d nodes - %s' % (
i, loop_timing[i],
sum(loop_process_count[i].values()),
local_opt_timing[i],
io_toposort_timing[i], nb_nodes[i],
lopt)), file=stream)
count_opt = []
not_used = []
not_used_time = 0
process_count = {}
for o in (opt.local_optimizers_all +
list(opt.local_optimizers_map.get(type(node.op), [])) +
list(opt.local_optimizers_map.get(node.op, [])) +):
process_count.setdefault(o, 0)
for count in loop_process_count:
for o, v in iteritems(count):
process_count[o] += v
for o, count in iteritems(process_count):
if count > 0:
count_opt.append((time_opts[o], count,
node_created[o], o))
else:
not_used.append((time_opts[o], o))
not_used_time += time_opts[o]
if count_opt:
print(blanc,
' times - times applied - nb node created - name:',
file=stream)
count_opt.sort()
for (t, count, n_created, o) in count_opt[::-1]:
print(blanc, ' %.3fs - %d - %d - %s' % (
t, count, n_created, o), file=stream)
print(blanc, ' %.3fs - in %d optimization that where not used (display only those with a runtime > 0)' % (
not_used_time, len(not_used)), file=stream)
not_used.sort(key=lambda nu: (nu[0], str(nu[1])))
for (t, o) in not_used[::-1]:
if t > 0:
# Skip opt that have 0 times, they probably wasn't even tried.
print(blanc + " ", ' %.3fs - %s' % (t, o), file=stream)
print(file=stream)
@staticmethod
def merge_profile(prof1, prof2):
# (opt, loop_timing, loop_process_count, max_nb_nodes,
# global_opt_timing, nb_nodes, time_opts, io_toposort_timing) = prof1
local_optimizers = OrderedSet(prof1[0].local_optimizers_all).union(
prof2[0].local_optimizers_all)
def merge_dict(d1, d2):
"""
merge 2 dicts by adding the values.
"""
d = d1.copy()
for k, v in iteritems(d2):
if k in d:
d[k] += v
else:
d[k] = v
return d
local_optimizers_map = merge_dict(prof1[0].local_optimizers_map,
prof2[0].local_optimizers_map)
new_opt = GraphToGPU(local_optimizers,local_optimizers_map)
def merge_list(l1, l2):
l = copy.copy(l1)
for idx, nb in enumerate(l2):
if idx < len(l):
l[idx] += nb
else:
l.append(nb)
return l
loop_timing = merge_list(prof1[1], prof2[1])
loop_process_count = list(prof1[2])
for i in range(min(len(loop_process_count), len(prof2[2]))):
process_count = loop_process_count[i]
for process, count in iteritems(prof2[2][i]):
if process in process_count:
process_count[process] += count
else:
process_count[process] = count
loop_process_count.extend(prof2[2][len(loop_process_count):])
max_nb_nodes = max(prof1[3], prof2[3])
nb_nodes = merge_list(prof1[4], prof2[4])
time_opts = merge_dict(prof1[5], prof2[5])
io_toposort_timing = merge_list(prof1[6], prof2[6])
assert len(loop_timing) == max(len(prof1[1]), len(prof2[1]))
node_created = merge_dict(prof1[7], prof2[7])
return (new_opt,
loop_timing,
loop_process_count,
max_nb_nodes,
nb_nodes,
time_opts,
io_toposort_timing,
node_created)
@local_optimizer([GpuFromHost, GpuToGpu, HostFromGpu])
def local_cut_gpu_transfers(node):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论